From f6faac71d502be1c29c81b2f45657662c3b84470 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2009 17:24:40 -0700 Subject: rcu: Mark Hierarchical RCU no longer experimental Removes the warnings about Hierarchical RCU being experimental, given that it has gone through almost six months of being the default RCU in mainline for the x86 with very little trouble. This makes hierarchical-RCU bootup look less scary. Signed-off-by: Paul E. McKenney Cc: akpm@linux-foundation.org Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: dipankar@in.ibm.com Cc: dhowells@redhat.com Cc: lethal@linux-sh.org Cc: kernel@wantstofly.org Cc: cl@linux-foundation.org Cc: schamp@sgi.com Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0dccfbb..7717b95 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1533,7 +1533,7 @@ void __init __rcu_init(void) int j; struct rcu_node *rnp; - printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); + printk(KERN_INFO "Hierarchical RCU implementation.\n"); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ @@ -1546,7 +1546,6 @@ void __init __rcu_init(void) rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); /* Register notifier for non-boot CPUs */ register_cpu_notifier(&rcu_nb); - printk(KERN_WARNING "Experimental hierarchical RCU init done.\n"); } module_param(blimit, int, 0); -- cgit v1.1 From a32c7765e2796395aec49f699bd25c407155e9c5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 26 Jun 2009 16:55:51 +0800 Subject: tracing: Fix stack tracer sysctl handling This made my machine completely frozen: # echo 1 > /proc/sys/kernel/stack_tracer_enabled # echo 2 > /proc/sys/kernel/stack_tracer_enabled The cause is register_ftrace_function() was called twice. Also fix ftrace_enabled sysctl, though seems nothing bad happened as I tested it. 
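For readers tracing the sysctl fix, here is a minimal user-space sketch of the !!-normalization it adds (stand-in names, not kernel code): writing 1 and then 2 both mean "enabled", but without the !! the cached value comparison sees 1 != 2 and runs the registration path a second time.

	#include <stdio.h>

	static int registrations;	/* stands in for register_ftrace_function() calls */
	static int last_enabled;	/* mirrors last_stack_tracer_enabled */

	static void sysctl_write(int val)
	{
		/* Normalize to 0/1 before comparing, exactly as the patch does. */
		if (last_enabled == !!val)
			return;
		last_enabled = !!val;

		if (val)
			registrations++;	/* register_ftrace_function(&trace_ops) */
		else
			registrations--;	/* unregister_ftrace_function(&trace_ops) */
	}

	int main(void)
	{
		sysctl_write(1);
		sysctl_write(2);	/* no longer registers a second time */
		printf("registrations: %d\n", registrations);
		return 0;
	}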
Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A448D17.9010305@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace_stack.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3716bf..bce9e01 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3160,10 +3160,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ret = proc_dointvec(table, write, file, buffer, lenp, ppos); - if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) + if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; - last_ftrace_enabled = ftrace_enabled; + last_ftrace_enabled = !!ftrace_enabled; if (ftrace_enabled) { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 2d7aebd..e644af9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -326,10 +326,10 @@ stack_trace_sysctl(struct ctl_table *table, int write, ret = proc_dointvec(table, write, file, buffer, lenp, ppos); if (ret || !write || - (last_stack_tracer_enabled == stack_tracer_enabled)) + (last_stack_tracer_enabled == !!stack_tracer_enabled)) goto out; - last_stack_tracer_enabled = stack_tracer_enabled; + last_stack_tracer_enabled = !!stack_tracer_enabled; if (stack_tracer_enabled) register_ftrace_function(&trace_ops); -- cgit v1.1 From 82d5308127c3e3404ffbf41e503853c68660b18b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 26 Jun 2009 17:07:02 +0800 Subject: trace_export: Repair missed fields Some fields for struct ftrace_graph_ret are missed when they are exported to user. Signed-off-by: Lai Jiangshan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A448FB6.5000302@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 5e32e37..6db005e 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, ftrace_graph_ret_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ret.func, func) + TRACE_FIELD(unsigned long long, ret.calltime, calltime) + TRACE_FIELD(unsigned long long, ret.rettime, rettime) + TRACE_FIELD(unsigned long, ret.overrun, overrun) TRACE_FIELD(int, ret.depth, depth) ), TP_RAW_FMT("<-- %lx (%d)") -- cgit v1.1 From 238a24f626628cb16a3015f332d649f08246ca89 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 29 Jun 2009 15:55:10 +0800 Subject: tracing/fastboot: Document the need of initcall_debug To use boot tracer, one should pass initcall_debug as well as ftrace=initcall to the command line. Signed-off-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A48735E.9050002@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1551f47..019f380 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -226,13 +226,13 @@ config BOOT_TRACER the timings of the initcalls and traces key events and the identity of tasks that can cause boot delays, such as context-switches. 
- Its aim is to be parsed by the /scripts/bootgraph.pl tool to + Its aim is to be parsed by the scripts/bootgraph.pl tool to produce pretty graphics about boot inefficiencies, giving a visual representation of the delays during initcalls - but the raw /debug/tracing/trace text output is readable too. - You must pass in ftrace=initcall to the kernel command line - to enable this on bootup. + You must pass in initcall_debug and ftrace=initcall to the kernel + command line to enable this on bootup. config TRACE_BRANCH_PROFILING bool -- cgit v1.1 From 4a2bb6fcc80e6330ca2f2393e98605052cc7780b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 30 Jun 2009 17:08:09 -0400 Subject: kprobes: No need to unlock kprobe_insn_mutex Remove needless kprobe_insn_mutex unlocking during safety check in garbage collection, because if someone releases a dirty slot during safety check (which ensures other cpus doesn't execute all dirty slots), the safety check must be fail. So, we need to hold the mutex while checking safety. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli LKML-Reference: <20090630210809.17851.28781.stgit@localhost.localdomain> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c0fa54b..16b5739 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -237,13 +237,9 @@ static int __kprobes collect_garbage_slots(void) { struct kprobe_insn_page *kip; struct hlist_node *pos, *next; - int safety; /* Ensure no-one is preepmted on the garbages */ - mutex_unlock(&kprobe_insn_mutex); - safety = check_safety(); - mutex_lock(&kprobe_insn_mutex); - if (safety != 0) + if (check_safety()) return -EAGAIN; hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { -- cgit v1.1 From e1af3aec3e2e7d21d4b3054323779d478c19d907 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 1 Jul 2009 16:50:25 +0800 Subject: tracing: Fix trace_print_seq() We will lose something if trace_seq->buffer[0] is 0, because the copy length is calculated by strlen() in seq_puts(), so using seq_write() instead of seq_puts(). There have a example: after reboot: # echo kmemtrace > current_tracer # echo 0 > options/kmem_minimalistic # cat trace # tracer: kmemtrace # # Nothing is exported, because the first byte of trace_seq->buffer[ ] is KMEMTRACE_USER_ALLOC. ( the value of KMEMTRACE_USER_ALLOC is zero, seeing kmemtrace_print_alloc_user() in kernel/trace/kmemtrace.c) Signed-off-by: Xiao Guangrong Acked-by: Frederic Weisbecker Acked-by: Pekka Enberg Acked-by: Eduard - Gabriel Munteanu Cc: Steven Rostedt LKML-Reference: <4A4B2351.5010300@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 7938f3a..e0c2545 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -27,8 +27,7 @@ void trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? 
PAGE_SIZE - 1 : s->len; - s->buffer[len] = 0; - seq_puts(m, s->buffer); + seq_write(m, s->buffer, len); trace_seq_init(s); } -- cgit v1.1 From 5bfd7560979062ad75c9805c1719cec990b5db29 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sun, 5 Jul 2009 12:08:19 -0700 Subject: Fix virt_to_phys() warnings These warnings were observed on MIPS32 using 2.6.31-rc1 and gcc-4.2.0: mm/page_alloc.c: In function 'alloc_pages_exact': mm/page_alloc.c:1986: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast drivers/usb/mon/mon_bin.c: In function 'mon_alloc_buff': drivers/usb/mon/mon_bin.c:1264: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast [akpm@linux-foundation.org: fix kernel/perf_counter.c too] Signed-off-by: Kevin Cernekee Cc: Andi Kleen Cc: Ralf Baechle Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d55a50d..a641eb7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2020,7 +2020,7 @@ fail: static void perf_mmap_free_page(unsigned long addr) { - struct page *page = virt_to_page(addr); + struct page *page = virt_to_page((void *)addr); page->mapping = NULL; __free_page(page); -- cgit v1.1 From 793285fcafce4719a05e0c99fa74b188157fe7fe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 5 Jul 2009 12:08:26 -0700 Subject: cred_guard_mutex: do not return -EINTR to user-space do_execve() and ptrace_attach() return -EINTR if mutex_lock_interruptible(->cred_guard_mutex) fails. This is not right, change the code to return ERESTARTNOINTR. Perhaps we should also change proc_pid_attr_write(). Signed-off-by: Oleg Nesterov Cc: David Howells Acked-by: Roland McGrath Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 61c78b2..082c320 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -181,8 +181,8 @@ int ptrace_attach(struct task_struct *task) * interference; SUID, SGID and LSM creds get determined differently * under ptrace. 
*/ - retval = mutex_lock_interruptible(&task->cred_guard_mutex); - if (retval < 0) + retval = -ERESTARTNOINTR; + if (mutex_lock_interruptible(&task->cred_guard_mutex)) goto out; task_lock(task); -- cgit v1.1 From b43f3cbd21ffbd719fd4fa6642bfe6af255ded34 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Jul 2009 01:54:37 +0400 Subject: headers: mnt_namespace.h redux Fix various silly problems wrt mnt_namespace.h: - exit_mnt_ns() isn't used, remove it - done that, sched.h and nsproxy.h inclusions aren't needed - mount.h inclusion was need for vfsmount_lock, but no longer - remove mnt_namespace.h inclusion from files which don't use anything from mnt_namespace.h Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 - kernel/fork.c | 1 - kernel/kmod.c | 1 - 3 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 628d41f..869dc22 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 467746b..bd29592 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/kmod.c b/kernel/kmod.c index 7e95bed..385c31a 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include -- cgit v1.1 From ad361c9884e809340f6daca80d56a9e9c871690a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 6 Jul 2009 13:05:40 -0700 Subject: Remove multiple KERN_ prefixes from printk formats Commit 5fd29d6ccbc98884569d6f3105aeca70858b3e0f ("printk: clean up handling of log-levels and newlines") changed printk semantics. printk lines with multiple KERN_ prefixes are no longer emitted as before the patch. is now included in the output on each additional use. Remove all uses of multiple KERN_s in formats. Signed-off-by: Joe Perches Signed-off-by: Linus Torvalds --- kernel/module.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 38928fc..0a04983 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2451,9 +2451,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, return ret; } if (ret > 0) { - printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " - "it should follow 0/-E convention\n" - KERN_WARNING "%s: loading module anyway...\n", + printk(KERN_WARNING +"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" +"%s: loading module anyway...\n", __func__, mod->name, ret, __func__); dump_stack(); -- cgit v1.1 From 264ef8a904943ed7d0b04fa958894d7a5c2b2c61 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 7 Jul 2009 10:33:01 +0100 Subject: kmemleak: Remove alloc_bootmem annotations introduced in the past kmemleak_alloc() calls were added in some places where alloc_bootmem was called. Since now kmemleak tracks bootmem allocations, these explicit calls should be run. 
Signed-off-by: Catalin Marinas Cc: Ingo Molnar Acked-by: Pekka Enberg --- kernel/pid.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 5fa1db4..31310b5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,7 +36,6 @@ #include #include #include -#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -513,12 +512,6 @@ void __init pidhash_init(void) pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); if (!pid_hash) panic("Could not alloc pidhash!\n"); - /* - * pid_hash contains references to allocated struct pid objects and it - * must be scanned by kmemleak to avoid false positives. - */ - kmemleak_alloc(pid_hash, pidhash_size * sizeof(*(pid_hash)), 0, - GFP_KERNEL); for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); } -- cgit v1.1 From 71a851b4d2a815adcfac09c1adda7ef6811fde66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 09:06:56 +0200 Subject: perf_counter: Stop open coding unclone_ctx Instead of open coding the unclone context thingy, put it in a common function. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d55a50d..8bf997d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -146,6 +146,14 @@ static void put_ctx(struct perf_counter_context *ctx) } } +static void unclone_ctx(struct perf_counter_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + /* * Get the perf_counter_context for a task and lock it. * This has to cope with with the fact that until it is locked, @@ -1463,10 +1471,8 @@ static void perf_counter_enable_on_exec(struct task_struct *task) /* * Unclone this context if we enabled any counter. */ - if (enabled && ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; - } + if (enabled) + unclone_ctx(ctx); spin_unlock(&ctx->lock); @@ -1526,7 +1532,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx, static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { - struct perf_counter_context *parent_ctx; struct perf_counter_context *ctx; struct perf_cpu_context *cpuctx; struct task_struct *task; @@ -1586,11 +1591,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) retry: ctx = perf_lock_task_context(task, &flags); if (ctx) { - parent_ctx = ctx->parent_ctx; - if (parent_ctx) { - put_ctx(parent_ctx); - ctx->parent_ctx = NULL; /* no longer a clone */ - } + unclone_ctx(ctx); spin_unlock_irqrestore(&ctx->lock, flags); } @@ -4255,15 +4256,12 @@ void perf_counter_exit_task(struct task_struct *child) */ spin_lock(&child_ctx->lock); child->perf_counter_ctxp = NULL; - if (child_ctx->parent_ctx) { - /* - * This context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the counters from it. - */ - put_ctx(child_ctx->parent_ctx); - child_ctx->parent_ctx = NULL; - } + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the counters from it. 
+ */ + unclone_ctx(child_ctx); spin_unlock(&child_ctx->lock); local_irq_restore(flags); -- cgit v1.1 From a1ba4d8ba9f06a397e97cbd67a93ee306860b40a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Apr 2009 18:40:15 +0200 Subject: sched_rt: Fix overload bug on rt group scheduling Fixes an easily triggerable BUG() when setting process affinities. Make sure to count the number of migratable tasks in the same place: the root rt_rq. Otherwise the number doesn't make sense and we'll hit the BUG in set_cpus_allowed_rt(). Also, make sure we only count tasks, not groups (this is probably already taken care of by the fact that rt_se->nr_cpus_allowed will be 0 for groups, but be more explicit) Tested-by: Thomas Gleixner CC: stable@kernel.org Signed-off-by: Peter Zijlstra Acked-by: Gregory Haskins LKML-Reference: <1247067476.9777.57.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + kernel/sched_rt.c | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d..a17f3d9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -493,6 +493,7 @@ struct rt_rq { #endif #ifdef CONFIG_SMP unsigned long rt_nr_migratory; + unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; #endif diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9bf0d2a..3918e01 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -10,6 +10,8 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) #ifdef CONFIG_RT_GROUP_SCHED +#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) + static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { return rt_rq->rq; @@ -22,6 +24,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) #else /* CONFIG_RT_GROUP_SCHED */ +#define rt_entity_is_task(rt_se) (1) + static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { return container_of(rt_rq, struct rq, rt); @@ -73,7 +77,7 @@ static inline void rt_clear_overload(struct rq *rq) static void update_rt_migration(struct rt_rq *rt_rq) { - if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { + if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { if (!rt_rq->overloaded) { rt_set_overload(rq_of_rt_rq(rt_rq)); rt_rq->overloaded = 1; @@ -86,6 +90,12 @@ static void update_rt_migration(struct rt_rq *rt_rq) static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + if (!rt_entity_is_task(rt_se)) + return; + + rt_rq = &rq_of_rt_rq(rt_rq)->rt; + + rt_rq->rt_nr_total++; if (rt_se->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory++; @@ -94,6 +104,12 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + if (!rt_entity_is_task(rt_se)) + return; + + rt_rq = &rq_of_rt_rq(rt_rq)->rt; + + rt_rq->rt_nr_total--; if (rt_se->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory--; -- cgit v1.1 From 7793527b90d9418211f4fe8464cc1dcb1631ea1b Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Thu, 9 Jul 2009 13:57:20 +0200 Subject: sched: Reset sched stats on fork() The sched_stat fields are currently not reset upon fork. Ingo's recent commit 6c594c21fcb02c662f11c97be4d7d2b73060a205 did reset nr_migrations, but it didn't reset any of the others. This patch resets all sched_stat fields on fork. 
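The reset matters because a new task starts life as a copy of its parent, so any statistics field not explicitly cleared in __sched_fork() silently carries the parent's history. A small stand-alone sketch of that copy semantics (field names are illustrative only):

	#include <stdio.h>

	struct sched_stats {
		unsigned long wait_max;
		unsigned long nr_wakeups;
	};

	int main(void)
	{
		struct sched_stats parent = { .wait_max = 123456, .nr_wakeups = 42 };
		struct sched_stats child = parent;	/* what duplicating the task struct does */

		printf("inherited: wait_max=%lu nr_wakeups=%lu\n",
		       child.wait_max, child.nr_wakeups);

		child = (struct sched_stats){ 0 };	/* the explicit reset the patch adds */
		printf("reset:     wait_max=%lu nr_wakeups=%lu\n",
		       child.wait_max, child.nr_wakeups);
		return 0;
	}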
Signed-off-by: Lucas De Marchi Signed-off-by: Peter Zijlstra LKML-Reference: <193b0f820907090457s7a3662f4gcdecdc22fcae857b@mail.gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a17f3d9..c4549bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2572,15 +2572,37 @@ static void __sched_fork(struct task_struct *p) p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.sum_sleep_runtime = 0; - p->se.sleep_start = 0; - p->se.block_start = 0; - p->se.sleep_max = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - p->se.wait_max = 0; + p->se.wait_start = 0; + p->se.wait_max = 0; + p->se.wait_count = 0; + p->se.wait_sum = 0; + + p->se.sleep_start = 0; + p->se.sleep_max = 0; + p->se.sum_sleep_runtime = 0; + + p->se.block_start = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.slice_max = 0; + + p->se.nr_migrations_cold = 0; + p->se.nr_failed_migrations_affine = 0; + p->se.nr_failed_migrations_running = 0; + p->se.nr_failed_migrations_hot = 0; + p->se.nr_forced_migrations = 0; + p->se.nr_forced2_migrations = 0; + + p->se.nr_wakeups = 0; + p->se.nr_wakeups_sync = 0; + p->se.nr_wakeups_migrate = 0; + p->se.nr_wakeups_local = 0; + p->se.nr_wakeups_remote = 0; + p->se.nr_wakeups_affine = 0; + p->se.nr_wakeups_affine_attempts = 0; + p->se.nr_wakeups_passive = 0; + p->se.nr_wakeups_idle = 0; + #endif INIT_LIST_HEAD(&p->rt.run_list); -- cgit v1.1 From c20b08e3986c2dbfa6df1e880bf4f7159994d199 Mon Sep 17 00:00:00 2001 From: Fabio Checconi Date: Mon, 15 Jun 2009 20:56:38 +0200 Subject: sched: Fix rt_rq->pushable_tasks initialization in init_rt_rq() init_rt_rq() initializes only rq->rt.pushable_tasks, and not the pushable_tasks field of the passed rt_rq. The plist is not used uninitialized since the only pushable_tasks plists used are the ones of root rt_rqs; anyway reinitializing the list on every group creation corrupts the root plist, losing its previous contents. Signed-off-by: Fabio Checconi Signed-off-by: Peter Zijlstra LKML-Reference: <20090615185638.GK21741@gandalf.sssup.it> CC: Gregory Haskins Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c4549bd..efecfda 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9093,7 +9093,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; - plist_head_init(&rq->rt.pushable_tasks, &rq->lock); + plist_head_init(&rt_rq->pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; -- cgit v1.1 From 7e0c5086c172ecf8b0c2ad860b02a586967d17d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Jul 2009 13:52:32 +0200 Subject: hrtimer: migration: do not check expiry time on current CPU The timer migration code needs to check whether the expiry time of the timer is before the programmed clock event expiry time when the timer is enqueued on another CPU because we can not reprogram the timer device on the other CPU. The current logic checks the expiry time even if we enqueue on the current CPU when nohz_get_load_balancer() returns current CPU. This might lead to an endless loop in the expiry check code when the expiry time of the timer is before the current programmed next event. 
Check whether nohz_get_load_balancer() returns current CPU and skip the expiry check if this is the case. The bug was triggered from the networking code. The patch fixes the regression http://bugzilla.kernel.org/show_bug.cgi?id=13738 (Soft-Lockup/Race in networking in 2.6.31-rc1+195) Cc: Arun Bharadwaj Tested-by: Andres Freund Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9002958..126b980 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -206,8 +206,19 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { preferred_cpu = get_nohz_load_balancer(); - if (preferred_cpu >= 0) - cpu = preferred_cpu; + if (preferred_cpu >= 0) { + /* + * We must not check the expiry value when + * preferred_cpu is the current cpu. If base + * != new_base we would loop forever when the + * timer expires before the current programmed + * next timer event. + */ + if (preferred_cpu != cpu) + cpu = preferred_cpu; + else + preferred_cpu = -1; + } } #endif -- cgit v1.1 From 6ff7041dbfeb3bd7dfe9aa67275c21199ef760d6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jul 2009 14:57:05 +0200 Subject: hrtimer: Fix migration expiry check The timer migration expiry check should prevent the migration of a timer to another CPU when the timer expires before the next event is scheduled on the other CPU. Migrating the timer might delay it because we can not reprogram the clock event device on the other CPU. But the code implementing that check has two flaws: - for !HIGHRES the check compares the expiry value with the clock events device expiry value which is wrong for CLOCK_REALTIME based timers. - the check is racy. It holds the hrtimer base lock of the target CPU, but the clock event device expiry value can be modified nevertheless, e.g. by an timer interrupt firing. The !HIGHRES case is easy to fix as we can enqueue the timer on the cpu which was selected by the load balancer. It runs the idle balancing code once per jiffy anyway. So the maximum delay for the timer is the same as when we keep the tick on the current cpu going. In the HIGHRES case we can get the next expiry value from the hrtimer cpu_base of the target CPU and serialize the update with the cpu_base lock. This moves the lock section in hrtimer_interrupt() so we can set next_event to KTIME_MAX while we are handling the expired timers and set it to the next expiry value after we handled the timers under the base lock. While the expired timers are processed timer migration is blocked because the expiry time of the timer is always <= KTIME_MAX. Also remove the now useless clockevents_get_next_event() function. 
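A numeric illustration of the rule this check (and the later hrtimer_check_target() rework) enforces, with made-up values; the real code also subtracts the clock base offset so CLOCK_REALTIME timers are compared in CLOCK_MONOTONIC terms:

	#include <stdio.h>

	int main(void)
	{
		long long expires_ns     = 100000000;	/* timer wants to fire at 100 ms */
		long long target_next_ns = 150000000;	/* target CPU's next programmed event */

		if (expires_ns <= target_next_ns)
			printf("keep local: migrating would delay the timer by ~%lld ms\n",
			       (target_next_ns - expires_ns) / 1000000);
		else
			printf("safe to migrate to the idle CPU\n");
		return 0;
	}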
Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 121 ++++++++++++++++++++++++---------------------- kernel/time/clockevents.c | 11 ----- 2 files changed, 64 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 126b980..49da79a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -191,6 +191,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } } + +/* + * Get the preferred target CPU for NOHZ + */ +static int hrtimer_get_target(int this_cpu, int pinned) +{ +#ifdef CONFIG_NO_HZ + if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { + int preferred_cpu = get_nohz_load_balancer(); + + if (preferred_cpu >= 0) + return preferred_cpu; + } +#endif + return this_cpu; +} + +/* + * With HIGHRES=y we do not migrate the timer when it is expiring + * before the next event on the target cpu because we cannot reprogram + * the target cpu hardware and we would cause it to fire late. + * + * Called with cpu_base->lock of target cpu held. + */ +static int +hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires; + + if (!new_base->cpu_base->hres_active) + return 0; + + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); + return expires.tv64 <= new_base->cpu_base->expires_next.tv64; +#else + return 0; +#endif +} + /* * Switch the timer base to the current CPU when possible. */ @@ -200,27 +240,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, { struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; - int cpu, preferred_cpu = -1; - - cpu = smp_processor_id(); -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { - preferred_cpu = get_nohz_load_balancer(); - if (preferred_cpu >= 0) { - /* - * We must not check the expiry value when - * preferred_cpu is the current cpu. If base - * != new_base we would loop forever when the - * timer expires before the current programmed - * next timer event. - */ - if (preferred_cpu != cpu) - cpu = preferred_cpu; - else - preferred_cpu = -1; - } - } -#endif + int this_cpu = smp_processor_id(); + int cpu = hrtimer_get_target(this_cpu, pinned); again: new_cpu_base = &per_cpu(hrtimer_bases, cpu); @@ -228,7 +249,7 @@ again: if (base != new_base) { /* - * We are trying to schedule the timer on the local CPU. + * We are trying to move timer to new_base. * However we can't change timer's base while it is running, * so we keep it on the same CPU. No hassle vs. reprogramming * the event source in the high resolution case. The softirq @@ -244,38 +265,12 @@ again: spin_unlock(&base->cpu_base->lock); spin_lock(&new_base->cpu_base->lock); - /* Optimized away for NOHZ=n SMP=n */ - if (cpu == preferred_cpu) { - /* Calculate clock monotonic expiry time */ -#ifdef CONFIG_HIGH_RES_TIMERS - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), - new_base->offset); -#else - ktime_t expires = hrtimer_get_expires(timer); -#endif - - /* - * Get the next event on target cpu from the - * clock events layer. - * This covers the highres=off nohz=on case as well. - */ - ktime_t next = clockevents_get_next_event(cpu); - - ktime_t delta = ktime_sub(expires, next); - - /* - * We do not migrate the timer when it is expiring - * before the next event on the target cpu because - * we cannot reprogram the target cpu hardware and - * we would cause it to fire late. 
- */ - if (delta.tv64 < 0) { - cpu = smp_processor_id(); - spin_unlock(&new_base->cpu_base->lock); - spin_lock(&base->cpu_base->lock); - timer->base = base; - goto again; - } + if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { + cpu = this_cpu; + spin_unlock(&new_base->cpu_base->lock); + spin_lock(&base->cpu_base->lock); + timer->base = base; + goto again; } timer->base = new_base; } @@ -1287,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev) expires_next.tv64 = KTIME_MAX; + spin_lock(&cpu_base->lock); + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU. + */ + cpu_base->expires_next.tv64 = KTIME_MAX; + base = cpu_base->clock_base; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { ktime_t basenow; struct rb_node *node; - spin_lock(&cpu_base->lock); - basenow = ktime_add(now, base->offset); while ((node = base->first)) { @@ -1327,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev) __run_hrtimer(timer); } - spin_unlock(&cpu_base->lock); base++; } + /* + * Store the new expiry value so the migration code can verify + * against it. + */ cpu_base->expires_next = expires_next; + spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ if (expires_next.tv64 != KTIME_MAX) { diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1ad6dd4..a6dcd67 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -254,15 +254,4 @@ void clockevents_notify(unsigned long reason, void *arg) spin_unlock(&clockevents_lock); } EXPORT_SYMBOL_GPL(clockevents_notify); - -ktime_t clockevents_get_next_event(int cpu) -{ - struct tick_device *td; - struct clock_event_device *dev; - - td = &per_cpu(tick_cpu_device, cpu); - dev = td->evtdev; - - return dev->next_event; -} #endif -- cgit v1.1 From d86ee4809d0329d4aa0d0f2c76c2295a16862799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 14:57:57 +0200 Subject: sched: optimize cond_resched() Optimize cond_resched() by removing one conditional. Currently cond_resched() checks system_state == SYSTEM_RUNNING in order to avoid scheduling before the scheduler is running. We can however, as per suggestion of Matt, use PREEMPT_ACTIVE to accomplish that very same. 
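The changelog's idea, sketched generically below: fold the "not safe to schedule yet" state into the same word that already carries the active-preemption flag, so one masked test replaces the extra system_state comparison. The constants and globals here are illustrative, not the kernel's real preempt_count layout:

	#include <stdio.h>

	#define PREEMPT_ACTIVE	0x10000000

	static unsigned int preempt_count = PREEMPT_ACTIVE;	/* set until scheduling is safe */
	static int need_resched = 1;

	static int should_resched(void)
	{
		return need_resched && !(preempt_count & PREEMPT_ACTIVE);
	}

	int main(void)
	{
		printf("early boot: %d\n", should_resched());	/* 0 */
		preempt_count &= ~PREEMPT_ACTIVE;		/* scheduler is up */
		printf("runtime:    %d\n", should_resched());	/* 1 */
		return 0;
	}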
Suggested-by: Matt Mackall Signed-off-by: Peter Zijlstra Acked-by: Matt Mackall Signed-off-by: Linus Torvalds --- kernel/sched.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d..01f55ad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6541,6 +6541,11 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +static inline int should_resched(void) +{ + return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); +} + static void __cond_resched(void) { #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -6560,8 +6565,7 @@ static void __cond_resched(void) int __sched _cond_resched(void) { - if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && - system_state == SYSTEM_RUNNING) { + if (should_resched()) { __cond_resched(); return 1; } @@ -6579,12 +6583,12 @@ EXPORT_SYMBOL(_cond_resched); */ int cond_resched_lock(spinlock_t *lock) { - int resched = need_resched() && system_state == SYSTEM_RUNNING; + int resched = should_resched(); int ret = 0; if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched && need_resched()) + if (resched) __cond_resched(); else cpu_relax(); @@ -6599,7 +6603,7 @@ int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && system_state == SYSTEM_RUNNING) { + if (should_resched()) { local_bh_enable(); __cond_resched(); local_bh_disable(); -- cgit v1.1 From d07387b490b1c43bfcb9f3900faf96f2dafb2630 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 10 Jul 2009 17:05:16 -0700 Subject: sched: Fix bug in SCHED_IDLE interaction with group scheduling One of the isolation modifications for SCHED_IDLE is the unitization of sleeper credit. However the check for this assumes that the sched_entity we're placing always belongs to a task. This is potentially not true with group scheduling and leaves us rummaging randomly when we try to pull the policy. Signed-off-by: Paul Turner Cc: peterz@infradead.org LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ba7fd6e..7c248dc 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -687,7 +687,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) * all of which have the same weight. */ if (sched_feat(NORMALIZED_SLEEPER) && - task_of(se)->policy != SCHED_IDLE) + (!entity_is_task(se) || + task_of(se)->policy != SCHED_IDLE)) thresh = calc_delta_fair(thresh, se); vruntime -= thresh; -- cgit v1.1 From ce2ae53b750abfaa012ce408e93da131a5b5649b Mon Sep 17 00:00:00 2001 From: Sonny Rao Date: Fri, 10 Jul 2009 18:13:13 -0500 Subject: futexes: Fix infinite loop in get_futex_key() on huge page get_futex_key() can infinitely loop if it is called on a virtual address that is within a huge page but not aligned to the beginning of that page. The call to get_user_pages_fast will return the struct page for a sub-page within the huge page and the check for page->mapping will always fail. The fix is to call compound_head on the page before checking that it's mapped. 
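A user-space sketch of the failure mode with simplified stand-in structures: a tail page of a compound (huge) page has a NULL ->mapping, so without hopping to the head page first the retry loop can never succeed.

	#include <stdio.h>
	#include <stddef.h>

	struct page {
		struct page *head;	/* compound head; points to itself for a normal page */
		void *mapping;
	};

	static struct page *compound_head(struct page *p)
	{
		return p->head;
	}

	int main(void)
	{
		struct page head = { .head = &head, .mapping = (void *)0x1 };
		struct page tail = { .head = &head, .mapping = NULL };

		struct page *page = &tail;	/* what gup returns for an unaligned huge-page address */
		int tries = 0;

		for (;;) {
			page = compound_head(page);	/* the one-line fix */
			if (page->mapping)
				break;
			if (++tries > 3) {		/* without the fix this spins forever */
				puts("stuck retrying");
				return 1;
			}
		}
		puts("mapping found via the head page");
		return 0;
	}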
Signed-off-by: Sonny Rao Acked-by: Thomas Gleixner Cc: stable@kernel.org Cc: anton@samba.org Cc: rajamony@us.ibm.com Cc: speight@us.ibm.com Cc: mstephen@us.ibm.com Cc: grimm@us.ibm.com Cc: mikey@ozlabs.au.ibm.com LKML-Reference: <20090710231313.GA23572@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 794c862..0672ff8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -247,6 +247,7 @@ again: if (err < 0) return err; + page = compound_head(page); lock_page(page); if (!page->mapping) { unlock_page(page); -- cgit v1.1 From 405f55712dfe464b3240d7816cc4fe4174831be2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 11 Jul 2009 22:08:37 +0400 Subject: headers: smp_lock.h redux * Remove smp_lock.h from files which don't need it (including some headers!) * Add smp_lock.h to files which do need it * Make smp_lock.h include conditional in hardirq.h It's needed only for one kernel_locked() usage which is under CONFIG_PREEMPT This will make hardirq.h inclusion cheaper for every PREEMPT=n config (which includes allmodconfig/allyesconfig, BTW) Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- kernel/power/user.c | 1 - kernel/trace/blktrace.c | 1 + kernel/trace/trace.c | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index ed97375..bf0014d 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 39af8af..1090b0a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3aa0a0d..8bc8d8a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include -- cgit v1.1 From d4d7d0b9545721d3cabb19d15163bbc66b797707 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 6 Jul 2009 09:31:33 +0100 Subject: perf_counter: Fix the tracepoint channel to perfcounters Fix a missed rename in EVENT_PROFILE support so that it gets built and allows tracepoint tracing from the 'perf' tool. Fix a typo in the (never before built & enabled) portion in perf_counter.c as well, and update that code to the attr.config changes as well. 
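The "typo" half of this fix is ordinary C syntax: members of a designated initializer are separated by commas, so the original `.regs = get_irq_regs();` line could not have compiled once this path was built. A tiny illustration with a stand-in struct:

	#include <stdio.h>

	struct sample_data {
		int regs;
		int addr;
	};

	int main(void)
	{
		struct sample_data data = {
			.regs = 1,	/* a ';' here instead of ',' is a compile error */
			.addr = 0,
		};
		printf("%d %d\n", data.regs, data.addr);
		return 0;
	}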
Signed-off-by: Chris Wilson Cc: Ben Gamari Cc: Jason Baron Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt LKML-Reference: <1246869094-21237-1-git-send-email-chris@chris-wilson.co.uk> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d55a50d..c6c38fb 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3671,7 +3671,7 @@ static const struct pmu perf_ops_task_clock = { void perf_tpcounter_event(int event_id) { struct perf_sample_data data = { - .regs = get_irq_regs(); + .regs = get_irq_regs(), .addr = 0, }; @@ -3687,16 +3687,12 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(perf_event_id(&counter->attr)); + ftrace_profile_disable(counter->attr.config); } static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { - int event_id = perf_event_id(&counter->attr); - int ret; - - ret = ftrace_profile_enable(event_id); - if (ret) + if (ftrace_profile_enable(counter->attr.config)) return NULL; counter->destroy = tp_perf_counter_destroy; -- cgit v1.1 From 6ab5d668b131d3c5416f6df1d3ca95b82d4fe8a2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Jun 2009 00:55:45 -0400 Subject: tracing/function-profiler: do not free per cpu variable stat The per cpu variable stat is freeded if we fail to allocate a name on start up. This was due to stat at first being allocated in the initial design. But since then, it has become a static per cpu variable but the free on error was not removed. Also added __init annotation to the function that this is in. [ Impact: prevent possible memory corruption on low mem at boot up ] Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bce9e01..4521c77 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -768,7 +768,7 @@ static struct tracer_stat function_stats __initdata = { .stat_show = function_stat_show }; -static void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_debugfs(struct dentry *d_tracer) { struct ftrace_profile_stat *stat; struct dentry *entry; @@ -786,7 +786,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer) * The files created are permanent, if something happens * we still do not free memory. 
*/ - kfree(stat); WARN(1, "Could not allocate stat file for cpu %d\n", cpu); @@ -813,7 +812,7 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer) } #else /* CONFIG_FUNCTION_PROFILER */ -static void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_debugfs(struct dentry *d_tracer) { } #endif /* CONFIG_FUNCTION_PROFILER */ -- cgit v1.1 From 04aef32d39cc4ef80087c0ce8ed113c6d64f1a6b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 15 Jul 2009 12:29:06 +0800 Subject: tracing/function: Fix the return value of ftrace_trace_onoff_callback() ftrace_trace_onoff_callback() will return an error even if we do the right operation, for example: # echo _spin_*:traceon:10 > set_ftrace_filter -bash: echo: write error: Invalid argument # cat set_ftrace_filter #### all functions enabled #### _spin_trylock_bh:traceon:count=10 _spin_unlock_irq:traceon:count=10 _spin_unlock_bh:traceon:count=10 _spin_lock_irq:traceon:count=10 _spin_unlock:traceon:count=10 _spin_trylock:traceon:count=10 _spin_unlock_irqrestore:traceon:count=10 _spin_lock_irqsave:traceon:count=10 _spin_lock_bh:traceon:count=10 _spin_lock:traceon:count=10 We want to set _spin_*:traceon:10 to set_ftrace_filter, it complains with "Invalid argument", but the operation is successful. This is because ftrace_process_regex() returns the number of functions that matched the pattern. If the number is not 0, this value is returned by ftrace_regex_write() whereas we want to return the number of bytes virtually written. Also the file offset pointer is not updated in this case. If the number of matched functions is lower than the number of bytes written by the user, this results to a reprocessing of the string given by the user with a lower size, leading to a malformed ftrace regex and then a -EINVAL returned. So, this patch fixes it by returning 0 if no error occured. The fix also applies on 2.6.30 Signed-off-by: Xiao Guangrong Reviewed-by: Li Zefan Cc: stable@kernel.org Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_functions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 7402144..75ef000 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -363,7 +363,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) out_reg: ret = register_ftrace_function_probe(glob, ops, count); - return ret; + return ret < 0 ? ret : 0; } static struct ftrace_func_command ftrace_traceon_cmd = { -- cgit v1.1 From 54fdc5816631b43ba55fc3206d7add2d85850bc6 Mon Sep 17 00:00:00 2001 From: Fabio Checconi Date: Thu, 16 Jul 2009 12:32:27 +0200 Subject: sched: Account for vruntime wrapping I spotted two sites that didn't take vruntime wrap-around into account. Fix these by creating a comparison helper that does do so. 
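Why the new helper compares a signed difference rather than the raw values: vruntime is an unsigned 64-bit quantity that is allowed to wrap, and after a wrap a direct `<` gives the wrong ordering while `(s64)(a - b) < 0` still orders nearby values correctly. A small demonstration:

	#include <stdio.h>
	#include <stdint.h>

	static int entity_before(uint64_t a, uint64_t b)
	{
		return (int64_t)(a - b) < 0;	/* same idea as the new helper */
	}

	int main(void)
	{
		uint64_t a = UINT64_MAX - 10;	/* vruntime just before the wrap */
		uint64_t b = a + 20;		/* 20 units later, now wrapped to a small value */

		printf("direct  a < b: %d  (wrong once b has wrapped)\n", a < b);
		printf("entity_before: %d  (still correct)\n", entity_before(a, b));
		return 0;
	}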
Signed-off-by: Fabio Checconi Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7c248dc..9ffb2b2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -266,6 +266,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } +static inline int entity_before(struct sched_entity *a, + struct sched_entity *b) +{ + return (s64)(a->vruntime - b->vruntime) < 0; +} + static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { return se->vruntime - cfs_rq->min_vruntime; @@ -1017,7 +1023,7 @@ static void yield_task_fair(struct rq *rq) /* * Already in the rightmost position? */ - if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) + if (unlikely(!rightmost || entity_before(rightmost, se))) return; /* @@ -1713,7 +1719,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) /* 'curr' will be NULL if the child belongs to a different group */ if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && - curr && curr->vruntime < se->vruntime) { + curr && entity_before(curr, se)) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. -- cgit v1.1 From 413ee3b48ab582ffea33e7e140c7a2c5ea657e9a Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 16 Jul 2009 15:15:52 +0200 Subject: perf_counter: Make sure we dont leak kernel memory to userspace There are a few places we are leaking tiny amounts of kernel memory to userspace. This happens when writing out strings because we always align the end to 64 bits. To avoid this we should always use an appropriately sized temporary buffer and ensure it is zeroed. Since d_path assembles the string from the end of the buffer backwards, we need to add 64 bits after the buffer to allow for alignment. We also need to copy arch_vma_name to the temporary buffer, because if we use it directly we may end up copying to userspace a number of bytes after the end of the string constant. Signed-off-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <20090716104817.273972048@samba.org> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c6c38fb..f7a8ab9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2968,8 +2968,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; unsigned int size; - char *comm = comm_event->task->comm; + char comm[TASK_COMM_LEN]; + memset(comm, 0, sizeof(comm)); + strncpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -3088,8 +3090,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) char *buf = NULL; const char *name; + memset(tmp, 0, sizeof(tmp)); + if (file) { - buf = kzalloc(PATH_MAX, GFP_KERNEL); + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. 
+ */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); if (!buf) { name = strncpy(tmp, "//enomem", sizeof(tmp)); goto got_name; @@ -3100,9 +3109,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } } else { - name = arch_vma_name(mmap_event->vma); - if (name) + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); goto got_name; + } if (!vma->vm_mm) { name = strncpy(tmp, "[vdso]", sizeof(tmp)); -- cgit v1.1 From ed900c054b541254f0ce5cedaf75206e29bd614e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: perf_counter: Log vfork as a fork event Right now we don't output vfork events. Even though we should always see an exec after a vfork, we may get perfcounter samples between the vfork and exec. These samples can lead to some confusion when parsing perfcounter data. To keep things consistent we should always log a fork event. It will result in a little more log data, but is less confusing to trace parsing tools. Signed-off-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <20090716104817.589309391@samba.org> Signed-off-by: Ingo Molnar --- kernel/fork.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 467746b..4812d60 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1408,14 +1408,11 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); - } else if (!(clone_flags & CLONE_VM)) { - /* - * vfork will do an exec which will call - * set_task_comm() - */ - perf_counter_fork(p); } + if (!(clone_flags & CLONE_THREAD)) + perf_counter_fork(p); + audit_finish_fork(p); tracehook_report_clone(regs, clone_flags, nr, p); -- cgit v1.1 From e5d490b252423605a77c54b2e35b10ea663763df Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Jul 2009 12:23:11 +0100 Subject: profile: Suppress warning about large allocations when profile=1 is specified When profile= is used, a large buffer is allocated early at boot. This can be larger than what the page allocator can provide so it prints a warning. However, the caller is able to handle the situation so this patch suppresses the warning. Signed-off-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Cc: Linux Memory Management List Cc: Heinz Diehl Cc: David Miller Cc: Arnaldo Carvalho de Melo Cc: Mel Gorman Cc: Andrew Morton LKML-Reference: <1247656992-19846-3-git-send-email-mel@csn.ul.ie> Signed-off-by: Ingo Molnar --- kernel/profile.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 69911b5..419250e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -117,11 +117,12 @@ int __ref profile_init(void) cpumask_copy(prof_cpu_mask, cpu_possible_mask); - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; - prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); + prof_buffer = alloc_pages_exact(buffer_bytes, + GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); if (prof_buffer) return 0; -- cgit v1.1 From a468d389349a7560249b355cdb6d2097ea1616c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jul 2009 14:15:46 +0200 Subject: sched: fix load average accounting vs. cpu hotplug The new load average code clears rq->calc_load_active on CPU_ONLINE. 
That's wrong as the new onlined CPU might have got a scheduler tick already and accounted the delta to the stale value of the time we offlined the CPU. Clear the value when we cleanup the dead CPU instead. Also move the update of the calc_load_update time for the newly online CPU to CPU_UP_PREPARE to avoid that the CPU plays catch up with the stale update time value. Signed-off-by: Thomas Gleixner --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 98972d3..1b59e26 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7289,6 +7289,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) static void calc_global_load_remove(struct rq *rq) { atomic_long_sub(rq->calc_load_active, &calc_load_tasks); + rq->calc_load_active = 0; } #endif /* CONFIG_HOTPLUG_CPU */ @@ -7515,6 +7516,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) task_rq_unlock(rq, &flags); get_task_struct(p); cpu_rq(cpu)->migration_thread = p; + rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: @@ -7525,8 +7527,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); - rq->calc_load_update = calc_load_update; - rq->calc_load_active = 0; if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -- cgit v1.1 From 6301cb95c119ebf324bb96ee226fa9ddffad80a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jul 2009 14:15:47 +0200 Subject: sched: fix nr_uninterruptible accounting of frozen tasks really commit e3c8ca8336 (sched: do not count frozen tasks toward load) broke the nr_uninterruptible accounting on freeze/thaw. On freeze the task is excluded from accounting with a check for (task->flags & PF_FROZEN), but that flag is cleared before the task is thawed. So while we prevent that the task with state TASK_UNINTERRUPTIBLE is accounted to nr_uninterruptible on freeze we decrement nr_uninterruptible on thaw. Use a separate flag which is handled by the freezing task itself. Set it before calling the scheduler with TASK_UNINTERRUPTIBLE state and clear it after we return from frozen state. Cc: Signed-off-by: Thomas Gleixner --- kernel/freezer.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index 2f4936c..bd1d42b 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,12 +44,19 @@ void refrigerator(void) recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(¤t->sighand->siglock); + /* prevent accounting of that task to load */ + current->flags |= PF_FREEZING; + for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!frozen(current)) break; schedule(); } + + /* Remove the accounting blocker */ + current->flags &= ~PF_FREEZING; + pr_debug("%s left refrigerator\n", current->comm); __set_current_state(save); } -- cgit v1.1 From 4841158b26e28e1476eed84c7347c18f11317750 Mon Sep 17 00:00:00 2001 From: Pavel Roskin Date: Sat, 18 Jul 2009 16:46:02 -0400 Subject: timer: Avoid reading uninitialized data timer->expires may be uninitialized, so check timer_pending() before touching timer->expires to pacify kmemcheck. 
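The whole fix is operand order plus the short-circuit rule of &&: the left operand is evaluated first and the right one is skipped when the left is false, so putting timer_pending() first guarantees the possibly-uninitialized expires field is never read for a timer that was never armed. A stand-alone sketch with simplified types:

	#include <stdio.h>
	#include <stdbool.h>

	struct timer_list {
		bool pending;
		unsigned long expires;		/* garbage until the timer is armed */
	};

	static bool timer_pending(const struct timer_list *t)
	{
		return t->pending;
	}

	static bool same_expiry(const struct timer_list *t, unsigned long expires)
	{
		/* expires is only read when the timer is actually pending */
		return timer_pending(t) && t->expires == expires;
	}

	int main(void)
	{
		struct timer_list t;
		t.pending = false;		/* t.expires deliberately left uninitialized */

		printf("%d\n", same_expiry(&t, 100));	/* safe: never touches t.expires */
		return 0;
	}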
Signed-off-by: Pavel Roskin LKML-Reference: <20090718204602.5191.360.stgit@mj.roinet.com> Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 0b36b9e..a7f07d5 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -714,7 +714,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) * networking code - if the timer is re-modified * to be the same thing then just return: */ - if (timer->expires == expires && timer_pending(timer)) + if (timer_pending(timer) && timer->expires == expires) return 1; return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); -- cgit v1.1 From 79ef2bb01445400def20c7993b27fbcad27ca95f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 Jul 2009 17:09:12 +0200 Subject: clocksource: Prevent NULL pointer dereference Writing a zero length string to sys/.../current_clocksource will cause a NULL pointer dereference if the clock events system is in one shot (highres or nohz) mode. Pointed-out-by: Dan Carpenter LKML-Reference: Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 592bf58..7466cb8 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -513,7 +513,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, * Check to make sure we don't switch to a non-highres capable * clocksource if the tick code is in oneshot mode (highres or nohz) */ - if (tick_oneshot_mode_active() && + if (tick_oneshot_mode_active() && ovr && !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { printk(KERN_WARNING "%s clocksource is not HRT compatible. " "Cannot switch while in HRT/NOHZ mode\n", ovr->name); -- cgit v1.1 From 591d2fb02ea80472d846c0b8507007806bdd69cc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 11:09:39 +0200 Subject: genirq: Delegate irq affinity setting to the irq thread irq_set_thread_affinity() calls set_cpus_allowed_ptr() which might sleep, but irq_set_thread_affinity() is called with desc->lock held and can be called from hard interrupt context as well. The code has another bug as it does not hold a ref on the task struct as required by set_cpus_allowed_ptr(). Just set the IRQTF_AFFINITY bit in action->thread_flags. The next time the thread runs it migrates itself. Solves all of the above problems nicely. Add kerneldoc to irq_set_thread_affinity() while at it. 
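The pattern the patch switches to, sketched generically: a context that must not sleep only sets a flag bit, and the thread that is allowed to sleep notices the bit on its next loop iteration and does the work itself. The kernel uses atomic set_bit()/test_and_clear_bit(); this single-threaded sketch uses plain operations and invented names.

	#include <stdio.h>

	#define TF_AFFINITY	(1UL << 0)

	static unsigned long thread_flags;

	/* may be called with a spinlock held or from hard interrupt context:
	 * it only flips a bit and can never sleep */
	static void request_affinity_update(void)
	{
		thread_flags |= TF_AFFINITY;
	}

	/* runs in the irq thread, where sleeping (set_cpus_allowed_ptr) is fine */
	static void irq_thread_iteration(void)
	{
		if (thread_flags & TF_AFFINITY) {
			thread_flags &= ~TF_AFFINITY;
			printf("thread updates its own CPU affinity here\n");
		}
	}

	int main(void)
	{
		request_affinity_update();	/* from the "atomic" side */
		irq_thread_iteration();		/* deferred work happens here */
		return 0;
	}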
Signed-off-by: Thomas Gleixner LKML-Reference: --- kernel/irq/internals.h | 3 +-- kernel/irq/manage.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ kernel/irq/migration.c | 2 +- 3 files changed, 46 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 7346825..e70ed55 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -42,8 +42,7 @@ static inline void unregister_handler_proc(unsigned int irq, extern int irq_select_affinity_usr(unsigned int irq); -extern void -irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); +extern void irq_set_thread_affinity(struct irq_desc *desc); /* * Debugging printout: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 50da676..f0de36f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq) return 1; } -void -irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) +/** + * irq_set_thread_affinity - Notify irq threads to adjust affinity + * @desc: irq descriptor which has affitnity changed + * + * We just set IRQTF_AFFINITY and delegate the affinity setting + * to the interrupt thread itself. We can not call + * set_cpus_allowed_ptr() here as we hold desc->lock and this + * code can be called from hard interrupt context. + */ +void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action = desc->action; while (action) { if (action->thread) - set_cpus_allowed_ptr(action->thread, cpumask); + set_bit(IRQTF_AFFINITY, &action->thread_flags); action = action->next; } } @@ -112,7 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) if (desc->status & IRQ_MOVE_PCNTXT) { if (!desc->chip->set_affinity(irq, cpumask)) { cpumask_copy(desc->affinity, cpumask); - irq_set_thread_affinity(desc, cpumask); + irq_set_thread_affinity(desc); } } else { @@ -122,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) #else if (!desc->chip->set_affinity(irq, cpumask)) { cpumask_copy(desc->affinity, cpumask); - irq_set_thread_affinity(desc, cpumask); + irq_set_thread_affinity(desc); } #endif desc->status |= IRQ_AFFINITY_SET; @@ -176,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); if (!ret) - irq_set_thread_affinity(desc, desc->affinity); + irq_set_thread_affinity(desc); spin_unlock_irqrestore(&desc->lock, flags); return ret; @@ -444,6 +452,34 @@ static int irq_wait_for_interrupt(struct irqaction *action) } /* + * Check whether we need to change the affinity of the interrupt thread. 
+ */ +static void +irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) +{ + cpumask_var_t mask; + + if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) + return; + + /* + * In case we are out of memory we set IRQTF_AFFINITY again and + * try again next time + */ + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + set_bit(IRQTF_AFFINITY, &action->thread_flags); + return; + } + + spin_lock_irq(&desc->lock); + cpumask_copy(mask, desc->affinity); + spin_unlock_irq(&desc->lock); + + set_cpus_allowed_ptr(current, mask); + free_cpumask_var(mask); +} + +/* * Interrupt handler thread */ static int irq_thread(void *data) @@ -458,6 +494,8 @@ static int irq_thread(void *data) while (!irq_wait_for_interrupt(action)) { + irq_thread_check_affinity(desc, action); + atomic_inc(&desc->threads_active); spin_lock_irq(&desc->lock); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index cfe767c..fcb6c96 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -45,7 +45,7 @@ void move_masked_irq(int irq) < nr_cpu_ids)) if (!desc->chip->set_affinity(irq, desc->pending_mask)) { cpumask_copy(desc->affinity, desc->pending_mask); - irq_set_thread_affinity(desc, desc->pending_mask); + irq_set_thread_affinity(desc); } cpumask_clear(desc->pending_mask); -- cgit v1.1 From 9ba5f005c994ad28e266a0cd14ef29354be382c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 14:18:35 +0200 Subject: softirq: introduce tasklet_hrtimer infrastructure commit ca109491f (hrtimer: removing all ur callback modes) moved all hrtimer callbacks into hard interrupt context when high resolution timers are active. That breaks code which relied on the assumption that the callback happens in softirq context. Provide a generic infrastructure which combines tasklets and hrtimers together to provide an in-softirq hrtimer experience. Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Cc: kaber@trash.net Cc: David Miller LKML-Reference: <1248265724.27058.1366.camel@twins> Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 3a94905..eb5e131 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *)) softirq_vec[nr].action = action; } -/* Tasklets */ +/* + * Tasklets + */ struct tasklet_head { struct tasklet_struct *head; @@ -493,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t) EXPORT_SYMBOL(tasklet_kill); +/* + * tasklet_hrtimer + */ + +/* + * The trampoline is called when the hrtimer expires. If this is + * called from the hrtimer interrupt then we schedule the tasklet as + * the timer callback function expects to run in softirq context. If + * it's called in softirq context anyway (i.e. high resolution timers + * disabled) then the hrtimer callback is called right away. 
+ */ +static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) +{ + struct tasklet_hrtimer *ttimer = + container_of(timer, struct tasklet_hrtimer, timer); + + if (hrtimer_is_hres_active(timer)) { + tasklet_hi_schedule(&ttimer->tasklet); + return HRTIMER_NORESTART; + } + return ttimer->function(timer); +} + +/* + * Helper function which calls the hrtimer callback from + * tasklet/softirq context + */ +static void __tasklet_hrtimer_trampoline(unsigned long data) +{ + struct tasklet_hrtimer *ttimer = (void *)data; + enum hrtimer_restart restart; + + restart = ttimer->function(&ttimer->timer); + if (restart != HRTIMER_NORESTART) + hrtimer_restart(&ttimer->timer); +} + +/** + * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks + * @ttimer: tasklet_hrtimer which is initialized + * @function: hrtimer callback funtion which gets called from softirq context + * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) + * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) + */ +void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t which_clock, enum hrtimer_mode mode) +{ + hrtimer_init(&ttimer->timer, which_clock, mode); + ttimer->timer.function = __hrtimer_tasklet_trampoline; + tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, + (unsigned long)ttimer); + ttimer->function = function; +} +EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); + +/* + * Remote softirq bits + */ + DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); EXPORT_PER_CPU_SYMBOL(softirq_work_list); -- cgit v1.1 From c9f73a3dd27e03411f18a58c0814d51392d2b17a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 21 Jul 2009 00:55:05 -0700 Subject: perf: Fix stack data leak the "reserved" field was not initialized to zero, resulting in 4 bytes of stack data leaking to userspace.... Signed-off-by: Arjan van de Ven Signed-off-by: Peter Zijlstra --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5c6fae4..ff854fd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2666,6 +2666,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); + cpu_entry.reserved = 0; } if (sample_type & PERF_SAMPLE_PERIOD) -- cgit v1.1 From 573402db02746179b3f95f83a11a787501f52d0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 11:13:50 +0200 Subject: perf_counter: Plug more stack leaks Per example of Arjan's patch, I went through and found a few more. 
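The class of bug being plugged here, shown on a hypothetical structure rather than the perf code: a struct assembled on the kernel stack and copied wholesale to user space carries whatever stack garbage sits in any member that was never assigned.

    /* Hypothetical example of the bug class, not perf code. */
    #include <linux/types.h>
    #include <linux/uaccess.h>

    struct sample_header {
            u32 type;
            u16 misc;
            u16 size;
    };

    static int emit(void __user *ubuf)
    {
            struct sample_header h;         /* uninitialized stack memory */

            h.type = 1;
            h.size = sizeof(h);
            /* h.misc was never written: two bytes of kernel stack go out */
            if (copy_to_user(ubuf, &h, sizeof(h)))
                    return -EFAULT;
            return 0;
    }

The fix is to assign every member (hence the explicit .misc = 0 and the placeholder comments in the hunks below), or to zero the whole structure before filling it in.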
Signed-off-by: Peter Zijlstra --- kernel/perf_counter.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ff854fd..e1d6a3a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2897,8 +2897,11 @@ void perf_counter_fork(struct task_struct *task) .event = { .header = { .type = PERF_EVENT_FORK, + .misc = 0, .size = sizeof(fork_event.event), }, + /* .pid */ + /* .ppid */ }, }; @@ -3008,8 +3011,16 @@ void perf_counter_comm(struct task_struct *task) comm_event = (struct perf_comm_event){ .task = task, + /* .comm */ + /* .comm_size */ .event = { - .header = { .type = PERF_EVENT_COMM, }, + .header = { + .type = PERF_EVENT_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ }, }; @@ -3160,8 +3171,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma) mmap_event = (struct perf_mmap_event){ .vma = vma, + /* .file_name */ + /* .file_size */ .event = { - .header = { .type = PERF_EVENT_MMAP, }, + .header = { + .type = PERF_EVENT_MMAP, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ .start = vma->vm_start, .len = vma->vm_end - vma->vm_start, .pgoff = vma->vm_pgoff, -- cgit v1.1 From 7f453c24b95a085fc7bd35d53b33abc4dc5a048b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 13:19:40 +0200 Subject: perf_counter: PERF_SAMPLE_ID and inherited counters Anton noted that for inherited counters the counter-id as provided by PERF_SAMPLE_ID isn't mappable to the id found through PERF_RECORD_ID because each inherited counter gets its own id. His suggestion was to always return the parent counter id, since that is the primary counter id as exposed. However, these inherited counters have a unique identifier so that events like PERF_EVENT_PERIOD and PERF_EVENT_THROTTLE can be specific about which counter gets modified, which is important when trying to normalize the sample streams. This patch removes PERF_EVENT_PERIOD in favour of PERF_SAMPLE_PERIOD, which is more useful anyway, since changing periods became a lot more common than initially thought -- rendering PERF_EVENT_PERIOD the less useful solution (also, PERF_SAMPLE_PERIOD reports the more accurate value, since it reports the value used to trigger the overflow, whereas PERF_EVENT_PERIOD simply reports the requested period changed, which might only take effect on the next cycle). This still leaves us PERF_EVENT_THROTTLE to consider, but since that _should_ be a rare occurrence, and linking it to a primary id is the most useful bit to diagnose the problem, we introduce a PERF_SAMPLE_STREAM_ID, for those few cases where the full reconstruction is important. [Does change the ABI a little, but I see no other way out] Suggested-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <1248095846.15751.8781.camel@twins> --- kernel/perf_counter.c | 92 +++++++++++++++++---------------------------------- 1 file changed, 31 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e1d6a3a..7530588 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -155,6 +155,20 @@ static void unclone_ctx(struct perf_counter_context *ctx) } /* + * If we inherit counters we want to return the parent counter id + * to userspace. 
+ */ +static u64 primary_counter_id(struct perf_counter *counter) +{ + u64 id = counter->id; + + if (counter->parent) + id = counter->parent->id; + + return id; +} + +/* * Get the perf_counter_context for a task and lock it. * This has to cope with with the fact that until it is locked, * the context could get moved to another task. @@ -1296,7 +1310,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_counter *counter, int enable); -static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_period(struct perf_counter *counter, u64 events) { @@ -1315,8 +1328,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events) if (!sample_period) sample_period = 1; - perf_log_period(counter, sample_period); - hwc->sample_period = sample_period; } @@ -1705,7 +1716,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); if (counter->attr.read_format & PERF_FORMAT_ID) - values[n++] = counter->id; + values[n++] = primary_counter_id(counter); mutex_unlock(&counter->child_mutex); if (count < n * sizeof(u64)) @@ -1812,8 +1823,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) counter->attr.sample_freq = value; } else { - perf_log_period(counter, value); - counter->attr.sample_period = value; counter->hw.sample_period = value; } @@ -2662,6 +2671,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_ID) header.size += sizeof(u64); + if (sample_type & PERF_SAMPLE_STREAM_ID) + header.size += sizeof(u64); + if (sample_type & PERF_SAMPLE_CPU) { header.size += sizeof(cpu_entry); @@ -2705,7 +2717,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(&handle, data->addr); - if (sample_type & PERF_SAMPLE_ID) + if (sample_type & PERF_SAMPLE_ID) { + u64 id = primary_counter_id(counter); + + perf_output_put(&handle, id); + } + + if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(&handle, counter->id); if (sample_type & PERF_SAMPLE_CPU) @@ -2728,7 +2746,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sub != counter) sub->pmu->read(sub); - group_entry.id = sub->id; + group_entry.id = primary_counter_id(sub); group_entry.counter = atomic64_read(&sub->count); perf_output_put(&handle, group_entry); @@ -2788,15 +2806,8 @@ perf_counter_read_event(struct perf_counter *counter, } if (counter->attr.read_format & PERF_FORMAT_ID) { - u64 id; - event.header.size += sizeof(u64); - if (counter->parent) - id = counter->parent->id; - else - id = counter->id; - - event.format[i++] = id; + event.format[i++] = primary_counter_id(counter); } ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); @@ -3191,49 +3202,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma) } /* - * Log sample_period changes so that analyzing tools can re-normalize the - * event flow. 
- */ - -struct freq_event { - struct perf_event_header header; - u64 time; - u64 id; - u64 period; -}; - -static void perf_log_period(struct perf_counter *counter, u64 period) -{ - struct perf_output_handle handle; - struct freq_event event; - int ret; - - if (counter->hw.sample_period == period) - return; - - if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) - return; - - event = (struct freq_event) { - .header = { - .type = PERF_EVENT_PERIOD, - .misc = 0, - .size = sizeof(event), - }, - .time = sched_clock(), - .id = counter->id, - .period = period, - }; - - ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); - if (ret) - return; - - perf_output_put(&handle, event); - perf_output_end(&handle); -} - -/* * IRQ throttle logging */ @@ -3246,14 +3214,16 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) struct perf_event_header header; u64 time; u64 id; + u64 stream_id; } throttle_event = { .header = { .type = PERF_EVENT_THROTTLE + 1, .misc = 0, .size = sizeof(throttle_event), }, - .time = sched_clock(), - .id = counter->id, + .time = sched_clock(), + .id = primary_counter_id(counter), + .stream_id = counter->id, }; ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); -- cgit v1.1 From 966ee4d6b887c14159043ac80b8c3661d2bbe5e2 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 22 Jul 2009 23:05:46 +1000 Subject: perf_counter: Fix throttle/unthrottle event logging Right now we only print PERF_EVENT_THROTTLE + 1 (ie PERF_EVENT_UNTHROTTLE). Fix this to print both a throttle and unthrottle event. Signed-off-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <20090722130546.GE9029@kryten> --- kernel/perf_counter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7530588..787d4da 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3217,7 +3217,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) u64 stream_id; } throttle_event = { .header = { - .type = PERF_EVENT_THROTTLE + 1, + .type = PERF_EVENT_THROTTLE, .misc = 0, .size = sizeof(throttle_event), }, @@ -3226,6 +3226,9 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) .stream_id = counter->id, }; + if (enable) + throttle_event.header.type = PERF_EVENT_UNTHROTTLE; + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); if (ret) return; -- cgit v1.1 From 0dc3d523e8bc4718e0be2e4a742367d6e4be77cd Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 21 Jul 2009 00:55:05 -0700 Subject: perf: fix stack data leak the "reserved" field was not initialized to zero, resulting in 4 bytes of stack data leaking to userspace.... 
Signed-off-by: Arjan van de Ven Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a641eb7..7bc888d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2665,6 +2665,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); + cpu_entry.reserved = 0; } if (sample_type & PERF_SAMPLE_PERIOD) -- cgit v1.1 From 61f3826133dc07142935fb5712fc738e19eb5575 Mon Sep 17 00:00:00 2001 From: Bruno Premont Date: Wed, 22 Jul 2009 22:22:32 +0200 Subject: genirq: Fix UP compile failure caused by irq_thread_check_affinity Since genirq: Delegate irq affinity setting to the irq thread (591d2fb02ea80472d846c0b8507007806bdd69cc) compilation with CONFIG_SMP=n fails with following error: /usr/src/linux-2.6/kernel/irq/manage.c: In function 'irq_thread_check_affinity': /usr/src/linux-2.6/kernel/irq/manage.c:475: error: 'struct irq_desc' has no member named 'affinity' make[4]: *** [kernel/irq/manage.o] Error 1 That commit adds a new function irq_thread_check_affinity() which uses struct irq_desc.affinity which is only available for CONFIG_SMP=y. Move that function under #ifdef CONFIG_SMP. [ tglx@brownpaperbag: compile and boot tested on UP and SMP ] Signed-off-by: Bruno Premont LKML-Reference: <20090722222232.2eb3e1c4@neptune.home> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f0de36f..61c679d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -451,6 +451,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) return -1; } +#ifdef CONFIG_SMP /* * Check whether we need to change the affinity of the interrupt thread. */ @@ -478,6 +479,10 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } +#else +static inline void +irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } +#endif /* * Interrupt handler thread -- cgit v1.1 From d8cc1ab793993c886c62abf77c93287df33ffd8b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 23 Jul 2009 11:28:40 +0800 Subject: trace_stack: Fix seqfile memory leak Every time we cat stack_trace, we leak memory allocated by seq_open(). 
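As a reminder of the pairing that this and the following two patches restore, a minimal generic sketch (names are illustrative): any .open that calls seq_open() needs a .release that calls seq_release(), otherwise the struct seq_file and the buffer seq_open() allocates are leaked on every open/close cycle.

    /* Generic sketch of the required pairing; names are illustrative. */
    #include <linux/fs.h>
    #include <linux/seq_file.h>

    static const struct seq_operations example_seq_ops;  /* .start/.next/.stop/.show
                                                             elided in this sketch */

    static int example_open(struct inode *inode, struct file *file)
    {
            return seq_open(file, &example_seq_ops);
    }

    static const struct file_operations example_fops = {
            .open    = example_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = seq_release,   /* frees what seq_open() allocated */
    };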
Signed-off-by: Li Zefan LKML-Reference: <4A67D8E8.3020500@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e644af9..6a2a9d4 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -301,17 +301,14 @@ static const struct seq_operations stack_trace_seq_ops = { static int stack_trace_open(struct inode *inode, struct file *file) { - int ret; - - ret = seq_open(file, &stack_trace_seq_ops); - - return ret; + return seq_open(file, &stack_trace_seq_ops); } static const struct file_operations stack_trace_fops = { .open = stack_trace_open, .read = seq_read, .llseek = seq_lseek, + .release = seq_release, }; int -- cgit v1.1 From 87827111a5538633b18e5c641ced673c4c2bb6ce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 23 Jul 2009 11:29:11 +0800 Subject: function-graph: Fix seqfile memory leak Every time we cat set_graph_function, we leak memory allocated by seq_open(). Signed-off-by: Li Zefan LKML-Reference: <4A67D907.2010500@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4521c77..1f3ec2a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2596,6 +2596,14 @@ ftrace_graph_open(struct inode *inode, struct file *file) } static int +ftrace_graph_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + return 0; +} + +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer) { struct dyn_ftrace *rec; @@ -2724,9 +2732,10 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, } static const struct file_operations ftrace_graph_fops = { - .open = ftrace_graph_open, - .read = seq_read, - .write = ftrace_graph_write, + .open = ftrace_graph_open, + .read = seq_read, + .write = ftrace_graph_write, + .release = ftrace_graph_release, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- cgit v1.1 From 636eacee3b0c76915151db37203cc624becb6d7b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 23 Jul 2009 11:29:47 +0800 Subject: tracing/stat: Fix seqfile memory leak Every time we cat a trace_stat file, we leak memory allocated by seq_open(). Also fix memory leak in a failure path in tracing_stat_open(). 
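Building on the pairing sketched above, the tracing_stat_open() rework also illustrates the ordering rule for open handlers that have a private setup step, again in hedged generic form (example_init() and example_reset() are hypothetical helpers): do the private setup first, undo it if seq_open() fails, and make release undo both.

    /* Generic sketch; example_init()/example_reset() are hypothetical. */
    static int example_open(struct inode *inode, struct file *file)
    {
            struct seq_file *m;
            int ret;

            ret = example_init(inode->i_private);     /* build the data to show */
            if (ret)
                    return ret;

            ret = seq_open(file, &example_seq_ops);
            if (ret) {
                    example_reset(inode->i_private);  /* unwind on failure */
                    return ret;
            }

            m = file->private_data;
            m->private = inode->i_private;
            return 0;
    }

    static int example_release(struct inode *inode, struct file *file)
    {
            example_reset(inode->i_private);
            return seq_release(inode, file);          /* and the seq_file itself */
    }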
Signed-off-by: Li Zefan LKML-Reference: <4A67D92B.4060704@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stat.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index e66f5e4..aea321c 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -73,7 +73,7 @@ static struct rb_node *release_next(struct rb_node *node) } } -static void reset_stat_session(struct stat_session *session) +static void __reset_stat_session(struct stat_session *session) { struct rb_node *node = session->stat_root.rb_node; @@ -83,10 +83,17 @@ static void reset_stat_session(struct stat_session *session) session->stat_root = RB_ROOT; } +static void reset_stat_session(struct stat_session *session) +{ + mutex_lock(&session->stat_mutex); + __reset_stat_session(session); + mutex_unlock(&session->stat_mutex); +} + static void destroy_session(struct stat_session *session) { debugfs_remove(session->file); - reset_stat_session(session); + __reset_stat_session(session); mutex_destroy(&session->stat_mutex); kfree(session); } @@ -150,7 +157,7 @@ static int stat_seq_init(struct stat_session *session) int i; mutex_lock(&session->stat_mutex); - reset_stat_session(session); + __reset_stat_session(session); if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; @@ -183,7 +190,7 @@ exit: return ret; exit_free_rbtree: - reset_stat_session(session); + __reset_stat_session(session); mutex_unlock(&session->stat_mutex); return ret; } @@ -250,16 +257,21 @@ static const struct seq_operations trace_stat_seq_ops = { static int tracing_stat_open(struct inode *inode, struct file *file) { int ret; - + struct seq_file *m; struct stat_session *session = inode->i_private; + ret = stat_seq_init(session); + if (ret) + return ret; + ret = seq_open(file, &trace_stat_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = session; - ret = stat_seq_init(session); + if (ret) { + reset_stat_session(session); + return ret; } + m = file->private_data; + m->private = session; return ret; } @@ -270,11 +282,9 @@ static int tracing_stat_release(struct inode *i, struct file *f) { struct stat_session *session = i->i_private; - mutex_lock(&session->stat_mutex); reset_stat_session(session); - mutex_unlock(&session->stat_mutex); - return 0; + return seq_release(i, f); } static const struct file_operations tracing_stat_fops = { -- cgit v1.1 From 4c739ff043e5787d97c9691d62cabf7a29e75a9d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Jul 2009 23:11:03 -0400 Subject: tracing: show proper address for trace-printk format Since the trace_printk may use pointers to the format fields in the buffer, they are exported via debugfs/tracing/printk_formats. This is used by utilities that read the ring buffer in binary format. It helps the utilities map the address of the format in the binary buffer to what the printf format looks like. Unfortunately, the way the output code works, it exports the address of the pointer to the format address, and not the format address itself. This makes the file totally useless in trying to figure out what format string a binary address belongs to. 
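To make the pointer levels explicit, a small hedged illustration (hypothetical values, not the tracer code): the iterator hands t_show() a pointer into the table of format pointers, so casting the iterator itself yields the address of the table slot, while dereferencing it once yields the address of the format string, which is what the binary trace records actually contain.

    /* Illustration of the two addresses involved; values are hypothetical. */
    static const char *fmt_table[] = { "hello %d\n" };  /* stands in for the
                                                           registered formats */
    const char **fmt = &fmt_table[0];                   /* what t_show() iterates */

    unsigned long slot = (unsigned long)fmt;     /* address of the table entry,
                                                    useless to user space tools */
    unsigned long str  = *(unsigned long *)fmt;  /* address of "hello %d\n",
                                                    what the records refer to */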
Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 7b62781..687699d 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -176,7 +176,7 @@ static int t_show(struct seq_file *m, void *v) const char *str = *fmt; int i; - seq_printf(m, "0x%lx : \"", (unsigned long)fmt); + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); /* * Tabs and new lines need to be converted. -- cgit v1.1 From 8650ae32ef7045e763825dee6256dde7f331bb85 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Jul 2009 23:29:30 -0400 Subject: tracing: only truncate ftrace files when O_TRUNC is set The current code will truncate the ftrace files contents if O_APPEND is not set and the file is opened in write mode. This is incorrect. It should only truncate the file if O_TRUNC is set. Otherwise if one of these files is opened by a C program with fopen "r+", it will incorrectly truncate the file. Reported-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_events.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1f3ec2a..1e1d23c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1662,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) + (file->f_flags & O_TRUNC)) ftrace_filter_reset(enable); if (file->f_mode & FMODE_READ) { @@ -2577,7 +2577,7 @@ ftrace_graph_open(struct inode *inode, struct file *file) mutex_lock(&graph_lock); if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) { + (file->f_flags & O_TRUNC)) { ftrace_graph_count = 0; memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8bc8d8a..d8ef285 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2031,7 +2031,7 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) { + (file->f_flags & O_TRUNC)) { long cpu = (long) inode->i_private; if (cpu == TRACE_PIPE_ALL_CPU) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53c8fd3..23d2972 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -376,7 +376,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) const struct seq_operations *seq_ops; if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) + (file->f_flags & O_TRUNC)) ftrace_clear_events(); seq_ops = inode->i_private; -- cgit v1.1 From bdff78707f3ce47e891f3201c9666122a70556ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2009 15:30:45 -0400 Subject: trace: stop tracer in oops_enter() If trace_printk_on_oops is set we lose interesting trace information when the tracer is enabled across oops handling and printing. We want the trace which might give us information _WHY_ we oopsed. 
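tracing_off() is the generic switch that stops further ring buffer writes, so the same trick is available to any kernel code that spots a condition it wants to inspect after the fact. A hedged sketch follows; the device and its check are hypothetical, and the assumption that the declaration lives in linux/kernel.h in this era should be verified against the tree in use.

    /* Hedged sketch: freeze the trace as soon as something looks wrong, so
     * the events leading up to it survive in the ring buffer. */
    #include <linux/kernel.h>

    static void check_state(struct my_dev *dev)          /* hypothetical device */
    {
            if (unlikely(dev->state == STATE_CORRUPTED)) { /* hypothetical check */
                    tracing_off();        /* no further writes, history is kept */
                    WARN_ON(1);
            }
    }

The frozen buffer can then be read from the tracing debugfs files after the fact.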
Signed-off-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/panic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 984b3ec..512ab73 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -301,6 +301,7 @@ int oops_may_print(void) */ void oops_enter(void) { + tracing_off(); /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); do_oops_enter_exit(); -- cgit v1.1 From 6560dc160f3a96b8f1f43e2c6b51aa6eb9898b90 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Thu, 23 Jul 2009 23:42:08 +0930 Subject: module: use MODULE_SYMBOL_PREFIX with module_layout The check_modstruct_version() needs to look up the symbol "module_layout" in the kernel, but it does so literally and not by a C identifier. The trouble is that it does not include a symbol prefix for those ports that need it (like the Blackfin and H8300 port). So make sure we tack on the MODULE_SYMBOL_PREFIX define to the front of it. Signed-off-by: Mike Frysinger Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0a04983..fd141140 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1068,7 +1068,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; - if (!find_symbol("module_layout", NULL, &crc, true, false)) + if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, + &crc, true, false)) BUG(); return check_version(sechdrs, versindex, "module_layout", mod, crc); } -- cgit v1.1 From 9ae260270c90643156cda73427aa1f04c923e627 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 19 Jun 2009 02:51:13 +0200 Subject: update the comment in kthread_stop() Commit 63706172f332fd3f6e7458ebfb35fa6de9c21dc5 ("kthreads: rework kthread_stop()") removed the limitation that the thread function mysr not call do_exit() itself, but forgot to update the comment. Since that commit it is OK to use kthread_stop() even if kthread can exit itself. Signed-off-by: Oleg Nesterov Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/kthread.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9b1a7de..eb8751a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,10 +180,12 @@ EXPORT_SYMBOL(kthread_bind); * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and - * waits for it to exit. Your threadfn() must not call do_exit() - * itself if you use this function! This can also be called after - * kthread_create() instead of calling wake_up_process(): the thread - * will exit without calling threadfn(). + * waits for it to exit. This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will exit without + * calling threadfn(). + * + * If threadfn() may call do_exit() itself, the caller must ensure + * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. -- cgit v1.1 From 38ceb592fcac9110c6b3c87ea0a27bff68c43486 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Jul 2009 20:11:24 +0800 Subject: tracing: Fix invalid function_graph entry When print_graph_entry() computes a function call entry event, it needs to also check the next entry to guess if it matches the return event of the current function entry. 
In order to look at this next event, it needs to consume the current entry before going ahead in the ring buffer. However, if the current event that gets consumed is the last one in the ring buffer head page, the ring_buffer may reuse the page for writers. The consumed entry will then become invalid because of possible racy overwriting. Me must then handle this entry by making a copy of it. The fix also applies on 2.6.30 Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: stable@kernel.org LKML-Reference: <4A6EEAEC.3050508@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_functions_graph.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d2249ab..420ec34 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -843,9 +843,16 @@ print_graph_function(struct trace_iterator *iter) switch (entry->type) { case TRACE_GRAPH_ENT: { - struct ftrace_graph_ent_entry *field; + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * sizeof(struct ftrace_graph_ent_entry) is very small, + * it can be safely saved at the stack. + */ + struct ftrace_graph_ent_entry *field, saved; trace_assign_type(field, entry); - return print_graph_entry(field, s, iter); + saved = *field; + return print_graph_entry(&saved, s, iter); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; -- cgit v1.1 From 74e7ff8c50b6b022e6ffaa736b16a4dc161d3eaf Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Jul 2009 20:17:22 +0800 Subject: tracing: Fix missing function_graph events when we splice_read from trace_pipe About a half events are missing when we splice_read from trace_pipe. They are unexpectedly consumed because we ignore the TRACE_TYPE_NO_CONSUME return value used by the function graph tracer when it needs to consume the events by itself to walk on the ring buffer. 
The same problem appears with ftrace_dump() Example of an output before this patch: 1) | ktime_get_real() { 1) 2.846 us | read_hpet(); 1) 4.558 us | } 1) 6.195 us | } After this patch: 0) | ktime_get_real() { 0) | getnstimeofday() { 0) 1.960 us | read_hpet(); 0) 3.597 us | } 0) 5.196 us | } The fix also applies on 2.6.30 Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: stable@kernel.org LKML-Reference: <4A6EEC52.90704@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8bc8d8a..da984ad 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3085,7 +3085,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) break; } - trace_consume(iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); rem -= count; if (!find_next_entry_inc(iter)) { rem = 0; @@ -4233,8 +4234,11 @@ static void __ftrace_dump(bool disable_tracing) iter.pos = -1; if (find_next_entry_inc(&iter) != NULL) { - print_trace_line(&iter); - trace_consume(&iter); + int ret; + + ret = print_trace_line(&iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(&iter); } trace_printk_seq(&iter.seq); -- cgit v1.1 From 933b787b57ca8bdc0fc8fb2cbf67b5e6d21beb84 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 29 Jul 2009 15:02:07 -0700 Subject: mm: copy over oom_adj value at fork time Fix a post-2.6.31 regression which was introduced by 2ff05b2b4eac2e63d345fc731ea151a060247f53 ("oom: move oom_adj value from task_struct to mm_struct"). After moving the oom_adj value from the task struct to the mm_struct, the oom_adj value was no longer properly inherited by child processes. Copying over the oom_adj value at fork time fixes that bug. [kosaki.motohiro@jp.fujitsu.com: test for current->mm before dereferencing it] Signed-off-by: Rik van Riel Reported-by: Paul Menage Cc: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9b42695..29b532e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -426,6 +426,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; + mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); -- cgit v1.1 From 11c7da4b0ca76a57f51c996c883c480e203cf5a9 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 29 Jul 2009 15:02:08 -0700 Subject: kexec: fix omitting offset in extended crashkernel syntax Setting "crashkernel=512M-2G:64M,2G-:128M" does not work but it turns to work if it has a trailing-whitespace, like "crashkernel=512M-2G:64M,2G-:128M ". It was because of a bug in the parser, running over the cmdline. This patch adds a check of the termination. 
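The general scanning rule the fix applies, in hedged standalone form (not the kexec code itself): a loop that advances a cursor looking for delimiters must also stop at the terminating NUL, otherwise a string that ends without any of the delimiters walks past the end of the buffer.

    /* Standalone illustration of the scanning rule. */
    const char *cur = arg;                 /* arg is a NUL-terminated string */

    /* buggy: runs past the end when neither ' ' nor '@' ever appears */
    while (*cur != ' ' && *cur != '@')
            cur++;

    /* fixed: the NUL terminator always ends the scan */
    while (*cur && *cur != ' ' && *cur != '@')
            cur++;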
Reported-by: Jin Dongming Signed-off-by: Hidetoshi Seto Tested-by: Jin Dongming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ae1c352..f336e21 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline, } while (*cur++ == ','); if (*crash_size > 0) { - while (*cur != ' ' && *cur != '@') + while (*cur && *cur != ' ' && *cur != '@') cur++; if (*cur == '@') { cur++; -- cgit v1.1 From 096b7fe012d66ed55e98bc8022405ede0cc80e96 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Jul 2009 15:04:04 -0700 Subject: cgroups: fix pid namespace bug The bug was introduced by commit cc31edceee04a7b87f2be48f9489ebb72d264844 ("cgroups: convert tasks file to use a seq_file with shared pid array"). We cache a pid array for all threads that are opening the same "tasks" file, but the pids in the array are always from the namespace of the last process that opened the file, so all other threads will read pids from that namespace instead of their own namespaces. To fix it, we maintain a list of pid arrays, which is keyed by pid_ns. The list will be of length 1 at most time. Reported-by: Paul Menage Idea-by: Paul Menage Signed-off-by: Li Zefan Reviewed-by: Serge Hallyn Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 96 ++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3737a68..250dac0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -47,6 +47,7 @@ #include #include #include +#include #include @@ -960,6 +961,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); + INIT_LIST_HEAD(&cgrp->pids_list); init_rwsem(&cgrp->pids_mutex); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -2201,12 +2203,30 @@ err: return ret; } +/* + * Cache pids for all threads in the same pid namespace that are + * opening the same "tasks" file. + */ +struct cgroup_pids { + /* The node in cgrp->pids_list */ + struct list_head list; + /* The cgroup those pids belong to */ + struct cgroup *cgrp; + /* The namepsace those pids belong to */ + struct pid_namespace *ns; + /* Array of process ids in the cgroup */ + pid_t *tasks_pids; + /* How many files are using the this tasks_pids array */ + int use_count; + /* Length of the current tasks_pids array */ + int length; +}; + static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } - /* * seq_file methods for the "tasks" file. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid @@ -2221,45 +2241,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) * after a seek to the start). 
Use a binary-search to find the * next pid to display, if any */ - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; + struct cgroup *cgrp = cp->cgrp; int index = 0, pid = *pos; int *iter; down_read(&cgrp->pids_mutex); if (pid) { - int end = cgrp->pids_length; + int end = cp->length; while (index < end) { int mid = (index + end) / 2; - if (cgrp->tasks_pids[mid] == pid) { + if (cp->tasks_pids[mid] == pid) { index = mid; break; - } else if (cgrp->tasks_pids[mid] <= pid) + } else if (cp->tasks_pids[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ - if (index >= cgrp->pids_length) + if (index >= cp->length) return NULL; /* Update the abstract position to be the actual pid that we found */ - iter = cgrp->tasks_pids + index; + iter = cp->tasks_pids + index; *pos = *iter; return iter; } static void cgroup_tasks_stop(struct seq_file *s, void *v) { - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; + struct cgroup *cgrp = cp->cgrp; up_read(&cgrp->pids_mutex); } static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; int *p = v; - int *end = cgrp->tasks_pids + cgrp->pids_length; + int *end = cp->tasks_pids + cp->length; /* * Advance to the next pid in the array. If this goes off the @@ -2286,26 +2308,33 @@ static struct seq_operations cgroup_tasks_seq_operations = { .show = cgroup_tasks_show, }; -static void release_cgroup_pid_array(struct cgroup *cgrp) +static void release_cgroup_pid_array(struct cgroup_pids *cp) { + struct cgroup *cgrp = cp->cgrp; + down_write(&cgrp->pids_mutex); - BUG_ON(!cgrp->pids_use_count); - if (!--cgrp->pids_use_count) { - kfree(cgrp->tasks_pids); - cgrp->tasks_pids = NULL; - cgrp->pids_length = 0; + BUG_ON(!cp->use_count); + if (!--cp->use_count) { + list_del(&cp->list); + put_pid_ns(cp->ns); + kfree(cp->tasks_pids); + kfree(cp); } up_write(&cgrp->pids_mutex); } static int cgroup_tasks_release(struct inode *inode, struct file *file) { - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct seq_file *seq; + struct cgroup_pids *cp; if (!(file->f_mode & FMODE_READ)) return 0; - release_cgroup_pid_array(cgrp); + seq = file->private_data; + cp = seq->private; + + release_cgroup_pid_array(cp); return seq_release(inode, file); } @@ -2324,6 +2353,8 @@ static struct file_operations cgroup_tasks_operations = { static int cgroup_tasks_open(struct inode *unused, struct file *file) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct pid_namespace *ns = current->nsproxy->pid_ns; + struct cgroup_pids *cp; pid_t *pidarray; int npids; int retval; @@ -2350,20 +2381,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) * array if necessary */ down_write(&cgrp->pids_mutex); - kfree(cgrp->tasks_pids); - cgrp->tasks_pids = pidarray; - cgrp->pids_length = npids; - cgrp->pids_use_count++; + + list_for_each_entry(cp, &cgrp->pids_list, list) { + if (ns == cp->ns) + goto found; + } + + cp = kzalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) { + up_write(&cgrp->pids_mutex); + kfree(pidarray); + return -ENOMEM; + } + cp->cgrp = cgrp; + cp->ns = ns; + get_pid_ns(ns); + list_add(&cp->list, &cgrp->pids_list); +found: + kfree(cp->tasks_pids); + cp->tasks_pids = pidarray; + cp->length = npids; + cp->use_count++; up_write(&cgrp->pids_mutex); file->f_op = &cgroup_tasks_operations; retval = seq_open(file, &cgroup_tasks_seq_operations); if (retval) { - 
release_cgroup_pid_array(cgrp); + release_cgroup_pid_array(cp); return retval; } - ((struct seq_file *)file->private_data)->private = cgrp; + ((struct seq_file *)file->private_data)->private = cp; return 0; } -- cgit v1.1 From 887032670d47366a8c8f25396ea7c14b7b2cc620 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 29 Jul 2009 15:04:06 -0700 Subject: cgroup avoid permanent sleep at rmdir After commit ec64f51545fffbc4cb968f0cea56341a4b07e85a ("cgroup: fix frequent -EBUSY at rmdir"), cgroup's rmdir (especially against memcg) doesn't return -EBUSY by temporary ref counts. That commit expects all refs after pre_destroy() is temporary but...it wasn't. Then, rmdir can wait permanently. This patch tries to fix that and change followings. - set CGRP_WAIT_ON_RMDIR flag before pre_destroy(). - clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case. if there are sleeping ones, wakes them up. - rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set. Tested-by: Daisuke Nishimura Reported-by: Daisuke Nishimura Reviewed-by: Paul Menage Acked-by: Balbir Sigh Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 55 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 250dac0..b6eadfe 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -735,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry) * reference to css->refcnt. In general, this refcnt is expected to goes down * to zero, soon. * - * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; + * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; */ DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); -static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) { - if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) + if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) wake_up_all(&cgroup_rmdir_waitq); } +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) +{ + css_get(css); +} + +void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) +{ + cgroup_wakeup_rmdir_waiter(css->cgroup); + css_put(css); +} + + static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -1359,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) * wake up rmdir() waiter. the rmdir should fail since the cgroup * is no longer empty. */ - cgroup_wakeup_rmdir_waiters(cgrp); + cgroup_wakeup_rmdir_waiter(cgrp); return 0; } @@ -2744,33 +2756,42 @@ again: mutex_unlock(&cgroup_mutex); /* + * In general, subsystem has no css->refcnt after pre_destroy(). But + * in racy cases, subsystem may have to get css->refcnt after + * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes + * make rmdir return -EBUSY too often. To avoid that, we use waitqueue + * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir + * and subsystem's reference count handling. Please see css_get/put + * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. + */ + set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + + /* * Call pre_destroy handlers of subsys. Notify subsystems * that rmdir() request comes. 
*/ ret = cgroup_call_pre_destroy(cgrp); - if (ret) + if (ret) { + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); return ret; + } mutex_lock(&cgroup_mutex); parent = cgrp->parent; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); mutex_unlock(&cgroup_mutex); return -EBUSY; } - /* - * css_put/get is provided for subsys to grab refcnt to css. In typical - * case, subsystem has no reference after pre_destroy(). But, under - * hierarchy management, some *temporal* refcnt can be hold. - * To avoid returning -EBUSY to a user, waitqueue is used. If subsys - * is really busy, it should return -EBUSY at pre_destroy(). wake_up - * is called when css_put() is called and refcnt goes down to 0. - */ - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - if (!cgroup_clear_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); - schedule(); + /* + * Because someone may call cgroup_wakeup_rmdir_waiter() before + * prepare_to_wait(), we need to check this flag. + */ + if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) + schedule(); finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); if (signal_pending(current)) @@ -3342,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - cgroup_wakeup_rmdir_waiters(cgrp); + cgroup_wakeup_rmdir_waiter(cgrp); } rcu_read_unlock(); } -- cgit v1.1 From b62f495dad04fa94b5083aec638ff3072bccaaca Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 29 Jul 2009 15:04:09 -0700 Subject: profile: suppress warning about large allocations when profile=1 is specified When profile= is used, a large buffer is allocated early at boot. This can be larger than what the page allocator can provide so it prints a warning. However, the caller is able to handle the situation so this patch suppresses the warning. Signed-off-by: Mel Gorman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 69911b5..419250e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -117,11 +117,12 @@ int __ref profile_init(void) cpumask_copy(prof_cpu_mask, cpu_possible_mask); - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; - prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); + prof_buffer = alloc_pages_exact(buffer_bytes, + GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); if (prof_buffer) return 0; -- cgit v1.1 From ec30c5f3a18722f8fcf8c83146a10b03ac4d9ff1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 28 Jul 2009 19:47:23 -0400 Subject: kprobes: Use kernel_text_address() for checking probe address Use kernel_text_address() for checking probe address instead of __kernel_text_address(), because __kernel_text_address() returns true for init functions even after relaseing those functions. That will hit a BUG() in text_poke(). 
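For context, a minimal kprobe registration sketch using the standard kprobes API of the time ("do_fork" is just an arbitrary probe target and the module boilerplate is abbreviated): register_kprobe() is where the text-address sanity check above runs, and it has to reject addresses that are no longer kernel text before text_poke() is ever reached.

    /* Minimal kprobe sketch; "do_fork" is an arbitrary example target. */
    #include <linux/kprobes.h>
    #include <linux/module.h>

    static int example_pre(struct kprobe *p, struct pt_regs *regs)
    {
            pr_info("kprobe hit at %p\n", p->addr);
            return 0;
    }

    static struct kprobe example_kp = {
            .symbol_name = "do_fork",    /* resolved to an address on register */
            .pre_handler = example_pre,
    };

    static int __init example_init(void)
    {
            return register_kprobe(&example_kp);  /* address check happens here */
    }

    static void __exit example_exit(void)
    {
            unregister_kprobe(&example_kp);
    }

    module_init(example_init);
    module_exit(example_exit);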
Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 16b5739..0540948 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -694,7 +694,7 @@ int __kprobes register_kprobe(struct kprobe *p) p->addr = addr; preempt_disable(); - if (!__kernel_text_address((unsigned long) p->addr) || + if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr)) { preempt_enable(); return -EINVAL; -- cgit v1.1 From 0083fc2c50e6c5127c2802ad323adf8143ab7856 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 1 Aug 2009 10:34:56 -0700 Subject: do_sigaltstack: avoid copying 'stack_t' as a structure to user space Ulrich Drepper correctly points out that there is generally padding in the structure on 64-bit hosts, and that copying the structure from kernel to user space can leak information from the kernel stack in those padding bytes. Avoid the whole issue by just copying the three members one by one instead, which also means that the function also can avoid the need for a stack frame. This also happens to match how we copy the new structure from user space, so it all even makes sense. [ The obvious solution of adding a memset() generates horrid code, gcc does really stupid things. ] Reported-by: Ulrich Drepper Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ccf1cee..f268372c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2454,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s stack_t oss; int error; - if (uoss) { - oss.ss_sp = (void __user *) current->sas_ss_sp; - oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp); - } + oss.ss_sp = (void __user *) current->sas_ss_sp; + oss.ss_size = current->sas_ss_size; + oss.ss_flags = sas_ss_flags(sp); if (uss) { void __user *ss_sp; @@ -2501,13 +2499,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s current->sas_ss_size = ss_size; } + error = 0; if (uoss) { error = -EFAULT; - if (copy_to_user(uoss, &oss, sizeof(oss))) + if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss))) goto out; + error = __put_user(oss.ss_sp, &uoss->ss_sp) | + __put_user(oss.ss_size, &uoss->ss_size) | + __put_user(oss.ss_flags, &uoss->ss_flags); } - error = 0; out: return error; } -- cgit v1.1 From 0dd8486b5cfe8048e0613334659d9252ecd1b08a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 1 Aug 2009 11:18:56 -0700 Subject: do_sigaltstack: small cleanups The previous commit ("do_sigaltstack: avoid copying 'stack_t' as a structure to user space") fixed a real bug. This one just cleans up the copy from user space to that gcc can generate better code for it (and so that it looks the same as the later copy back to user space). 
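Where those padding bytes come from, as a hedged illustration of the layout on a typical 64-bit target (mirroring the real definition of stack_t): there is a four-byte hole after ss_flags, and a whole-struct copy ships that hole, that is, stale kernel stack bytes, to user space, while per-member __put_user() calls never touch it.

    /* Layout illustration for a typical LP64 target. */
    typedef struct sigaltstack {
            void   *ss_sp;      /* bytes  0..7  */
            int     ss_flags;   /* bytes  8..11 */
                                /* bytes 12..15: padding, never written */
            size_t  ss_size;    /* bytes 16..23 */
    } stack_t;

    /* copy_to_user(uoss, &oss, sizeof(oss)) copies bytes 12..15 as well, so
     * whatever was on the kernel stack there reaches user space.  Writing
     * ss_sp, ss_size and ss_flags with individual __put_user() calls, as the
     * patch above does, only ever stores the named members. */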
Signed-off-by: Linus Torvalds --- kernel/signal.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f268372c..64c5dee 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2464,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s int ss_flags; error = -EFAULT; - if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) - || __get_user(ss_sp, &uss->ss_sp) - || __get_user(ss_flags, &uss->ss_flags) - || __get_user(ss_size, &uss->ss_size)) + if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) + goto out; + error = __get_user(ss_sp, &uss->ss_sp) | + __get_user(ss_flags, &uss->ss_flags) | + __get_user(ss_size, &uss->ss_size); + if (error) goto out; error = -EPERM; -- cgit v1.1 From e53c0994709166b111fbe9162d1a16ece7dfc45b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jul 2009 14:42:10 +0200 Subject: perf_counter: Collapse inherit on read() Currently the counter value returned by read() is the value of the parent counter, to which child counters are only fed back on child exit. Thus read() can return rather erratic (and meaningless) numbers depending on the state of the child processes. Change this by always iterating the full child hierarchy on read() and sum all counters. Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 9509310..48471d7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1688,6 +1688,18 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } +static u64 perf_counter_read_tree(struct perf_counter *counter) +{ + struct perf_counter *child; + u64 total = 0; + + total += perf_counter_read(counter); + list_for_each_entry(child, &counter->child_list, child_list) + total += perf_counter_read(child); + + return total; +} + /* * Read the performance counter - simple non blocking version for now */ @@ -1707,7 +1719,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); - values[0] = perf_counter_read(counter); + values[0] = perf_counter_read_tree(counter); n = 1; if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = counter->total_time_enabled + -- cgit v1.1 From 9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jul 2009 14:46:33 +0200 Subject: perf_counter: Full task tracing In order to be able to distinguish between no samples due to inactivity and no samples due to task ended, Arjan asked for PERF_EVENT_EXIT events. This is useful to the boot delay instrumentation (bootchart) app. This patch changes the PERF_EVENT_FORK to be emitted on every clone, and adds PERF_EVENT_EXIT to be emitted on task exit, after the task's counters have been closed. This task tracing is controlled through: attr.comm || attr.mmap and through the new attr.task field. 
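A rough sketch of how a user space tool would request these records after this change. The attribute field names are the ones used in this patch series; the header name, the software counter constants and the way the counter is opened (the perf_counter_open syscall of this era, invoked by hand since there is no libc wrapper) are assumptions to check against the tree in use.

    /* Hedged user space sketch; verify names against the kernel headers in use. */
    #include <string.h>
    #include <linux/perf_counter.h>     /* assumed header name for this era */

    static void setup_task_tracking_attr(struct perf_counter_attr *attr)
    {
            memset(attr, 0, sizeof(*attr));
            attr->size        = sizeof(*attr);
            attr->type        = PERF_TYPE_SOFTWARE;
            attr->config      = PERF_COUNT_SW_TASK_CLOCK;
            attr->sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME;
            attr->mmap        = 1;   /* PERF_EVENT_MMAP records           */
            attr->comm        = 1;   /* PERF_EVENT_COMM records           */
            attr->task        = 1;   /* PERF_EVENT_FORK / PERF_EVENT_EXIT */
    }

    /* fd = perf_counter_open(&attr, pid, -1, -1, 0);  (raw syscall, an
     * assumption for this era)  The FORK/EXIT records then appear in the
     * counter's mmap()ed ring buffer alongside COMM and MMAP events. */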
Suggested-by: Arjan van de Ven Cc: Paul Mackerras Cc: Anton Blanchard Signed-off-by: Peter Zijlstra [ cleaned up perf_counter.h a bit ] Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 +-- kernel/perf_counter.c | 87 ++++++++++++++++++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 29b532e..466531e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1269,6 +1269,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); + perf_counter_fork(p); return p; bad_fork_free_pid: @@ -1410,9 +1411,6 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - if (!(clone_flags & CLONE_THREAD)) - perf_counter_fork(p); - audit_finish_fork(p); tracehook_report_clone(regs, clone_flags, nr, p); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 48471d7..199ed47 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; +static atomic_t nr_task_counters __read_mostly; /* * perf counter paranoia level: @@ -1654,6 +1655,8 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_mmap_counters); if (counter->attr.comm) atomic_dec(&nr_comm_counters); + if (counter->attr.task) + atomic_dec(&nr_task_counters); } if (counter->destroy) @@ -2831,10 +2834,12 @@ perf_counter_read_event(struct perf_counter *counter, } /* - * fork tracking + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task */ -struct perf_fork_event { +struct perf_task_event { struct task_struct *task; struct { @@ -2842,37 +2847,42 @@ struct perf_fork_event { u32 pid; u32 ppid; + u32 tid; + u32 ptid; } event; }; -static void perf_counter_fork_output(struct perf_counter *counter, - struct perf_fork_event *fork_event) +static void perf_counter_task_output(struct perf_counter *counter, + struct perf_task_event *task_event) { struct perf_output_handle handle; - int size = fork_event->event.header.size; - struct task_struct *task = fork_event->task; + int size = task_event->event.header.size; + struct task_struct *task = task_event->task; int ret = perf_output_begin(&handle, counter, size, 0, 0); if (ret) return; - fork_event->event.pid = perf_counter_pid(counter, task); - fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); + task_event->event.pid = perf_counter_pid(counter, task); + task_event->event.ppid = perf_counter_pid(counter, task->real_parent); - perf_output_put(&handle, fork_event->event); + task_event->event.tid = perf_counter_tid(counter, task); + task_event->event.ptid = perf_counter_tid(counter, task->real_parent); + + perf_output_put(&handle, task_event->event); perf_output_end(&handle); } -static int perf_counter_fork_match(struct perf_counter *counter) +static int perf_counter_task_match(struct perf_counter *counter) { - if (counter->attr.comm || counter->attr.mmap) + if (counter->attr.comm || counter->attr.mmap || counter->attr.task) return 1; return 0; } -static void perf_counter_fork_ctx(struct perf_counter_context *ctx, - struct perf_fork_event *fork_event) +static void perf_counter_task_ctx(struct perf_counter_context *ctx, + struct perf_task_event *task_event) { struct perf_counter *counter; @@ -2881,19 +2891,19 @@ static void 
perf_counter_fork_ctx(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_fork_match(counter)) - perf_counter_fork_output(counter, fork_event); + if (perf_counter_task_match(counter)) + perf_counter_task_output(counter, task_event); } rcu_read_unlock(); } -static void perf_counter_fork_event(struct perf_fork_event *fork_event) +static void perf_counter_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_fork_ctx(&cpuctx->ctx, fork_event); + perf_counter_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); @@ -2903,32 +2913,40 @@ static void perf_counter_fork_event(struct perf_fork_event *fork_event) */ ctx = rcu_dereference(current->perf_counter_ctxp); if (ctx) - perf_counter_fork_ctx(ctx, fork_event); + perf_counter_task_ctx(ctx, task_event); rcu_read_unlock(); } -void perf_counter_fork(struct task_struct *task) +static void perf_counter_task(struct task_struct *task, int new) { - struct perf_fork_event fork_event; + struct perf_task_event task_event; if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters)) + !atomic_read(&nr_mmap_counters) && + !atomic_read(&nr_task_counters)) return; - fork_event = (struct perf_fork_event){ + task_event = (struct perf_task_event){ .task = task, .event = { .header = { - .type = PERF_EVENT_FORK, + .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, .misc = 0, - .size = sizeof(fork_event.event), + .size = sizeof(task_event.event), }, /* .pid */ /* .ppid */ + /* .tid */ + /* .ptid */ }, }; - perf_counter_fork_event(&fork_event); + perf_counter_task_event(&task_event); +} + +void perf_counter_fork(struct task_struct *task) +{ + perf_counter_task(task, 1); } /* @@ -3887,6 +3905,8 @@ done: atomic_inc(&nr_mmap_counters); if (counter->attr.comm) atomic_inc(&nr_comm_counters); + if (counter->attr.task) + atomic_inc(&nr_task_counters); } return counter; @@ -4248,8 +4268,10 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter_context *child_ctx; unsigned long flags; - if (likely(!child->perf_counter_ctxp)) + if (likely(!child->perf_counter_ctxp)) { + perf_counter_task(child, 0); return; + } local_irq_save(flags); /* @@ -4267,15 +4289,22 @@ void perf_counter_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. */ spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all * the counters from it. */ unclone_ctx(child_ctx); - spin_unlock(&child_ctx->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the counters so that we + * won't get any samples after PERF_EVENT_EXIT. We can however still + * get a few PERF_EVENT_READ events. + */ + perf_counter_task(child, 0); + + child->perf_counter_ctxp = NULL; /* * We can recurse on the same lock type through: -- cgit v1.1 From e414314cce7539788dd5d2c35decad11782dd858 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jul 2009 20:13:26 +0200 Subject: sched: Fix latencytop and sleep profiling vs group scheduling The latencytop and sleep accounting code assumes that any scheduler entity represents a task, this is not so. 
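With group scheduling a sched_entity can stand for a whole task group, so the per-task hooks have to be guarded rather than assumed. A reduced sketch of the guard the patch introduces (simplified, hypothetical wrapper name):

static void entity_sleep_stats(struct sched_entity *se, u64 delta)
{
        struct task_struct *tsk = NULL;

        if (entity_is_task(se))                 /* group entities have no task */
                tsk = task_of(se);

        se->sum_sleep_runtime += delta;         /* entity-level stats always apply */

        if (tsk)                                /* task-only bookkeeping */
                account_scheduler_latency(tsk, delta >> 10, 1);
}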
Cc: Arjan van de Ven Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9ffb2b2..652e8bd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -611,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS + struct task_struct *tsk = NULL; + + if (entity_is_task(se)) + tsk = task_of(se); + if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; - struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -624,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sleep_start = 0; se->sum_sleep_runtime += delta; - account_scheduler_latency(tsk, delta >> 10, 1); + if (tsk) + account_scheduler_latency(tsk, delta >> 10, 1); } if (se->block_start) { u64 delta = rq_of(cfs_rq)->clock - se->block_start; - struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -639,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->block_start = 0; se->sum_sleep_runtime += delta; - /* - * Blocking time is in units of nanosecs, so shift by 20 to - * get a milliseconds-range estimation of the amount of - * time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - - profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), - delta >> 20); + if (tsk) { + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(tsk), + delta >> 20); + } + account_scheduler_latency(tsk, delta >> 10, 0); } - account_scheduler_latency(tsk, delta >> 10, 0); } #endif } -- cgit v1.1 From 07903af152b0597d94e9b0030746b63c4664e787 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Thu, 30 Jul 2009 10:57:28 -0400 Subject: sched: Fix race in cpupri introduced by cpumask_var changes Background: Several race conditions in the scheduler have cropped up recently, which Steven and I have tracked down using ftrace. The most recent one turns out to be a race in how the scheduler determines a suitable migration target for RT tasks, introduced recently with commit: commit 68e74568fbe5854952355e942acca51f138096d9 Date: Tue Nov 25 02:35:13 2008 +1030 sched: convert struct cpupri_vec cpumask_var_t. The original design of cpupri allowed lockless readers to quickly determine a best-estimate target. Races between the pri_active bitmap and the vec->mask were handled in the original code because we would detect and return "0" when this occured. The design was predicated on the *effective* atomicity (*) of caching the result of cpus_and() between the cpus_allowed and the vec->mask. Commit 68e74568 changed the behavior such that vec->mask is accessed multiple times. This introduces a subtle race, the result of which means we can have a result that returns "1", but with an empty bitmap. *) yes, we know cpus_and() is not a locked operator across the entire composite array, but it is implicitly atomic on a per-word basis which is all the design required to work. 
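In code, the hazard and the chosen recovery look roughly like this, reduced to a single-word mask (stand-alone illustration, not the scheduler source):

static int find_level(unsigned long task_mask, const unsigned long *vec_mask,
                      int nr_levels, unsigned long *lowest)
{
        int level;

        for (level = 0; level < nr_levels; level++) {
                if (!(task_mask & vec_mask[level]))     /* first read: looks usable */
                        continue;

                *lowest = task_mask & vec_mask[level];  /* second read */
                if (!*lowest)
                        continue;       /* mask emptied between the two reads */

                return 1;
        }
        return 0;
}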
Implementation: Rather than forgoing the lockless design, or reverting to a stack-based cpumask_t, we simply check for when the race has been encountered and continue processing in the event that the race is hit. This renders the removal race as if the priority bit had been atomically cleared as well, and allows the algorithm to execute correctly. Signed-off-by: Gregory Haskins CC: Rusty Russell CC: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <20090730145728.25226.92769.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- kernel/sched_cpupri.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6c2517..d014efb 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; - if (lowest_mask) + if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + + /* + * We have to ensure that we have at least one bit + * still set in the array, since the map could have + * been concurrently emptied between the first and + * second reads of vec->mask. If we hit this + * condition, simply act as though we never hit this + * priority level and continue on. + */ + if (cpumask_any(lowest_mask) >= nr_cpu_ids) + continue; + } + return 1; } -- cgit v1.1 From 70d715fd0597f18528f389b5ac59102263067744 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 3 Aug 2009 11:48:19 +0900 Subject: posix-timers: Fix oops in clock_nanosleep() with CLOCK_MONOTONIC_RAW Prevent calling do_nanosleep() with clockid CLOCK_MONOTONIC_RAW, it may cause oops, such as NULL pointer dereference. Signed-off-by: Hiroshi Shimamoto Cc: Andrew Morton Cc: Thomas Gleixner Cc: John Stultz Cc: LKML-Reference: <4A764FF3.50607@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- kernel/posix-timers.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 052ec4d..d089d05 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer) return -EOPNOTSUPP; } +static int no_nsleep(const clockid_t which_clock, int flags, + struct timespec *tsave, struct timespec __user *rmtp) +{ + return -EOPNOTSUPP; +} + /* * Return nonzero if we know a priori this clockid_t value is bogus. */ @@ -254,6 +260,7 @@ static __init int init_posix_timers(void) .clock_get = posix_get_monotonic_raw, .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, + .nsleep = no_nsleep, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); -- cgit v1.1 From 0f2541d299d233eddddee4345795e0c46264fd56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Aug 2009 12:02:48 -0400 Subject: ring-buffer: fix check of try_to_discard result The function ring_buffer_discard_commit inversed the code path of the result of try_to_discard. It should skip incrementing the entry counter if try_to_discard succeeded. But instead, it increments the entry conder if it succeeded to discard, and does not increment it if it fails. The result of this bug is that filtering will make the stat counters incorrect. 
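Reduced to its essentials, the intended flow is: count the entry only when the discard did not succeed. A stand-alone illustration (hypothetical names, not the ring-buffer source):

static int try_to_discard(int can_discard)
{
        return can_discard;             /* nonzero: event removed in place */
}

static void discard_commit(int can_discard, unsigned long *entries)
{
        if (try_to_discard(can_discard))
                return;                 /* discarded: nothing left to account */

        (*entries)++;                   /* discard failed: the event remains */
}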
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bf27bb7..2fd1752 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1785,7 +1785,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); - if (!rb_try_to_discard(cpu_buffer, event)) + if (rb_try_to_discard(cpu_buffer, event)) goto out; /* -- cgit v1.1 From 464e85eb0e63096bd52e4c3e2a6fb8357fb95828 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Aug 2009 15:26:37 -0400 Subject: ring-buffer: do not disable ring buffer on oops_in_progress The commit: commit e0fdace10e75dac67d906213b780ff1b1a4cc360 Author: David Miller Date: Fri Aug 1 01:11:22 2008 -0700 debug_locks: set oops_in_progress if we will log messages. Otherwise lock debugging messages on runqueue locks can deadlock the system due to the wakeups performed by printk(). Signed-off-by: David S. Miller Signed-off-by: Ingo Molnar Will permanently set oops_in_progress on any lockdep failure. When this triggers it will cause any read from the ring buffer to permanently disable the ring buffer (not to mention no locking of printk). This patch removes the check. It keeps the print in NMI which makes sense. This is probably OK, since the ring buffer should not cause something to set oops_in_progress anyway. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2fd1752..2606cee 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2486,7 +2486,7 @@ static inline int rb_ok_to_lock(void) * buffer too. A one time deal is all you get from reading * the ring buffer from an NMI. */ - if (likely(!in_nmi() && !oops_in_progress)) + if (likely(!in_nmi())) return 1; tracing_off_permanent(); -- cgit v1.1 From 1bbf20835c4e088667a090ce6523a0f70b62dc76 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Wed, 5 Aug 2009 12:05:21 -0700 Subject: rtmutex: Avoid deadlock in rt_mutex_start_proxy_lock() In the event of a lock steal or owner died, rt_mutex_start_proxy_lock() will give the rt_mutex to the waiting task, but it fails to release the wait_lock. This leads to subsequent deadlocks when other tasks try to acquire the rt_mutex. I also removed a few extra blank lines that really spaced this routine out. I must have been high on the \n when I wrote this originally... Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A79D7F1.4000405@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rtmutex.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index fcd107a..29bd4ba 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { /* We got the lock for task. */ debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, task, 0); - + spin_unlock(&lock->wait_lock); rt_mutex_deadlock_account_lock(lock, task); return 1; } ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); - if (ret && !waiter->task) { /* * Reset the return value. 
We might have -- cgit v1.1 From af6af30c0fcd77e621638e53ef8b176bca8bd3b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Aug 2009 20:41:04 +0200 Subject: ftrace: Fix perf-tracepoint OOPS Not all tracepoints are created equal, in specific the ftrace tracepoints are created with TRACE_EVENT_FORMAT() which does not generate the needed bits to tie them into perf counters. For those events, don't create the 'id' file and fail ->profile_enable when their ID is specified through other means. Reported-by: Chris Mason Signed-off-by: Peter Zijlstra Cc: Steven Rostedt LKML-Reference: <1249497664.5890.4.camel@laptop> [ v2: fix build error in the !CONFIG_EVENT_PROFILE case ] Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_profile.c | 2 +- kernel/trace/trace_events.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 5b5895a..11ba5bb 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { + if (event->id == event_id && event->profile_enable) { ret = event->profile_enable(event); break; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 23d2972..e75276a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -940,7 +940,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, entry = trace_create_file("enable", 0644, call->dir, call, enable); - if (call->id) + if (call->id && call->profile_enable) entry = trace_create_file("id", 0444, call->dir, call, id); -- cgit v1.1 From 469535a598f28c13a2a42037e1b778f671af1d16 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 30 Jul 2009 19:19:18 +0200 Subject: ring-buffer: Fix advance of reader in rb_buffer_peek() When calling rb_buffer_peek() from ring_buffer_consume() and a padding event is returned, the function rb_advance_reader() is called twice. This may lead to missing samples or under high workloads to the warning below. This patch fixes this. If a padding event is returned by rb_buffer_peek() it will be consumed by the calling function now. Also, I simplified some code in ring_buffer_consume(). ------------[ cut here ]------------ WARNING: at /dev/shm/.source/linux/kernel/trace/ring_buffer.c:2289 rb_advance_reader+0x2e/0xc5() Hardware name: Anaheim Modules linked in: Pid: 29, comm: events/2 Tainted: G W 2.6.31-rc3-oprofile-x86_64-standard-00059-g5050dc2 #1 Call Trace: [] ? rb_advance_reader+0x2e/0xc5 [] warn_slowpath_common+0x77/0x8f [] warn_slowpath_null+0xf/0x11 [] rb_advance_reader+0x2e/0xc5 [] ring_buffer_consume+0xa0/0xd2 [] op_cpu_buffer_read_entry+0x21/0x9e [] ? __find_get_block+0x4b/0x165 [] sync_buffer+0xa5/0x401 [] ? __find_get_block+0x4b/0x165 [] ? wq_sync_buffer+0x0/0x78 [] wq_sync_buffer+0x5b/0x78 [] worker_thread+0x113/0x1ac [] ? autoremove_wake_function+0x0/0x38 [] ? worker_thread+0x0/0x1ac [] kthread+0x88/0x92 [] child_rip+0xa/0x20 [] ? kthread+0x0/0x92 [] ? 
child_rip+0x0/0x20 ---[ end trace f561c0a58fcc89bd ]--- Cc: Steven Rostedt Cc: Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2606cee..d4d3580 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2383,7 +2383,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) * the box. Return the padding, and we will release * the current locks, and try again. */ - rb_advance_reader(cpu_buffer); return event; case RINGBUF_TYPE_TIME_EXTEND: @@ -2519,6 +2518,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); + if (event && event->type_len == RINGBUF_TYPE_PADDING) + rb_advance_reader(cpu_buffer); if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); @@ -2590,12 +2591,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); - if (!event) - goto out_unlock; - - rb_advance_reader(cpu_buffer); + if (event) + rb_advance_reader(cpu_buffer); - out_unlock: if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); -- cgit v1.1 From 1054598cab8674438675085fae459e960eb10799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2009 18:06:26 +0200 Subject: perf_counter: Fix double list iteration in per task precise stats Brice Goglin reported this crash with per task precise stats: > I finally managed to test the threaded perfcounter statistics (thanks a > lot for implementing it). I am running 2.6.31-rc5 (with the AMD > magny-cours patches but I don't think they matter here). I am trying to > measure local/remote memory accesses per thread during the well-known > stream benchmark. It's compiled with OpenMP using 16 threads on a > quad-socket quad-core barcelona machine. > > Command line is: > /mnt/scratch/bgoglin/cpunode/linux-2.6.31/tools/perf/perf record -f -s > -e r1000001e0 -e r1000002e0 -e r1000004e0 -e r1000008e0 ./stream > > It seems to work fine with a single -e on the command line > while it crashes when there are at least 2 of them. > It seems to work fine without -s as well. A silly copy-paste resulted in a messed up iteration which would cause the OOPS. Reported-by: Brice Goglin Signed-off-by: Peter Zijlstra Tested-by: Brice Goglin LKML-Reference: <1249574786.32113.550.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 199ed47..673c1aa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1104,7 +1104,7 @@ static void perf_counter_sync_stat(struct perf_counter_context *ctx, __perf_counter_sync_stat(counter, next_counter); counter = list_next_entry(counter, event_entry); - next_counter = list_next_entry(counter, event_entry); + next_counter = list_next_entry(next_counter, event_entry); } } -- cgit v1.1 From 9795447f71324d8f14c19ed68b43c883135c3f59 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Aug 2009 16:37:10 +0800 Subject: lockdep: Fix file mode of lock_stat /proc/lock_stat is writable. 
Signed-off-by: Li Zefan Cc: Peter Zijlstra LKML-Reference: <4A7BE7B6.10904@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d7135aa..e94caa6 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void) &proc_lockdep_stats_operations); #ifdef CONFIG_LOCK_STAT - proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); + proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, + &proc_lock_stat_operations); #endif return 0; -- cgit v1.1 From bd3f02212d6a457267e0c9c02c426151c436d9d4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Aug 2009 12:49:29 +0200 Subject: ring-buffer: Fix memleak in ring_buffer_free() I noticed oprofile memleaked in linux-2.6 current tree, and tracked this ring-buffer leak. Signed-off-by: Eric Dumazet LKML-Reference: <4A7C06B9.2090302@gmail.com> Cc: stable@kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d4d3580..a330513 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -735,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer) put_online_cpus(); + kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); kfree(buffer); -- cgit v1.1 From 69dd647f969c28d18de77e2153f30d05a1874571 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 6 Aug 2009 15:07:29 -0700 Subject: generic-ipi: fix hotplug_cfd() Use CONFIG_HOTPLUG_CPU, not CONFIG_CPU_HOTPLUG When hot-unpluging a cpu, it will leak memory allocated at cpu hotplug, but only if CPUMASK_OFFSTACK=y, which is default to n. The bug was introduced by 8969a5ede0f9e17da4b943712429aef2c9bcd82b ("generic-ipi: remove kmalloc()"). Signed-off-by: Xiao Guangrong Cc: Ingo Molnar Cc: Jens Axboe Cc: Nick Piggin Cc: Peter Zijlstra Cc: Rusty Russell Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index ad63d85..94188b8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_BAD; break; -#ifdef CONFIG_CPU_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: -- cgit v1.1 From 9c8a8228d0827e0d91d28527209988f672f97d28 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Aug 2009 15:09:28 -0700 Subject: execve: must clear current->clear_child_tid While looking at Jens Rosenboom bug report (http://lkml.org/lkml/2009/7/27/35) about strange sys_futex call done from a dying "ps" program, we found following problem. clone() syscall has special support for TID of created threads. This support includes two features. One (CLONE_CHILD_SETTID) is to set an integer into user memory with the TID value. One (CLONE_CHILD_CLEARTID) is to clear this same integer once the created thread dies. The integer location is a user provided pointer, provided at clone() time. kernel keeps this pointer value into current->clear_child_tid. At execve() time, we should make sure kernel doesnt keep this user provided pointer, as full user memory is replaced by a new one. 
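For reference, the two flags look like this from user space (hypothetical minimal example, error handling omitted):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdlib.h>
#include <sys/types.h>

static pid_t child_tid;

static int child_fn(void *arg)
{
        return 0;
}

int main(void)
{
        char *stack = malloc(64 * 1024);

        /*
         * CLONE_CHILD_SETTID: the kernel writes the child's TID into
         * child_tid at creation.  CLONE_CHILD_CLEARTID: the kernel zeroes
         * that location (and futex-wakes any waiter) when the child exits,
         * which is exactly the pointer kept in current->clear_child_tid.
         */
        clone(child_fn, stack + 64 * 1024,
              CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD,
              NULL, NULL, NULL, &child_tid);
        return 0;
}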
As glibc fork() actually uses clone() syscall with CLONE_CHILD_SETTID and CLONE_CHILD_CLEARTID set, chances are high that we might corrupt user memory in forked processes. Following sequence could happen: 1) bash (or any program) starts a new process, by a fork() call that glibc maps to a clone( ... CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID ...) syscall 2) When new process starts, its current->clear_child_tid is set to a location that has a meaning only in bash (or initial program) context (&THREAD_SELF->tid) 3) This new process does the execve() syscall to start a new program. current->clear_child_tid is left unchanged (a non NULL value) 4) If this new program creates some threads, and initial thread exits, kernel will attempt to clear the integer pointed by current->clear_child_tid from mm_release() : if (tsk->clear_child_tid && !(tsk->flags & PF_SIGNALED) && atomic_read(&mm->mm_users) > 1) { u32 __user * tidptr = tsk->clear_child_tid; tsk->clear_child_tid = NULL; /* * We don't check the error code - if userspace has * not set up a proper pointer then tough luck. */ << here >> put_user(0, tidptr); sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0); } 5) OR : if new program is not multi-threaded, but spied by /proc/pid users (ps command for example), mm_users > 1, and the exiting program could corrupt 4 bytes in a persistent memory area (shm or memory mapped file) If current->clear_child_tid points to a writeable portion of memory of the new program, kernel happily and silently corrupts 4 bytes of memory, with unexpected effects. Fix is straightforward and should not break any sane program. Reported-by: Jens Rosenboom Acked-by: Linus Torvalds Signed-off-by: Eric Dumazet Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Sonny Rao Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ulrich Drepper Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 466531e..021e113 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -568,18 +568,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * the value intact in a core dump, and to save the unnecessary * trouble otherwise. Userland only wants this done for a sys_exit. */ - if (tsk->clear_child_tid - && !(tsk->flags & PF_SIGNALED) - && atomic_read(&mm->mm_users) > 1) { - u32 __user * tidptr = tsk->clear_child_tid; + if (tsk->clear_child_tid) { + if (!(tsk->flags & PF_SIGNALED) && + atomic_read(&mm->mm_users) > 1) { + /* + * We don't check the error code - if userspace has + * not set up a proper pointer then tough luck. + */ + put_user(0, tsk->clear_child_tid); + sys_futex(tsk->clear_child_tid, FUTEX_WAKE, + 1, NULL, NULL, 0); + } tsk->clear_child_tid = NULL; - - /* - * We don't check the error code - if userspace has - * not set up a proper pointer then tough luck. - */ - put_user(0, tidptr); - sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0); } } -- cgit v1.1 From ad7d6c7a0654a4bbda3e109f56af713267e96274 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 4 Aug 2009 09:01:33 -0700 Subject: x86/irq: Fix move_irq_desc() for nodes without ram Don't move it if target node is -1. 
Signed-off-by: Yinghai Lu LKML-Reference: <4A785B5D.4070702@kernel.org> Signed-off-by: Ingo Molnar --- kernel/irq/numa_migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 2f69bee..3fd3019 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -107,8 +107,8 @@ out_unlock: struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { - /* those all static, do move them */ - if (desc->irq < NR_IRQS_LEGACY) + /* those static or target node is -1, do not move them */ + if (desc->irq < NR_IRQS_LEGACY || node == -1) return desc; if (desc->node != node) -- cgit v1.1 From 96b2de313b1e0e02aea80ee47df6a2b5cbdf8e13 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 8 Aug 2009 10:49:09 -0500 Subject: tracing/filters: Don't use pred on alloc failure Dan Carpenter sent me a fix to prevent pred from being used if it couldn't be allocated. I noticed the same problem also existed for the create_pred() case and added a fix for that. Reported-by: Dan Carpenter Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <1249746549.6453.29.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 936c621..1557148 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1029,6 +1029,8 @@ static int replace_preds(struct event_subsystem *system, if (elt->op == OP_AND || elt->op == OP_OR) { pred = create_logical_pred(elt->op); + if (!pred) + return -ENOMEM; if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); @@ -1048,6 +1050,8 @@ static int replace_preds(struct event_subsystem *system, } pred = create_pred(elt->op, operand1, operand2); + if (!pred) + return -ENOMEM; if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); -- cgit v1.1 From 26528e773ecc74fb1b61b7275f86f761cbb340ec Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 8 Aug 2009 10:49:53 -0500 Subject: tracing/filters: Always free pred on filter_add_subsystem_pred() failure If filter_add_subsystem_pred() fails due to ENOSPC or ENOMEM, the pred doesn't get freed, while as a side effect it does for other errors. Make it so the caller always frees the pred for any error. 
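Both filter fixes converge on the usual ownership rule: the allocation either fails before anyone sees it, or the caller frees it no matter how the consumer fared. A stand-alone sketch of that shape (hypothetical names, plain C):

#include <stdlib.h>

struct pred { int op; };

static struct pred *create_pred(int op)
{
        struct pred *p = malloc(sizeof(*p));

        if (p)
                p->op = op;
        return p;
}

/* Copies whatever it needs; never keeps the pointer. */
static int consume_pred(const struct pred *p)
{
        return p->op < 0 ? -1 : 0;
}

static int add_filter(int op)
{
        struct pred *p = create_pred(op);
        int err;

        if (!p)
                return -1;      /* allocation failed: nothing to free */

        err = consume_pred(p);
        free(p);                /* freed on success and on failure alike */
        return err;
}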
Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <1249746593.6453.32.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1557148..f32dc9d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -624,9 +624,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, return -ENOSPC; } - filter->preds[filter->n_preds] = pred; - filter->n_preds++; - list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) @@ -643,6 +640,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, } replace_filter_string(call->filter, filter_string); } + + filter->preds[filter->n_preds] = pred; + filter->n_preds++; out: return err; } @@ -1034,9 +1034,12 @@ static int replace_preds(struct event_subsystem *system, if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); - } else + } else { err = filter_add_subsystem_pred(ps, system, pred, filter_string); + if (err) + filter_free_pred(pred); + } if (err) return err; @@ -1055,9 +1058,12 @@ static int replace_preds(struct event_subsystem *system, if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); - } else + } else { err = filter_add_subsystem_pred(ps, system, pred, filter_string); + if (err) + filter_free_pred(pred); + } if (err) return err; -- cgit v1.1 From 17d42c1c497aa54952b9e58c1502a46f0df40315 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 6 Aug 2009 16:03:30 -0700 Subject: posix_cpu_timers_exit_group(): Do not use thread_group_cputimer() When the process exits we don't have to run new cputimer nor use running one (as it not accounts when tsk->exit_state != 0) to get process CPU times. As there is only one thread we can just use CPU times fields from task and signal structs. Signed-off-by: Stanislaw Gruszka Cc: Peter Zijlstra Cc: Roland McGrath Cc: Vitaly Mayatskikh Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bece7c0..e33a21c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk) } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - struct task_cputime cputime; + struct signal_struct *const sig = tsk->signal; - thread_group_cputimer(tsk, &cputime); cleanup_timers(tsk->signal->cpu_timers, - cputime.utime, cputime.stime, cputime.sum_exec_runtime); + cputime_add(tsk->utime, sig->utime), + cputime_add(tsk->stime, sig->stime), + tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) -- cgit v1.1 From 3a6593050fbd8bbcaed3a44d01c31d907315c86c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 17:34:57 +0200 Subject: perf_counter, ftrace: Fix perf_counter integration Adds possible second part to the assign argument of TP_EVENT(). TP_perf_assign( __perf_count(foo); __perf_addr(bar); ) Which, when specified make the swcounter increment with @foo instead of the usual 1, and report @bar for PERF_SAMPLE_ADDR (data address associated with the event) when this triggers a counter overflow. 
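Put into a full event definition, the new section slots in right after TP_fast_assign(). A sketch following the form quoted above, using a made-up event (the tracepoint, its fields and the byte counting are all illustrative):

TRACE_EVENT(foo_write,

        TP_PROTO(unsigned long addr, unsigned long bytes),

        TP_ARGS(addr, bytes),

        TP_STRUCT__entry(
                __field(        unsigned long,  addr    )
                __field(        unsigned long,  bytes   )
        ),

        TP_fast_assign(
                __entry->addr   = addr;
                __entry->bytes  = bytes;
        )
        TP_perf_assign(
                __perf_count(bytes);    /* swcounter advances by 'bytes', not 1 */
                __perf_addr(addr);      /* reported via PERF_SAMPLE_ADDR        */
        ),

        TP_printk("addr=%lx bytes=%lu", __entry->addr, __entry->bytes)
);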
Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 673c1aa..52eb4b6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3703,17 +3703,17 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id) +void perf_tpcounter_event(int event_id, u64 addr, u64 count) { struct perf_sample_data data = { .regs = get_irq_regs(), - .addr = 0, + .addr = addr, }; if (!data.regs) data.regs = task_pt_regs(current); - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); + do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); } EXPORT_SYMBOL_GPL(perf_tpcounter_event); -- cgit v1.1 From f413cdb80ce00ec1a4d0ab949b5d96c81cae7f75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Aug 2009 01:25:54 +0200 Subject: perf_counter: Fix/complete ftrace event records sampling This patch implements the kernel side support for ftrace event record sampling. A new counter sampling attribute is added: PERF_SAMPLE_TP_RECORD which requests ftrace events record sampling. In this case if a PERF_TYPE_TRACEPOINT counter is active and a tracepoint fires, we emit the tracepoint binary record to the perfcounter event buffer, as a sample. Result, after setting PERF_SAMPLE_TP_RECORD attribute from perf record: perf record -f -F 1 -a -e workqueue:workqueue_execution perf report -D 0x21e18 [0x48]: event: 9 . . ... raw event: size 72 bytes . 0000: 09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff ......H........ . 0010: 0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00 ........!...... . 0020: 2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e +...........eve . 0030: 74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00 ts/1........... . 0040: e0 b1 31 81 ff ff ff ff ....... . 0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33 The raw ftrace binary record starts at offset 0020. Translation: struct trace_entry { type = 0x2b = 43; flags = 1; preempt_count = 2; pid = 0xa = 10; tgid = 0xa = 10; } thread_comm = "events/1" thread_pid = 0xa = 10; func = 0xffffffff8131b1e0 = flush_to_ldisc() What will come next? - Userspace support ('perf trace'), 'flight data recorder' mode for perf trace, etc. - The unconditional copy from the profiling callback brings some costs however if someone wants no such sampling to occur, and needs to be fixed in the future. For that we need to have an instant access to the perf counter attribute. This is a matter of a flag to add in the struct ftrace_event. - Take care of the events recursivity! Don't ever try to record a lock event for example, it seems some locking is used in the profiling fast path and lead to a tracing recursivity. That will be fixed using raw spinlock or recursivity protection. - [...] - Profit! 
:-) Signed-off-by: Frederic Weisbecker Cc: Li Zefan Cc: Tom Zanussi Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Steven Rostedt Cc: Paul Mackerras Cc: Pekka Enberg Cc: Gabriel Munteanu Cc: Lai Jiangshan Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 18 +++++++++++++++++- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 4 ---- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 52eb4b6..8681021 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,6 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; + struct perf_tracepoint_record *tp; int callchain_size = 0; u64 time; struct { @@ -2714,6 +2715,11 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } + if (sample_type & PERF_SAMPLE_TP_RECORD) { + tp = data->private; + header.size += tp->size; + } + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -2777,6 +2783,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } + if (sample_type & PERF_SAMPLE_TP_RECORD) + perf_output_copy(&handle, tp->record, tp->size); + perf_output_end(&handle); } @@ -3703,11 +3712,18 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count) +void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) { + struct perf_tracepoint_record tp = { + .size = entry_size, + .record = record, + }; + struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, + .private = &tp, }; if (!data.regs) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8930e39..c22b40f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); } +EXPORT_SYMBOL_GPL(tracing_generic_entry_update); struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, int type, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3548ae5..8b9f4f6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); -void tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags, - int pc); - void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); -- cgit v1.1 From 7b4b6658e152ed4568cfff48175d93645df081d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 09:29:32 +0200 Subject: perf_counter: Fix software counters for fast moving event sources Reimplement the software counters to deal with fast moving event sources (such as tracepoints). This means being able to generate multiple overflows from a single 'event' as well as support throttling. 
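The period bookkeeping behind that can be pictured with plain integers: period_left stays in [-sample_period, 0], and one large increment may cross zero several times, each crossing being one overflow to report (the kernel computes the count with a division instead of a loop). A stand-alone sketch:

#include <stdint.h>
#include <stdio.h>

static uint64_t add_and_count_overflows(int64_t *left, uint64_t nr, uint64_t period)
{
        uint64_t overflows = 0;

        *left += nr;
        while (*left >= 0) {            /* sign flip: at least one overflow */
                overflows++;
                *left -= period;        /* back into [-period, 0)           */
        }
        return overflows;
}

int main(void)
{
        int64_t left = -1000;           /* period_left, period = 1000 */

        /* one tracepoint hit worth 3500 units generates 3 overflows */
        printf("%llu overflows\n",
               (unsigned long long)add_and_count_overflows(&left, 3500, 1000));
        return 0;
}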
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 164 +++++++++++++++++++++++++++++--------------------- 1 file changed, 94 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8681021..615440a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3344,87 +3344,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, * Generic software counter infrastructure */ -static void perf_swcounter_update(struct perf_counter *counter) +/* + * We directly increment counter->count and keep a second value in + * counter->hw.period_left to count intervals. This period counter + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +static u64 perf_swcounter_set_period(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; - u64 prev, now; - s64 delta; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; again: - prev = atomic64_read(&hwc->prev_count); - now = atomic64_read(&hwc->count); - if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) - goto again; + old = val = atomic64_read(&hwc->period_left); + if (val < 0) + return 0; - delta = now - prev; + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); + return nr; } -static void perf_swcounter_set_period(struct perf_counter *counter) +static void perf_swcounter_overflow(struct perf_counter *counter, + int nmi, struct perf_sample_data *data) { struct hw_perf_counter *hwc = &counter->hw; - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->sample_period; + u64 overflow; - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - } + data->period = counter->hw.last_period; + overflow = perf_swcounter_set_period(counter); - if (unlikely(left <= 0)) { - left += period; - atomic64_add(period, &hwc->period_left); - hwc->last_period = period; - } + if (hwc->interrupts == MAX_INTERRUPTS) + return; - atomic64_set(&hwc->prev_count, -left); - atomic64_set(&hwc->count, -left); + for (; overflow; overflow--) { + if (perf_counter_overflow(counter, nmi, data)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + } } -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +static void perf_swcounter_unthrottle(struct perf_counter *counter) { - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct perf_counter *counter; - u64 period; - - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); - - data.addr = 0; - data.regs = get_irq_regs(); /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. + * Nothing to do, we already reset hwc->interrupts. 
*/ - if ((counter->attr.exclude_kernel || !data.regs) && - !counter->attr.exclude_user) - data.regs = task_pt_regs(current); +} - if (data.regs) { - if (perf_counter_overflow(counter, 0, &data)) - ret = HRTIMER_NORESTART; - } +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, + int nmi, struct perf_sample_data *data) +{ + struct hw_perf_counter *hwc = &counter->hw; - period = max_t(u64, 10000, counter->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + atomic64_add(nr, &counter->count); - return ret; -} + if (!hwc->sample_period) + return; -static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct perf_sample_data *data) -{ - data->period = counter->hw.last_period; + if (!data->regs) + return; - perf_swcounter_update(counter); - perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, data)) - /* soft-disable the counter */ - ; + if (!atomic64_add_negative(nr, &hwc->period_left)) + perf_swcounter_overflow(counter, nmi, data); } static int perf_swcounter_is_counting(struct perf_counter *counter) @@ -3488,15 +3482,6 @@ static int perf_swcounter_match(struct perf_counter *counter, return 1; } -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data) -{ - int neg = atomic64_add_negative(nr, &counter->hw.count); - - if (counter->hw.sample_period && !neg && data->regs) - perf_swcounter_overflow(counter, nmi, data); -} - static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_type_id type, u32 event, u64 nr, int nmi, @@ -3575,27 +3560,66 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi, static void perf_swcounter_read(struct perf_counter *counter) { - perf_swcounter_update(counter); } static int perf_swcounter_enable(struct perf_counter *counter) { - perf_swcounter_set_period(counter); + struct hw_perf_counter *hwc = &counter->hw; + + if (hwc->sample_period) { + hwc->last_period = hwc->sample_period; + perf_swcounter_set_period(counter); + } return 0; } static void perf_swcounter_disable(struct perf_counter *counter) { - perf_swcounter_update(counter); } static const struct pmu perf_ops_generic = { .enable = perf_swcounter_enable, .disable = perf_swcounter_disable, .read = perf_swcounter_read, + .unthrottle = perf_swcounter_unthrottle, }; /* + * hrtimer based swcounter callback + */ + +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct perf_counter *counter; + u64 period; + + counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); + counter->pmu->read(counter); + + data.addr = 0; + data.regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. 
+ */ + if ((counter->attr.exclude_kernel || !data.regs) && + !counter->attr.exclude_user) + data.regs = task_pt_regs(current); + + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, counter->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + +/* * Software counter: cpu wall time clock */ -- cgit v1.1 From 10b8e3066066708f304e0fc5cfe658e05abf943d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:35 +0200 Subject: perf_counter: Work around gcc warning by initializing tracepoint record unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Despite that the tracepoint record is always present when the PERF_SAMPLE_TP_RECORD flag is set, gcc raises a warning, thinking it might not be initialized: kernel/perf_counter.c: In function ‘perf_counter_output’: kernel/perf_counter.c:2650: warning: ‘tp’ may be used uninitialized in this function Then, initialize it to NULL and always check if it's not NULL before dereference it. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 615440a..117622c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp; + struct perf_tracepoint_record *tp = NULL; int callchain_size = 0; u64 time; struct { @@ -2717,7 +2717,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_TP_RECORD) { tp = data->private; - header.size += tp->size; + if (tp) + header.size += tp->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2783,7 +2784,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if (sample_type & PERF_SAMPLE_TP_RECORD) + if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) perf_output_copy(&handle, tp->record, tp->size); perf_output_end(&handle); -- cgit v1.1 From 3a43ce68ae1758fa6a839386025ef45acb6baa22 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:37 +0200 Subject: perf_counter: Fix tracepoint sampling to be part of generic sampling Based on Peter's comments, make tracepoint sampling generic just like all the other sampling bits are. This is a rename with no code changes: - PERF_SAMPLE_TP_RECORD to PERF_SAMPLE_RAW - struct perf_tracepoint_record to perf_raw_record We want the system in place that transport tracepoints raw samples events into the perf ring buffer to be generalized and usable by any type of counter. 
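On the consumer side the renamed section is self-describing: a 32-bit size followed by that many opaque record bytes (a later commit in this series makes the emitted layout unconditional and keeps it u64-padded). A hedged sketch of how a reader steps over it:

#include <stdint.h>

struct raw_blob {
        uint32_t size;          /* number of record bytes that follow */
        char     data[];
};

/*
 * Skip one PERF_SAMPLE_RAW section.  The producer is expected to keep
 * sizeof(size) + size a multiple of 8 so later sample fields stay
 * u64-aligned; this walker simply trusts that.
 */
static const void *skip_raw(const void *p)
{
        const struct raw_blob *raw = p;

        return (const char *)p + sizeof(raw->size) + raw->size;
}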
Reported-by; Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 117622c..0023105 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp = NULL; + struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2715,10 +2715,10 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_TP_RECORD) { - tp = data->private; - if (tp) - header.size += tp->size; + if (sample_type & PERF_SAMPLE_RAW) { + raw = data->raw; + if (raw) + header.size += raw->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2784,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) - perf_output_copy(&handle, tp->record, tp->size); + if ((sample_type & PERF_SAMPLE_RAW) && raw) + perf_output_copy(&handle, raw->data, raw->size); perf_output_end(&handle); } @@ -3740,15 +3740,15 @@ static const struct pmu perf_ops_task_clock = { void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { - struct perf_tracepoint_record tp = { + struct perf_raw_record raw = { .size = entry_size, - .record = record, + .data = record, }; struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, - .private = &tp, + .raw = &raw, }; if (!data.regs) -- cgit v1.1 From 3a80b4a3539696f4b0574876326860323035a302 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Aug 2009 19:49:01 +0200 Subject: perf_counter: Fix a race on perf_counter_ctx While extending perfcounters with BTS hw-tracing, Markus Metzger managed to trigger this warning: [ 995.557128] WARNING: at kernel/perf_counter.c:1191 __perf_counter_task_sched_out+0x48/0x6b() triggers because commit 9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 (perf_counter: Full task tracing) removed clearing of tsk->perf_counter_ctxp out from under ctx->lock which introduced a race (against perf_lock_task_context). Move it back and deal with the exit notification by explicitly passing along the former task context. 
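The race class is the familiar one of publishing NULL outside the lock while a reader dereferences under it; the rule the fix restores can be sketched stand-alone (hypothetical names, pthread flavour so it compiles on its own, whereas the kernel side uses the context spinlock and RCU):

#include <pthread.h>
#include <stddef.h>

struct ctx { int dummy; };

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ctx *task_ctx;            /* stands in for task->perf_counter_ctxp */

/*
 * Writer: detach the context.  Clearing the pointer must happen while
 * holding the same lock the readers take, otherwise a reader can still
 * observe the pointer after teardown has begun.
 */
static void detach_ctx(void)
{
        pthread_mutex_lock(&ctx_lock);
        task_ctx = NULL;
        pthread_mutex_unlock(&ctx_lock);
}

/* Reader: re-check the pointer only after taking the lock. */
static int use_ctx(void)
{
        int ok = 0;

        pthread_mutex_lock(&ctx_lock);
        if (task_ctx)
                ok = 1;                 /* safe to use *task_ctx here */
        pthread_mutex_unlock(&ctx_lock);
        return ok;
}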
Reported-by: Markus T Metzger Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1249667341.17467.5.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0023105..546e62d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2850,7 +2850,8 @@ perf_counter_read_event(struct perf_counter *counter, */ struct perf_task_event { - struct task_struct *task; + struct task_struct *task; + struct perf_counter_context *task_ctx; struct { struct perf_event_header header; @@ -2910,24 +2911,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx, static void perf_counter_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; + struct perf_counter_context *ctx = task_event->task_ctx; cpuctx = &get_cpu_var(perf_cpu_context); perf_counter_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); + if (!ctx) + ctx = rcu_dereference(task_event->task->perf_counter_ctxp); if (ctx) perf_counter_task_ctx(ctx, task_event); rcu_read_unlock(); } -static void perf_counter_task(struct task_struct *task, int new) +static void perf_counter_task(struct task_struct *task, + struct perf_counter_context *task_ctx, + int new) { struct perf_task_event task_event; @@ -2937,8 +2937,9 @@ static void perf_counter_task(struct task_struct *task, int new) return; task_event = (struct perf_task_event){ - .task = task, - .event = { + .task = task, + .task_ctx = task_ctx, + .event = { .header = { .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, .misc = 0, @@ -2956,7 +2957,7 @@ static void perf_counter_task(struct task_struct *task, int new) void perf_counter_fork(struct task_struct *task) { - perf_counter_task(task, 1); + perf_counter_task(task, NULL, 1); } /* @@ -4310,7 +4311,7 @@ void perf_counter_exit_task(struct task_struct *child) unsigned long flags; if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, 0); + perf_counter_task(child, NULL, 0); return; } @@ -4330,6 +4331,7 @@ void perf_counter_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. */ spin_lock(&child_ctx->lock); + child->perf_counter_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all @@ -4343,9 +4345,7 @@ void perf_counter_exit_task(struct task_struct *child) * won't get any samples after PERF_EVENT_EXIT. We can however still * get a few PERF_EVENT_READ events. */ - perf_counter_task(child, 0); - - child->perf_counter_ctxp = NULL; + perf_counter_task(child, child_ctx, 0); /* * We can recurse on the same lock type through: -- cgit v1.1 From beda2c7ea2c15ed01eef00a997d2b0496c3a502d Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Sun, 9 Aug 2009 15:34:39 -0700 Subject: futex: Update futex_q lock_ptr on requeue proxy lock futex_requeue() can acquire the lock on behalf of a waiter early on or during the requeue loop if it is uncontended or in the event of a lock steal or owner died. On wakeup, the waiter (in futex_wait_requeue_pi()) cleans up the pi_state owner using the lock_ptr to protect against concurrent access to the pi_state. 
The pi_state is hung off futex_q's on the requeue target futex hash bucket so the lock_ptr needs to be updated accordingly. The problem manifested by triggering the WARN_ON in lookup_pi_state() about the pid != pi_state->owner->pid. With this patch, the pi_state is properly guarded against concurrent access via the requeue target hb lock. The astute reviewer may notice that there is a window of time between when futex_requeue() unlocks the hb locks and when futex_wait_requeue_pi() will acquire hb2->lock. During this time the pi_state and uval are not in sync with the underlying rtmutex owner (but the uval does indicate there are waiters, so no atomic changes will occur in userspace). However, this is not a problem. Should a contending thread enter lookup_pi_state() and acquire hb2->lock before the ownership is fixed up, it will find the pi_state hung off a waiter's (possibly the pending owner's) futex_q and block on the rtmutex. Once futex_wait_requeue_pi() fixes up the owner, it will also move the pi_state from the old owner's task->pi_state_list to its own. v3: Fix plist lock name for application to mainline (rather than -rt) Compile tested against tip/v2.6.31-rc5. Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A7F4EFF.6090903@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0672ff8..8cc3ee1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1010,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue * q: the futex_q * key: the key of the requeue target futex + * hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the * target futex if it is uncontended or via a lock steal. Set the futex_q key * to the requeue target futex so the waiter can detect the wakeup on the right * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. Must be called with the q->lock_ptr held. + * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock + * to protect access to the pi_state to fixup the owner later. Must be called + * with both q->lock_ptr and hb->lock held. */ static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, + struct futex_hash_bucket *hb) { drop_futex_key_refs(&q->key); get_futex_key_refs(key); @@ -1030,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; + q->lock_ptr = &hb->lock; +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb->lock; +#endif + wake_up_state(q->task, TASK_NORMAL); } @@ -1088,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, set_waiters); if (ret == 1) - requeue_pi_wake_futex(top_waiter, key2); + requeue_pi_wake_futex(top_waiter, key2, hb2); return ret; } @@ -1273,7 +1282,7 @@ retry_private: this->task, 1); if (ret == 1) { /* We got the lock. 
*/ - requeue_pi_wake_futex(this, &key2); + requeue_pi_wake_futex(this, &key2, hb2); continue; } else if (ret) { /* -EDEADLK */ -- cgit v1.1 From a044560c3a1f0ad75ce685c1ed7604820b9ed319 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:16:52 +0200 Subject: perf_counter: Correct PERF_SAMPLE_RAW output PERF_SAMPLE_* output switches should unconditionally output the correct format, as they are the only way to unambiguously parse the PERF_EVENT_SAMPLE data. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896447.17467.74.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 546e62d..5229d16 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2716,9 +2715,15 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } if (sample_type & PERF_SAMPLE_RAW) { - raw = data->raw; - if (raw) - header.size += raw->size; + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header.size += size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_RAW) && raw) - perf_output_copy(&handle, raw->data, raw->size); + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(&handle, data->raw->size); + perf_output_copy(&handle, data->raw->data, data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(&handle, raw); + } + } perf_output_end(&handle); } -- cgit v1.1 From a4e95fc2cbb31d70a65beffeaf8773f881328c34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:20:12 +0200 Subject: perf_counter: Require CAP_SYS_ADMIN for raw tracepoint data Raw tracepoint data contains various kernel internals and data from other users, so restrict this to CAP_SYS_ADMIN. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896452.17467.75.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5229d16..b0b20a0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3787,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter) static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { + /* + * Raw tracepoint data is a severe data leak, only allow root to + * have these. 
+ */ + if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (ftrace_profile_enable(counter->attr.config)) return NULL; -- cgit v1.1 From 2fc391112fb6f3424435a3aa2fda887497b5f807 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 12:33:05 +0100 Subject: locking, sched: Give waitqueue spinlocks their own lockdep classes Give waitqueue spinlocks their own lockdep classes when they are initialised from init_waitqueue_head(). This means that struct wait_queue::func functions can operate on other waitqueues. This is used by CacheFiles to catch the page from a backing fs being unlocked and to wake up another thread to take a copy of it. Signed-off-by: Peter Zijlstra Signed-off-by: David Howells Tested-by: Takashi Iwai Cc: linux-cachefs@redhat.com Cc: torvalds@osdl.org Cc: akpm@linux-foundation.org LKML-Reference: <20090810113305.17284.81508.stgit@warthog.procyon.org.uk> Signed-off-by: Ingo Molnar --- kernel/wait.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/wait.c b/kernel/wait.c index ea7c3b4..c4bd3d8 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,13 +10,14 @@ #include #include -void init_waitqueue_head(wait_queue_head_t *q) +void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) { spin_lock_init(&q->lock); + lockdep_set_class(&q->lock, key); INIT_LIST_HEAD(&q->task_list); } -EXPORT_SYMBOL(init_waitqueue_head); +EXPORT_SYMBOL(__init_waitqueue_head); void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { -- cgit v1.1 From 4dc88029fd916b860ef063c40180aa604ce93494 Mon Sep 17 00:00:00 2001 From: Dinakar Guniguntala Date: Mon, 10 Aug 2009 18:31:42 +0530 Subject: futex: Fix compat_futex to be same as futex for REQUEUE_PI Need to add the REQUEUE_PI checks to the compat_sys_futex API as well to ensure 32-bit requeues work fine on a 64-bit system. Patch is against latest tip. Signed-off-by: Dinakar Guniguntala Cc: Darren Hart LKML-Reference: <20090810130142.GA23619@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex_compat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d607a5b..2357165 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { if (get_compat_timespec(&ts, utime)) return -EFAULT; if (!timespec_valid(&ts)) @@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, t = ktime_add_safe(ktime_get(), t); tp = &t; } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (int) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -- cgit v1.1 From 392741e0a4e17c82e3978b7fcbf04291294dc0a1 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 7 Aug 2009 15:20:48 -0700 Subject: futex: Fix handling of bad requeue syscall pairing If futex_requeue(requeue_pi=1) finds a futex_q that was created by a call other than futex_wait_requeue_pi(), the q.rt_waiter may be null.
If so, this will result in an oops from the following call graph: futex_requeue() rt_mutex_start_proxy_lock() task_blocks_on_rt_mutex() waiter->task dereference OOPS We currently WARN_ON() if this is detected; clearly this is inadequate. If we detect a mispairing in futex_requeue(), bail out, sending -EINVAL to user-space. V2: Fix parenthesis warnings. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Steven Rostedt Cc: John Kacur Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A7CA8C0.7010809@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 8cc3ee1..e18cfbd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1256,8 +1256,15 @@ retry_private: if (!match_futex(&this->key, &key1)) continue; - WARN_ON(!requeue_pi && this->rt_waiter); - WARN_ON(requeue_pi && !this->rt_waiter); + /* + * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * be paired with each other and no other futex ops. + */ + if ((requeue_pi && !this->rt_waiter) || + (!requeue_pi && this->rt_waiter)) { + ret = -EINVAL; + break; + } /* * Wake nr_wake waiters. For requeue_pi, if we acquired the -- cgit v1.1 From 39cbb602b543e477df71dca84b5b2e36f8bd29fc Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Fri, 7 Aug 2009 12:01:08 -0400 Subject: Remove double removal of blktrace directory commit fd51d251e4cdb21f68e9dbc4336514d64a105a79 Author: Stefan Raspl Date: Tue May 19 09:59:08 2009 +0200 blktrace: remove debugfs entries on bad path added an explicit invocation of debugfs_remove for bt->dir; in blk_remove_buf_file_callback we are also getting the directory removed. On occasion I am seeing memory corruption that I have bisected down to this commit. [The testing involves a (long) series of I/O benchmarks with blktrace invoked around the actual runs.] I believe that this committed patch is correct, but the problem actually lies in the code in blk_remove_buf_file_callback. With this patch I am able to consistently get complete runs whereas previously I could not get a single run to complete. The first part of the patch simply moves the debugfs_remove below the relay_close: the relay_close call will remove files under bt->dir, and so we should not remove the directory until all the files we created have been removed. (Note: This is not sufficient to fix the problem - the file system code has ref counts on the directory, so our invocation does not cause the directory to actually be removed. Nonetheless, we should not rely upon that feature.) Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1090b0a..7a34cb5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -267,8 +267,8 @@ static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); - debugfs_remove(bt->dir); relay_close(bt->rchan); + debugfs_remove(bt->dir); free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); @@ -378,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, static int blk_remove_buf_file_callback(struct dentry *dentry) { - struct dentry *parent = dentry->d_parent; debugfs_remove(dentry); - /* - * this will fail for all but the last file, but that is ok.
what we - * care about is the top level buts->name directory going away, when - * the last trace file is gone. Then we don't have to rmdir() that - * manually on trace stop, so it nicely solves the issue with - * force killing of running traces. - */ - - debugfs_remove(parent); return 0; } -- cgit v1.1 From 28402971d869e26271b25301011f667d3a5640c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Aug 2009 10:13:22 +0200 Subject: perf_counter: Provide hw_perf_counter_setup_online() APIs Provide weak aliases for hw_perf_counter_setup_online(). This is used by the BTS patches (for v2.6.32), but it interacts with fixes so propagate this upstream. (it has no effect as of yet) Also export perf_counter_output() to architecture code. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b0b20a0..e26d2fc 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -88,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_counter_setup(int cpu) { barrier(); } +void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } -static void perf_counter_output(struct perf_counter *counter, int nmi, +void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { int ret; @@ -4592,6 +4593,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_counter_init_cpu(cpu); break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_counter_setup_online(cpu); + break; + case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: perf_counter_exit_cpu(cpu); @@ -4616,6 +4622,8 @@ void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); } -- cgit v1.1 From bcfc2602e8541ac13b1def38e2591dca072cff7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 09:51:55 +0200 Subject: perf_counter: Fix swcounter context invariance perf_swcounter_is_counting() uses a lock, which means we cannot use swcounters from NMI or when holding that particular lock, this is unintended. The below removes the lock, this opens up race window, but not worse than the swcounters already experience due to RCU traversal of the context in perf_swcounter_ctx_event(). This also fixes the hard lockups while opening a lockdep tracepoint counter. 
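The lock-free test introduced by the patch below boils down to a short decision ladder. The standalone sketch that follows restates it outside the kernel, with simplified types and illustrative names rather than the real perf_counter structures:

#include <stdio.h>

enum counter_state { STATE_OFF, STATE_ERROR, STATE_INACTIVE, STATE_ACTIVE };

struct counter {
	enum counter_state state;
	int ctx_is_active;	/* is the owning context currently on a CPU? */
};

static int swcounter_is_counting(const struct counter *c)
{
	if (c->state == STATE_ACTIVE)
		return 1;	/* scheduled in: counting */
	if (c->state != STATE_INACTIVE)
		return 0;	/* off or in error: not counting */
	if (c->ctx_is_active)
		return 0;	/* context is on the CPU but we are not: group did not fit */
	return 1;		/* task is scheduled out: still count events aimed at it */
}

int main(void)
{
	struct counter c = { STATE_INACTIVE, 0 };

	printf("counting: %d\n", swcounter_is_counting(&c));	/* prints 1 */
	return 0;
}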
Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Paul Mackerras Cc: stephane eranian Cc: Corey J Ashford LKML-Reference: <1250149915.10001.66.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e26d2fc..3dd4339 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3444,40 +3444,32 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, static int perf_swcounter_is_counting(struct perf_counter *counter) { - struct perf_counter_context *ctx; - unsigned long flags; - int count; - + /* + * The counter is active, we're good! + */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) return 1; + /* + * The counter is off/error, not counting. + */ if (counter->state != PERF_COUNTER_STATE_INACTIVE) return 0; /* - * If the counter is inactive, it could be just because - * its task is scheduled out, or because it's in a group - * which could not go on the PMU. We want to count in - * the first case but not the second. If the context is - * currently active then an inactive software counter must - * be the second case. If it's not currently active then - * we need to know whether the counter was active when the - * context was last active, which we can determine by - * comparing counter->tstamp_stopped with ctx->time. - * - * We are within an RCU read-side critical section, - * which protects the existence of *ctx. + * The counter is inactive, if the context is active + * we're part of a group that didn't make it on the 'pmu', + * not counting. */ - ctx = counter->ctx; - spin_lock_irqsave(&ctx->lock, flags); - count = 1; - /* Re-check state now we have the lock */ - if (counter->state < PERF_COUNTER_STATE_INACTIVE || - counter->ctx->is_active || - counter->tstamp_stopped < ctx->time) - count = 0; - spin_unlock_irqrestore(&ctx->lock, flags); - return count; + if (counter->ctx->is_active) + return 0; + + /* + * We're inactive and the context is too, this means the + * task is scheduled out, we're counting events that happen + * to us, like migration events. + */ + return 1; } static int perf_swcounter_match(struct perf_counter *counter, -- cgit v1.1 From 3dab77fb1bf89664bb1c9544607159dcab6f7d57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 11:47:53 +0200 Subject: perf: Rework/fix the whole read vs group stuff Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce PERF_FORMAT_GROUP to deal with group reads in a more generic way. This allows you to get group reads out of read() as well. 
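A reader of the new format walks the buffer laid out by perf_counter_read_group() in the patch below: nr, the optional enabled/running times, then one { value [, id] } entry per counter. The userspace sketch here assumes that layout and uses made-up buffer contents purely for illustration:

#include <stdint.h>
#include <stdio.h>

static void parse_group_read(const uint64_t *buf, int has_enabled,
			     int has_running, int has_id)
{
	const uint64_t *p = buf;
	uint64_t nr = *p++;

	if (has_enabled)
		printf("time_enabled=%llu\n", (unsigned long long)*p++);
	if (has_running)
		printf("time_running=%llu\n", (unsigned long long)*p++);

	for (uint64_t i = 0; i < nr; i++) {
		uint64_t value = *p++;
		uint64_t id = has_id ? *p++ : 0;

		printf("counter %llu: value=%llu id=%llu\n",
		       (unsigned long long)i, (unsigned long long)value,
		       (unsigned long long)id);
	}
}

int main(void)
{
	/* nr=2, time_enabled, time_running, then (value, id) per counter */
	uint64_t buf[] = { 2, 1000, 900, 12345, 1, 6789, 2 };

	parse_group_read(buf, 1, 1, 1);
	return 0;
}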
Signed-off-by: Peter Zijlstra Cc: Corey J Ashford Cc: Paul Mackerras Cc: stephane eranian LKML-Reference: <20090813103655.117411814@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 274 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 202 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3dd4339..b8c6b97 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1692,7 +1692,32 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } -static u64 perf_counter_read_tree(struct perf_counter *counter) +static int perf_counter_read_size(struct perf_counter *counter) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_GROUP) { + nr += counter->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_counter_read_value(struct perf_counter *counter) { struct perf_counter *child; u64 total = 0; @@ -1704,14 +1729,96 @@ static u64 perf_counter_read_tree(struct perf_counter *counter) return total; } +static int perf_counter_read_entry(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static int perf_counter_read_group(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + struct perf_counter *leader = counter->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_counter_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + err = perf_counter_read_entry(counter, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_counter_read_one(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + /* * Read the performance counter - simple non blocking version for now */ 
static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 values[4]; - int n; + u64 read_format = counter->attr.read_format; + int ret; /* * Return end-of-file for a read on a counter that is in @@ -1721,28 +1828,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->state == PERF_COUNTER_STATE_ERROR) return 0; + if (count < perf_counter_read_size(counter)) + return -ENOSPC; + WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); - values[0] = perf_counter_read_tree(counter); - n = 1; - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - if (counter->attr.read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_counter_read_group(counter, read_format, buf); + else + ret = perf_counter_read_one(counter, read_format, buf); mutex_unlock(&counter->child_mutex); - if (count < n * sizeof(u64)) - return -EINVAL; - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; + return ret; } static ssize_t @@ -2631,6 +2728,79 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + u64 read_format = counter->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = atomic64_read(&counter->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. 
+ */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + struct perf_counter *leader = counter->group_leader, *sub; + u64 read_format = counter->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != counter) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + n = 0; + + if (sub != counter) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + if (counter->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, counter); + else + perf_output_read_one(handle, counter); +} + void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { @@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_counter *counter, int nmi, struct { u32 pid, tid; } tid_entry; - struct { - u64 id; - u64 counter; - } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; u64 time; @@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) header.size += sizeof(u64); - if (sample_type & PERF_SAMPLE_GROUP) { - header.size += sizeof(u64) + - counter->nr_siblings * sizeof(group_entry); - } + if (sample_type & PERF_SAMPLE_READ) + header.size += perf_counter_read_size(counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { callchain = perf_callchain(data->regs); @@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) perf_output_put(&handle, data->period); - /* - * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 
- */ - if (sample_type & PERF_SAMPLE_GROUP) { - struct perf_counter *leader, *sub; - u64 nr = counter->nr_siblings; - - perf_output_put(&handle, nr); - - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->pmu->read(sub); - - group_entry.id = primary_counter_id(sub); - group_entry.counter = atomic64_read(&sub->count); - - perf_output_put(&handle, group_entry); - } - } + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(&handle, counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { if (callchain) @@ -2818,8 +2964,6 @@ struct perf_read_event { u32 pid; u32 tid; - u64 value; - u64 format[3]; }; static void @@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_counter *counter, .header = { .type = PERF_EVENT_READ, .misc = 0, - .size = sizeof(event) - sizeof(event.format), + .size = sizeof(event) + perf_counter_read_size(counter), }, .pid = perf_counter_pid(counter, task), .tid = perf_counter_tid(counter, task), - .value = atomic64_read(&counter->count), }; - int ret, i = 0; - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_enabled; - } - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_running; - } - - if (counter->attr.read_format & PERF_FORMAT_ID) { - event.header.size += sizeof(u64); - event.format[i++] = primary_counter_id(counter); - } + int ret; ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); if (ret) return; - perf_output_copy(&handle, &event, event.header.size); + perf_output_put(&handle, event); + perf_output_read(&handle, counter); + perf_output_end(&handle); } @@ -3921,9 +4051,9 @@ perf_counter_alloc(struct perf_counter_attr *attr, atomic64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_SAMPLE_GROUP on inherited counters + * we currently do not support PERF_FORMAT_GROUP on inherited counters */ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; switch (attr->type) { -- cgit v1.1 From 970892a9031a5dc7217bd394fb9d89fa75a4a7bc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 11:47:54 +0200 Subject: perf_counter: Fix an ipi-deadlock perf_pending_counter() is called from IRQ context and will call perf_counter_disable(), however perf_counter_disable() uses smp_call_function_single() which doesn't fancy being used with IRQs disabled due to IPI deadlocks. Fix this by making it use the local __perf_counter_disable() call and teaching the counter_sched_out() code about pending disables as well. This should cover the case where a counter migrates before the pending queue gets processed. 
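The essence of the fix is a deferred-disable handshake: interrupt context only marks the counter, and the scheduling path consumes the mark, so no cross-CPU call is issued with interrupts off. A toy model of that handshake, in plain C with illustrative names rather than the real kernel structures:

#include <stdio.h>

enum counter_state { STATE_OFF, STATE_INACTIVE, STATE_ACTIVE };

struct counter {
	enum counter_state state;
	int pending_disable;
};

static void irq_context_disable(struct counter *c)
{
	/* cheap and IPI-free: just record the request */
	c->pending_disable = 1;
}

static void counter_sched_out(struct counter *c)
{
	c->state = STATE_INACTIVE;
	if (c->pending_disable) {	/* honour a disable that raced with us */
		c->pending_disable = 0;
		c->state = STATE_OFF;
	}
}

int main(void)
{
	struct counter c = { STATE_ACTIVE, 0 };

	irq_context_disable(&c);
	counter_sched_out(&c);
	printf("final state: %s\n", c.state == STATE_OFF ? "OFF" : "not off");
	return 0;
}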
Signed-off-by: Peter Zijlstra Cc: Corey J Ashford Cc: Paul Mackerras Cc: stephane eranian LKML-Reference: <20090813103655.244097721@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b8c6b97..3f841be 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -307,6 +307,10 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; + if (counter->pending_disable) { + counter->pending_disable = 0; + counter->state = PERF_COUNTER_STATE_OFF; + } counter->tstamp_stopped = ctx->time; counter->pmu->disable(counter); counter->oncpu = -1; @@ -2343,7 +2347,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry) if (counter->pending_disable) { counter->pending_disable = 0; - perf_counter_disable(counter); + __perf_counter_disable(counter); } if (counter->pending_wakeup) { -- cgit v1.1 From 94d5d1b2d891f1fd5205f978246b7864d998b25c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 16:14:42 +0200 Subject: perf_counter: Report the cloning task as parent on perf_counter_fork() A bug in (9f498cc: perf_counter: Full task tracing) makes profiling multi-threaded apps go belly up. [ output as: (PID:TID):(PPID:PTID) ] # ./perf report -D | grep FORK 0x4b0 [0x18]: PERF_EVENT_FORK: (3237:3237):(3236:3236) 0xa10 [0x18]: PERF_EVENT_FORK: (3237:3238):(3236:3236) 0xa70 [0x18]: PERF_EVENT_FORK: (3237:3239):(3236:3236) 0xad0 [0x18]: PERF_EVENT_FORK: (3237:3240):(3236:3236) 0xb18 [0x18]: PERF_EVENT_FORK: (3237:3241):(3236:3236) This shows us that the test (27d028d perf report: Update for the new FORK/EXIT events) in builtin-report.c: /* * A thread clone will have the same PID for both * parent and child. */ if (thread == parent) return 0; will clearly fail. The problem is that perf_counter_fork() reports the actual parent, instead of the cloning thread. Fixing that (with the below patch), yields: # ./perf report -D | grep FORK 0x4c8 [0x18]: PERF_EVENT_FORK: (1590:1590):(1589:1589) 0xbd8 [0x18]: PERF_EVENT_FORK: (1590:1591):(1590:1590) 0xc80 [0x18]: PERF_EVENT_FORK: (1590:1592):(1590:1590) 0x3338 [0x18]: PERF_EVENT_FORK: (1590:1593):(1590:1590) 0x66b0 [0x18]: PERF_EVENT_FORK: (1590:1594):(1590:1590) Which both makes more sense and doesn't confuse perf report anymore.
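The identity scheme the report relies on, namely that every thread of a process shares the PID while carrying its own TID, can be demonstrated with a few lines of userspace code (gettid() goes through syscall() here since older glibc has no wrapper):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static pid_t gettid_wrap(void)
{
	return (pid_t)syscall(SYS_gettid);
}

static void *worker(void *arg)
{
	(void)arg;
	/* same PID as main, different TID */
	printf("child thread: pid=%d tid=%d\n", getpid(), gettid_wrap());
	return NULL;
}

int main(void)
{
	pthread_t t;

	printf("main thread:  pid=%d tid=%d\n", getpid(), gettid_wrap());
	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);
	return 0;
}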
Reported-by: Pekka Enberg Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: Anton Blanchard Cc: Arjan van de Ven LKML-Reference: <1250172882.5241.62.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3f841be..534e20d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3028,10 +3028,10 @@ static void perf_counter_task_output(struct perf_counter *counter, return; task_event->event.pid = perf_counter_pid(counter, task); - task_event->event.ppid = perf_counter_pid(counter, task->real_parent); + task_event->event.ppid = perf_counter_pid(counter, current); task_event->event.tid = perf_counter_tid(counter, task); - task_event->event.ptid = perf_counter_tid(counter, task->real_parent); + task_event->event.ptid = perf_counter_tid(counter, current); perf_output_put(&handle, task_event->event); perf_output_end(&handle); -- cgit v1.1 From 2d860ad76f4ee4d2eba0fe3797c8d7cdce432cc0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Aug 2009 13:05:10 -0700 Subject: genirq: prevent wakeup of freed irq thread free_irq() can remove an irqaction while the corresponding interrupt is in progress, but free_irq() sets action->thread to NULL unconditionally, which might lead to a NULL pointer dereference in handle_IRQ_event() when the hard interrupt context tries to wake up the handler thread. Prevent this by moving the thread stop after synchronize_irq(). No need to set action->thread to NULL either as action is going to be freed anyway. This fixes a boot crash reported against preempt-rt which uses the mainline irq threads code to implement full irq threading. 
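The ordering rule the fix enforces, quiesce whatever can still wake the handler thread before stopping and reaping it, has a simple userspace analogue; the sketch below is only an analogy, not the kernel code:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int stop;

static void *handler_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!stop)
		pthread_cond_wait(&cond, &lock);	/* woken by the "hard irq" path */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t thread;

	pthread_create(&thread, NULL, handler_thread, NULL);

	/* teardown: first make sure no wakeup source can race with us ... */
	pthread_mutex_lock(&lock);
	stop = 1;				/* ... then tell the thread to exit */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);

	pthread_join(thread, NULL);		/* reap only after quiescing */
	printf("handler thread stopped safely\n");
	return 0;
}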
[ tglx: removed local irqthread variable ] Signed-off-by: Linus Torvalds Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 61c679d..d222515 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -761,7 +761,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; - struct task_struct *irqthread; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -809,9 +808,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) desc->chip->disable(irq); } - irqthread = action->thread; - action->thread = NULL; - spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -819,12 +815,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Make sure it's not being used on another CPU: */ synchronize_irq(irq); - if (irqthread) { - if (!test_bit(IRQTF_DIED, &action->thread_flags)) - kthread_stop(irqthread); - put_task_struct(irqthread); - } - #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ @@ -840,6 +830,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) local_irq_restore(flags); } #endif + + if (action->thread) { + if (!test_bit(IRQTF_DIED, &action->thread_flags)) + kthread_stop(action->thread); + put_task_struct(action->thread); + } + return action; } -- cgit v1.1 From 788084aba2ab7348257597496befcbccabdc98a3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 31 Jul 2009 12:54:11 -0400 Subject: Security/SELinux: separate lsm specific mmap_min_addr Currently SELinux enforcement of controls on the ability to map low memory is determined by the mmap_min_addr tunable. This patch causes SELinux to ignore the tunable and instead use a separate Kconfig option specific to how much space the LSM should protect. The tunable will now only control the need for CAP_SYS_RAWIO, and SELinux permissions will always protect the amount of low memory designated by CONFIG_LSM_MMAP_MIN_ADDR. This allows users who need to disable the mmap_min_addr controls (usual reason being they run WINE as a non-root user) to do so and still have SELinux controls preventing confined domains (like a web server) from being able to map some area of low memory. Signed-off-by: Eric Paris Signed-off-by: James Morris --- kernel/sysctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 98e0232..58be760 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -1306,10 +1307,10 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "mmap_min_addr", - .data = &mmap_min_addr, - .maxlen = sizeof(unsigned long), + .data = &dac_mmap_min_addr, + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &mmap_min_addr_handler, }, #ifdef CONFIG_NUMA { -- cgit v1.1 From e1ac3614ff606ae03677f47459113f98a19af63c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 14 Aug 2009 15:39:10 +1000 Subject: perf_counter: Check task on counter read IPI In general, code in perf_counter.c that is called through an IPI checks, for per-task counters, that the counter's task is still the current task.
This is to handle the race condition where the cpu switches from the task we want to another task in the interval between sending the IPI and the IPI arriving and being handled on the target CPU. For some reason, __perf_counter_read is missing this check, yet there is no reason why the race condition can't occur. This adds a check that the current task is the one we want. If it isn't, we just return. In that case the counter->count value should be up to date, since it will have been updated when the counter was scheduled out, which must have happened since the IPI was sent. I don't have an example of an actual failure due to this race, but it seems obvious that it could occur and we need to guard against it. Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <19076.63614.277861.368125@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 534e20d..b8fe739 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1503,10 +1503,21 @@ static void perf_counter_enable_on_exec(struct task_struct *task) */ static void __perf_counter_read(void *info) { + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; unsigned long flags; + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. In that case + * counter->count would have been updated to a recent sample + * when the counter was scheduled out. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); -- cgit v1.1 From f738eb1b63edf664da1b4ac76895d988749b2f07 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 18 Aug 2009 11:32:24 +0200 Subject: perf_counter: Fix the PARISC build PARISC does not build: /home/mingo/tip/kernel/perf_counter.c: In function 'perf_counter_index': /home/mingo/tip/kernel/perf_counter.c:2016: error: 'PERF_COUNTER_INDEX_OFFSET' undeclared (first use in this function) /home/mingo/tip/kernel/perf_counter.c:2016: error: (Each undeclared identifier is reported only once /home/mingo/tip/kernel/perf_counter.c:2016: error: for each function it appears in.) As PERF_COUNTER_INDEX_OFFSET is not defined. Now, we could define it in the architecture - but lets also provide a core default of 0 (which happens to be what all but one architecture uses at the moment). Architectures that need a different index offset should set this value in their asm/perf_counter.h files. 
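The core default added below follows the usual override pattern: an architecture header gets the first chance to define the macro, and everybody else silently picks up 0. The pattern in isolation, outside the kernel, looks like this:

#include <stdio.h>

/* an <asm/perf_counter.h> style header would be included here and may
 * define the macro; if it does not, the core default applies */
#ifndef PERF_COUNTER_INDEX_OFFSET
# define PERF_COUNTER_INDEX_OFFSET 0
#endif

int main(void)
{
	printf("index offset in use: %d\n", PERF_COUNTER_INDEX_OFFSET);
	return 0;
}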
Cc: Kyle McMartin Cc: Helge Deller Cc: linux-parisc@vger.kernel.org Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b8fe739..36f65e2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2019,6 +2019,10 @@ int perf_counter_task_disable(void) return 0; } +#ifndef PERF_COUNTER_INDEX_OFFSET +# define PERF_COUNTER_INDEX_OFFSET 0 +#endif + static int perf_counter_index(struct perf_counter *counter) { if (counter->state != PERF_COUNTER_STATE_ACTIVE) -- cgit v1.1 From 69ab849439b506cd8dd2879527fdb64d95dd5211 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Aug 2009 14:07:16 +0200 Subject: genirq: Wake up irq thread after action has been installed The wake_up_process() of the new irq thread in __setup_irq() is too early, as the irqaction is not yet fully initialized; in particular, action->irq is not yet set. The interrupt thread might dereference the wrong irq descriptor. Move the wakeup after the action is installed and action->irq has been set. Reported-by: Michael Buesch Signed-off-by: Thomas Gleixner Tested-by: Michael Buesch --- kernel/irq/manage.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d222515..0ec9ed8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -607,7 +607,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ get_task_struct(t); new->thread = t; - wake_up_process(t); } /* @@ -690,6 +689,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) (int)(new->flags & IRQF_TRIGGER_MASK)); } + new->irq = irq; *old_ptr = new; /* Reset broken irq detection when installing new handler */ @@ -707,7 +707,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) spin_unlock_irqrestore(&desc->lock, flags); - new->irq = irq; + /* + * Strictly no need to wake it up, but hung_task complains + * when no hard interrupt wakes the thread up. + */ + if (new->thread) + wake_up_process(new->thread); + register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); -- cgit v1.1 From 0753ba01e126020bf0f8150934903b48935b697d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 18 Aug 2009 14:11:10 -0700 Subject: mm: revert "oom: move oom_adj value" The commit 2ff05b2b (oom: move oom_adj value) moved the oom_adj value to the mm_struct. It was a very good first step toward sanitizing OOM handling. However, Paul Menage reported that the commit causes a regression in his job scheduler: the current OOM logic can kill an OOM_DISABLED process. Why? His program has code similar to the following. ... set_oom_adj(OOM_DISABLE); /* The job scheduler is never killed by oom */ ... if (vfork() == 0) { set_oom_adj(0); /* Invoked child can be killed */ execve("foo-bar-cmd"); } .... The vfork() parent and child share the same mm_struct, so the set_oom_adj(0) above doesn't only change oom_adj for the vfork() child, it also changes oom_adj for the vfork() parent. The vfork() parent (the job scheduler) then lost its OOM immunity and was killed. The fork-setting-exec idiom is very frequently used in userland programs; we must not break this assumption. This patch therefore reverts commit 2ff05b2b and related commits.
Reverted commit list --------------------- - commit 2ff05b2b4e (oom: move oom_adj value from task_struct to mm_struct) - commit 4d8b9135c3 (oom: avoid unnecessary mm locking and scanning for OOM_DISABLE) - commit 8123681022 (oom: only oom kill exiting tasks with attached memory) - commit 933b787b57 (mm: copy over oom_adj value at fork time) Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Nick Piggin Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 021e113..144326b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -426,7 +426,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; - mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); -- cgit v1.1
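For reference, a fleshed-out version of the job-scheduler pseudo-code quoted in the oom_adj revert above; set_oom_adj() is a local helper that writes /proc/self/oom_adj, OOM_DISABLE is the legacy -17 value, and doing more than exec/_exit in a vfork() child is shown here only to mirror the reported scenario, not as recommended practice.

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#define OOM_DISABLE (-17)	/* legacy /proc/<pid>/oom_adj value */

/* local helper, not a library call: set oom_adj for the calling task */
static void set_oom_adj(int value)
{
	char buf[16];
	int len;
	int fd = open("/proc/self/oom_adj", O_WRONLY);

	if (fd < 0)
		return;
	len = snprintf(buf, sizeof(buf), "%d\n", value);
	if (write(fd, buf, len) < 0)
		perror("write oom_adj");
	close(fd);
}

int main(void)
{
	pid_t pid;

	set_oom_adj(OOM_DISABLE);	/* the scheduler asks for OOM immunity */

	pid = vfork();
	if (pid == 0) {
		/* with a per-mm oom_adj this write also hit the suspended
		 * parent, because vfork() shares the mm until exec */
		set_oom_adj(0);
		execlp("true", "true", (char *)NULL);
		_exit(127);
	}
	waitpid(pid, NULL, 0);
	return 0;
}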