From e7e2ee89a9dbf48d70a922d5625cd7320a27cbff Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 10 May 2011 13:27:21 -0700 Subject: tracing: Schedule a delayed work to call wakeup() In using syscall tracing by concurrent processes, the wakeup() that is called in the event commit function causes contention on the spin lock of the waitqueue. I enabled sys_enter_getuid and sys_exit_getuid tracepoints, and by running getuid_microbench from autotest in parallel I found that the contention causes exponential latency increase in the tracing path. The autotest binary getuid_microbench calls getuid() in a tight loop for the given number of iterations and measures the average time required to complete a single invocation of syscall. The patch schedules a delayed work after 2 ms once an event commit calls to wake up the trace wait_queue. This removes the delay caused by contention on spin lock in wakeup() and amortizes the wakeup() calls scheduled over the 2 ms period. In the following example, the script enables the sys_enter_getuid and sys_exit_getuid tracepoints and runs the getuid_microbench in parallel with the given number of processes. The output clearly shows the latency increase caused by contentions. $ ~/getuid.sh 1 1000000 calls in 0.720974253 s (720.974253 ns/call) $ ~/getuid.sh 2 1000000 calls in 1.166457554 s (1166.457554 ns/call) 1000000 calls in 1.168933765 s (1168.933765 ns/call) $ ~/getuid.sh 3 1000000 calls in 1.783827516 s (1783.827516 ns/call) 1000000 calls in 1.795553270 s (1795.553270 ns/call) 1000000 calls in 1.796493376 s (1796.493376 ns/call) $ ~/getuid.sh 4 1000000 calls in 4.483041796 s (4483.041796 ns/call) 1000000 calls in 4.484165388 s (4484.165388 ns/call) 1000000 calls in 4.484850762 s (4484.850762 ns/call) 1000000 calls in 4.485643576 s (4485.643576 ns/call) $ ~/getuid.sh 5 1000000 calls in 6.497521653 s (6497.521653 ns/call) 1000000 calls in 6.502000236 s (6502.000236 ns/call) 1000000 calls in 6.501709115 s (6501.709115 ns/call) 1000000 calls in 6.502124100 s (6502.124100 ns/call) 1000000 calls in 6.502936358 s (6502.936358 ns/call) After the patch, the latencies scale better. 1000000 calls in 0.728720455 s (728.720455 ns/call) 1000000 calls in 0.842782857 s (842.782857 ns/call) 1000000 calls in 0.883803135 s (883.803135 ns/call) 1000000 calls in 0.902077764 s (902.077764 ns/call) 1000000 calls in 0.902838202 s (902.838202 ns/call) 1000000 calls in 0.908896885 s (908.896885 ns/call) 1000000 calls in 0.932523515 s (932.523515 ns/call) 1000000 calls in 0.958009672 s (958.009672 ns/call) 1000000 calls in 0.986188020 s (986.188020 ns/call) 1000000 calls in 0.989771102 s (989.771102 ns/call) 1000000 calls in 0.933518391 s (933.518391 ns/call) 1000000 calls in 0.958897947 s (958.897947 ns/call) 1000000 calls in 1.031038897 s (1031.038897 ns/call) 1000000 calls in 1.089516025 s (1089.516025 ns/call) 1000000 calls in 1.141998347 s (1141.998347 ns/call) Signed-off-by: Vaibhav Nagarnaik Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Michael Rubin Cc: David Sharp Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1305059241-7629-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ee9c921..71777c8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); +static void wakeup_work_handler(struct work_struct *work) +{ + wake_up(&trace_wait); +} + +static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); + /** * trace_wake_up - wake up tasks waiting for trace input * - * Simply wakes up any task that is blocked on the trace_wait - * queue. These is used with trace_poll for tasks polling the trace. + * Schedules a delayed work to wake up any task that is blocked on the + * trace_wait queue. These is used with trace_poll for tasks polling the + * trace. */ void trace_wake_up(void) { - int cpu; + const unsigned long delay = msecs_to_jiffies(2); if (trace_flags & TRACE_ITER_BLOCK) return; - /* - * The runqueue_is_locked() can fail, but this is the best we - * have for now: - */ - cpu = get_cpu(); - if (!runqueue_is_locked(cpu)) - wake_up(&trace_wait); - put_cpu(); + schedule_delayed_work(&wakeup_work, delay); } static int __init set_buf_size(char *str) -- cgit v1.1 From 7ea5906405a1f3fc1c0033dfd7e02f2cfd1de5e5 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 3 May 2011 17:56:42 -0700 Subject: tracing: Use NUMA allocation for per-cpu ring buffer pages The tracing ring buffer is a group of per-cpu ring buffers where allocation and logging is done on a per-cpu basis. The events that are generated on a particular CPU are logged in the corresponding buffer. This is to provide wait-free writes between CPUs and good NUMA node locality while accessing the ring buffer. However, the allocation routines consider NUMA locality only for buffer page metadata and not for the actual buffer page. This causes the pages to be allocated on the NUMA node local to the CPU where the allocation routine is running at the time. This patch fixes the problem by using a NUMA node specific allocation routine so that the pages are allocated from a NUMA node local to the logging CPU. I tested with the getuid_microbench from autotest. It is a simple binary that calls getuid() in a loop and measures the average time for the syscall to complete. The following command was used to test: $ getuid_microbench 1000000 Compared the numbers found on kernel with and without this patch and found that logging latency decreases by 30-50 ns/call. tracing with non-NUMA allocation - 569 ns/call tracing with NUMA allocation - 512 ns/call Signed-off-by: Vaibhav Nagarnaik Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Michael Rubin Cc: David Sharp Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 36 +++++++++++++++++++----------------- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 7 +++---- 3 files changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b0c7aa4..725153d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -997,13 +997,14 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { struct buffer_page *bpage, *tmp; - unsigned long addr; LIST_HEAD(pages); unsigned i; WARN_ON(!nr_pages); for (i = 0; i < nr_pages; i++) { + struct page *page; + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); if (!bpage) @@ -1013,10 +1014,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, &pages); - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), + GFP_KERNEL, 0); + if (!page) goto free_pages; - bpage->page = (void *)addr; + bpage->page = page_address(page); rb_init_page(bpage->page); } @@ -1045,7 +1047,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; - unsigned long addr; + struct page *page; int ret; cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), @@ -1067,10 +1069,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) rb_check_bpage(cpu_buffer, bpage); cpu_buffer->reader_page = bpage; - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) goto fail_free_reader; - bpage->page = (void *)addr; + bpage->page = page_address(page); rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -1314,7 +1316,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) unsigned nr_pages, rm_pages, new_pages; struct buffer_page *bpage, *tmp; unsigned long buffer_size; - unsigned long addr; LIST_HEAD(pages); int i, cpu; @@ -1375,16 +1376,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { + struct page *page; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!bpage) goto free_pages; list_add(&bpage->list, &pages); - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) goto free_pages; - bpage->page = (void *)addr; + bpage->page = page_address(page); rb_init_page(bpage->page); } } @@ -3730,16 +3732,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * Returns: * The page allocated, or NULL on error. */ -void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { struct buffer_data_page *bpage; - unsigned long addr; + struct page *page; - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) return NULL; - bpage = (void *)addr; + bpage = page_address(page); rb_init_page(bpage); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 302f8a6..a5457d5 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -106,7 +106,7 @@ static enum event_status read_page(int cpu) int inc; int i; - bpage = ring_buffer_alloc_read_page(buffer); + bpage = ring_buffer_alloc_read_page(buffer, cpu); if (!bpage) return EVENT_DROPPED; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 71777c8..61fda6b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3697,7 +3697,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return 0; if (!info->spare) - info->spare = ring_buffer_alloc_read_page(info->tr->buffer); + info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); if (!info->spare) return -ENOMEM; @@ -3854,7 +3854,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ref->ref = 1; ref->buffer = info->tr->buffer; - ref->page = ring_buffer_alloc_read_page(ref->buffer); + ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); if (!ref->page) { kfree(ref); break; @@ -3863,8 +3863,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, r = ring_buffer_read_page(ref->buffer, &ref->page, len, info->cpu, 1); if (r < 0) { - ring_buffer_free_read_page(ref->buffer, - ref->page); + ring_buffer_free_read_page(ref->buffer, ref->page); kfree(ref); break; } -- cgit v1.1 From 4f271a2a60c748599b30bb4dafff30d770439b96 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Mon, 13 Jun 2011 17:51:57 -0700 Subject: tracing: Add a proc file to stop tracing and free buffer The proc file entry buffer_size_kb is used to set the size of tracing buffer. The memory to expand the buffer size is kernel memory. Consider a use case where tracing is handled by a user space utility, which acts as a gate keeper for tracing requests. In an OOM condition, tracing is considered a low priority task and if the utility gets killed the ring buffer memory cannot be released back to the kernel. This patch adds a proc file called "free_buffer" whose purpose is to stop tracing and free up the ring buffer when it is closed. The user space process can then set the desired size in buffer_size_kb file and open the fd to the "free_buffer" file. Under OOM condition, if the process gets killed, the kernel closes the file descriptor. The release handler stops the tracing and releases the kernel memory automatically. Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Michael Rubin Cc: David Sharp Signed-off-by: Vaibhav Nagarnaik Link: http://lkml.kernel.org/r/1308012717-11148-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 108 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 35 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 61fda6b..9c557ae 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2768,7 +2768,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr) return t->init(tr); } -static int tracing_resize_ring_buffer(unsigned long size) +static int __tracing_resize_ring_buffer(unsigned long size) { int ret; @@ -2820,6 +2820,41 @@ static int tracing_resize_ring_buffer(unsigned long size) return ret; } +static ssize_t tracing_resize_ring_buffer(unsigned long size) +{ + int cpu, ret = size; + + mutex_lock(&trace_types_lock); + + tracing_stop(); + + /* disable all cpu buffers */ + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_inc(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_inc(&max_tr.data[cpu]->disabled); + } + + if (size != global_trace.entries) + ret = __tracing_resize_ring_buffer(size); + + if (ret < 0) + ret = -ENOMEM; + + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_dec(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_dec(&max_tr.data[cpu]->disabled); + } + + tracing_start(); + mutex_unlock(&trace_types_lock); + + return ret; +} + /** * tracing_update_buffers - used by tracing facility to expand ring buffers @@ -2837,7 +2872,7 @@ int tracing_update_buffers(void) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) - ret = tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(trace_buf_size); mutex_unlock(&trace_types_lock); return ret; @@ -2861,7 +2896,7 @@ static int tracing_set_tracer(const char *buf) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) { - ret = tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(trace_buf_size); if (ret < 0) goto out; ret = 0; @@ -3436,7 +3471,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; - int ret, cpu; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -3454,48 +3489,43 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, if (!val) return -EINVAL; - mutex_lock(&trace_types_lock); - - tracing_stop(); - - /* disable all cpu buffers */ - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_inc(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_inc(&max_tr.data[cpu]->disabled); - } - /* value is in KB */ val <<= 10; - if (val != global_trace.entries) { - ret = tracing_resize_ring_buffer(val); - if (ret < 0) { - cnt = ret; - goto out; - } - } + ret = tracing_resize_ring_buffer(val); + if (ret < 0) + return ret; *ppos += cnt; - /* If check pages failed, return ENOMEM */ - if (tracing_disabled) - cnt = -ENOMEM; - out: - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_dec(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_dec(&max_tr.data[cpu]->disabled); - } + return cnt; +} - tracing_start(); - mutex_unlock(&trace_types_lock); +static ssize_t +tracing_free_buffer_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + /* + * There is no need to read what the user has written, this function + * is just to make sure that there is no error when "echo" is used + */ + + *ppos += cnt; return cnt; } +static int +tracing_free_buffer_release(struct inode *inode, struct file *filp) +{ + /* disable tracing */ + tracing_off(); + /* resize the ring buffer to 0 */ + tracing_resize_ring_buffer(0); + + return 0; +} + static int mark_printk(const char *fmt, ...) { int ret; @@ -3641,6 +3671,11 @@ static const struct file_operations tracing_entries_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations tracing_free_buffer_fops = { + .write = tracing_free_buffer_write, + .release = tracing_free_buffer_release, +}; + static const struct file_operations tracing_mark_fops = { .open = tracing_open_generic, .write = tracing_mark_write, @@ -4365,6 +4400,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("buffer_size_kb", 0644, d_tracer, &global_trace, &tracing_entries_fops); + trace_create_file("free_buffer", 0644, d_tracer, + &global_trace, &tracing_free_buffer_fops); + trace_create_file("trace_marker", 0220, d_tracer, NULL, &tracing_mark_fops); -- cgit v1.1 From cf30cf67d6c7592c670ec946d89fc15ee0deb0eb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Jun 2011 22:44:07 -0400 Subject: tracing: Add disable_on_free option Add a trace option to disable tracing on free. When this option is set, a write into the free_buffer file will not only shrink the ring buffer down to zero, but it will also disable tracing. Cc: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++++-- kernel/trace/trace.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9c557ae..42fdf3a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -425,6 +425,7 @@ static const char *trace_options[] = { "graph-time", "record-cmd", "overwrite", + "disable_on_free", NULL }; @@ -3518,8 +3519,9 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf, static int tracing_free_buffer_release(struct inode *inode, struct file *filp) { - /* disable tracing */ - tracing_off(); + /* disable tracing ? */ + if (trace_flags & TRACE_ITER_STOP_ON_FREE) + tracing_off(); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(0); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 229f859..742f545 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -609,6 +609,7 @@ enum trace_iterator_flags { TRACE_ITER_GRAPH_TIME = 0x80000, TRACE_ITER_RECORD_CMD = 0x100000, TRACE_ITER_OVERWRITE = 0x200000, + TRACE_ITER_STOP_ON_FREE = 0x400000, }; /* -- cgit v1.1 From bd38c0e6f98326132a691d73b2056b426423c638 Mon Sep 17 00:00:00 2001 From: Paul McQuade Date: Tue, 31 May 2011 20:51:55 +0100 Subject: ftrace: Fixed an include coding style issue Removed because was already declared. Braces of struct's coding style fixed. Cc: Frederic Weisbecker Signed-off-by: Paul McQuade Link: http://lkml.kernel.org/r/4DE59711.3090900@gmail.com Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1ee417f..e153807 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -32,7 +32,6 @@ #include -#include #include #include "trace_output.h" @@ -82,8 +81,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops ftrace_list_end __read_mostly = -{ +static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, }; @@ -785,8 +783,7 @@ static void unregister_ftrace_profiler(void) unregister_ftrace_graph(); } #else -static struct ftrace_ops ftrace_profile_ops __read_mostly = -{ +static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, }; -- cgit v1.1 From 321e68b095addc473ca52ced9acfa59be88f76e6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:47 +0200 Subject: tracing, function_graph: Remove dependency of abstime and duration fields on latency The display of absolute time and duration fields is based on the latency field. This was added during the irqsoff/wakeup tracers graph support changes. It's causing confusion in what fields will be displayed for the function_graph tracer itself. So I'm removing this depency, and adding absolute time and duration fields to the preemptirqsoff preemptoff irqsoff wakeup tracers. With following commands: # echo function_graph > ./current_tracer # cat trace This is what it looked like before: # tracer: function_graph # # TIME CPU DURATION FUNCTION CALLS # | | | | | | | | 0) 0.068 us | } /* page_add_file_rmap */ 0) | _raw_spin_unlock() { ... This is what it looks like now: # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 0) 0.068 us | } /* add_preempt_count */ 0) 0.993 us | } /* vfsmount_lock_local_lock */ ... For preemptirqsoff preemptoff irqsoff wakeup tracers, this is what it looked like before: SNIP # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> lock-depth # |||| / # CPU TASK/PID ||||| DURATION FUNCTION CALLS # | | | ||||| | | | | | | 1) -0 | d..1 0.000 us | acpi_idle_enter_simple(); ... This is what it looks like now: SNIP # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / # TIME CPU TASK/PID |||| DURATION FUNCTION CALLS # | | | | |||| | | | | | | 19.847735 | 1) -0 | d..1 0.000 us | acpi_idle_enter_simple(); ... Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-2-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 19 +++---------------- kernel/trace/trace_irqsoff.c | 4 +++- kernel/trace/trace_sched_wakeup.c | 4 +++- 3 files changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 962cdb2..352652e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1207,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -__print_graph_function_flags(struct trace_iterator *iter, u32 flags) +print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1270,18 +1270,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return __print_graph_function_flags(iter, tracer_flags.val); -} - -enum print_line_t print_graph_function_flags(struct trace_iterator *iter, - u32 flags) -{ - if (trace_flags & TRACE_ITER_LATENCY_FMT) - flags |= TRACE_GRAPH_PRINT_DURATION; - else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - - return __print_graph_function_flags(iter, flags); + return print_graph_function_flags(iter, tracer_flags.val); } static enum print_line_t @@ -1364,9 +1353,7 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) return; print_trace_header(s, iter); - flags |= TRACE_GRAPH_PRINT_DURATION; - } else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; + } __print_graph_headers_flags(s, flags); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index c77424b..667aa8c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -226,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter) } #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ - TRACE_GRAPH_PRINT_PROC) + TRACE_GRAPH_PRINT_PROC | \ + TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_DURATION) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f029dd4..e4a70c0 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -227,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter) graph_trace_close(iter); } -#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ + TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_DURATION) static enum print_line_t wakeup_print_line(struct trace_iterator *iter) { -- cgit v1.1 From ffeb80fc30acbf6bd51cb47a1815f621a9d017dc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:48 +0200 Subject: tracing, function_graph: Merge overhead and duration display functions Functions print_graph_overhead() and print_graph_duration() displays data for one field - DURATION. I merged them into single function print_graph_duration(), and added a way to display the empty parts of the field. This way the print_graph_irq() function can use this column to display the IRQ signs if needed and the DURATION field details stays inside the print_graph_duration() function. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-3-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 148 +++++++++++++++++------------------ 1 file changed, 74 insertions(+), 74 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 352652e..44b955f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = { static struct trace_array *graph_array; +/* + * DURATION column is being also used to display IRQ signs, + * following values are used by print_graph_irq and others + * to fill in space into DURATION column. + */ +enum { + DURATION_FILL_FULL = -1, + DURATION_FILL_START = -2, + DURATION_FILL_END = -3, +}; + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s, + u32 flags); /* Add a function return address to the trace stack on thread info.*/ int @@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter, return next; } -/* Signal a overhead of time execution to the output */ -static int -print_graph_overhead(unsigned long long duration, struct trace_seq *s, - u32 flags) -{ - /* If duration disappear, we don't need anything */ - if (!(flags & TRACE_GRAPH_PRINT_DURATION)) - return 1; - - /* Non nested entry or return */ - if (duration == -1) - return trace_seq_printf(s, " "); - - if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { - /* Duration exceeded 100 msecs */ - if (duration > 100000ULL) - return trace_seq_printf(s, "! "); - - /* Duration exceeded 10 msecs */ - if (duration > 10000ULL) - return trace_seq_printf(s, "+ "); - } - - return trace_seq_printf(s, " "); -} - static int print_graph_abs_time(u64 t, struct trace_seq *s) { unsigned long usecs_rem; @@ -650,9 +638,9 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* No overhead */ - ret = print_graph_overhead(-1, s, flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + ret = print_graph_duration(DURATION_FILL_START, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; if (type == TRACE_GRAPH_ENT) ret = trace_seq_printf(s, "==========>"); @@ -662,9 +650,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Don't close the duration column if haven't one */ - if (flags & TRACE_GRAPH_PRINT_DURATION) - trace_seq_printf(s, " |"); + ret = print_graph_duration(DURATION_FILL_END, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; + ret = trace_seq_printf(s, "\n"); if (!ret) @@ -716,9 +705,48 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) } static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s) +print_graph_duration(unsigned long long duration, struct trace_seq *s, + u32 flags) { - int ret; + int ret = -1; + + if (!(flags & TRACE_GRAPH_PRINT_DURATION)) + return TRACE_TYPE_HANDLED; + + /* No real adata, just filling the column with spaces */ + switch (duration) { + case DURATION_FILL_FULL: + ret = trace_seq_printf(s, " | "); + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + case DURATION_FILL_START: + ret = trace_seq_printf(s, " "); + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + case DURATION_FILL_END: + ret = trace_seq_printf(s, " |"); + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + } + + /* Signal a overhead of time execution to the output */ + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { + /* Duration exceeded 100 msecs */ + if (duration > 100000ULL) + ret = trace_seq_printf(s, "! "); + /* Duration exceeded 10 msecs */ + else if (duration > 10000ULL) + ret = trace_seq_printf(s, "+ "); + } + + /* + * The -1 means we either did not exceed the duration tresholds + * or we dont want to print out the overhead. Either way we need + * to fill out the space. + */ + if (ret == -1) + ret = trace_seq_printf(s, " "); + + /* Catching here any failure happenned above */ + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; ret = trace_print_graph_duration(duration, s); if (ret != TRACE_TYPE_HANDLED) @@ -767,18 +795,11 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data->enter_funcs[call->depth] = 0; } - /* Overhead */ - ret = print_graph_overhead(duration, s, flags); - if (!ret) + /* Overhead and duration */ + ret = print_graph_duration(duration, s, flags); + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - /* Duration */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); @@ -815,17 +836,10 @@ print_graph_entry_nested(struct trace_iterator *iter, cpu_data->enter_funcs[call->depth] = call->func; } - /* No overhead */ - ret = print_graph_overhead(-1, s, flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - /* No time */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -1078,18 +1092,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; - /* Overhead */ - ret = print_graph_overhead(duration, s, flags); - if (!ret) + /* Overhead and duration */ + ret = print_graph_duration(duration, s, flags); + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - /* Duration */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); @@ -1146,17 +1153,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; - /* No overhead */ - ret = print_graph_overhead(-1, s, flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - /* No time */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; /* Indentation */ if (depth > 0) -- cgit v1.1 From f56e7f8efb4ec200364f690a9902713410e24d47 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:49 +0200 Subject: tracing, function: Fix trace header to follow context-info option The header display of function tracer does not follow the context-info option, so field names are displayed even if this option is off. Added check for TRACE_ITER_CONTEXT_INFO trace_flags. With following commands: # echo function > ./current_tracer # echo 0 > options/context-info # cat trace This is what it looked like before: # tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | add_preempt_count <-schedule rcu_note_context_switch <-schedule ... This is what it looks like now: # tracer: function # _raw_spin_unlock_irqrestore <-hrtimer_try_to_cancel ... Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-4-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42fdf3a..cf22b4b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2053,6 +2053,9 @@ void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) -- cgit v1.1 From 199abfab40389963b397c2982222e68ea782b2cf Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:50 +0200 Subject: tracing, function_graph: Remove lock-depth from latency trace The lock_depth was removed in commit e6e1e25 tracing: Remove lock_depth from event entry Removing the lock_depth info from function_graph latency header. With following commands: # echo function_graph > ./current_tracer # echo 1 > options/latency-format # cat trace This is what it looked like before: # tracer: function_graph # # function_graph latency trace v1.1.5 on 3.0.0-rc1-tip+ # -------------------------------------------------------------------- # latency: 0 us, #59756/311298, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> lock-depth # |||| / # CPU||||| DURATION FUNCTION CALLS # | ||||| | | | | | | 0) .... 0.068 us | } /* __rcu_read_unlock */ ... This is what it looks like now: # tracer: function_graph # # function_graph latency trace v1.1.5 on 3.0.0-rc1-tip+ # -------------------------------------------------------------------- # latency: 0 us, #59747/1744610, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / # CPU|||| DURATION FUNCTION CALLS # | |||| | | | | | | 0) ..s. 1.641 us | } /* __rcu_process_callbacks */ ... Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-5-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 44b955f..1da5b97 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1298,8 +1298,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); - seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); - seq_printf(s, "#%.*s|||| / \n", size, spaces); + seq_printf(s, "#%.*s||| / \n", size, spaces); } static void __print_graph_headers_flags(struct seq_file *s, u32 flags) @@ -1318,7 +1317,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " TASK/PID "); if (lat) - seq_printf(s, "|||||"); + seq_printf(s, "||||"); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); @@ -1332,7 +1331,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " | | "); if (lat) - seq_printf(s, "|||||"); + seq_printf(s, "||||"); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); -- cgit v1.1 From 749230b06a753a22f6ed96e5dd60815d6ab12865 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:51 +0200 Subject: tracing, function_graph: Add context-info support for function_graph tracer The function_graph tracer does not follow global context-info option. Adding TRACE_ITER_CONTEXT_INFO trace_flags check to enable it. With following commands: # echo function_graph > ./current_tracer # echo 0 > options/context-info # cat trace This is what it looked like before: # tracer: function_graph # # TIME CPU DURATION FUNCTION CALLS # | | | | | | | | 1) 0.079 us | } /* __vma_link_rb */ 1) 0.056 us | copy_page_range(); 1) | security_vm_enough_memory() { ... This is what it looks like now: # tracer: function_graph # } /* update_ts_time_stats */ timekeeping_max_deferment(); ... Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-6-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 53 +++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 22 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 1da5b97..e8d6bb5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -613,28 +613,30 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return TRACE_TYPE_UNHANDLED; - /* Absolute time */ - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + /* Absolute time */ + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } - /* Cpu */ - if (flags & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + /* Cpu */ + if (flags & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } - /* Proc */ - if (flags & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + /* Proc */ + if (flags & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } } /* No overhead */ @@ -710,8 +712,9 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, { int ret = -1; - if (!(flags & TRACE_GRAPH_PRINT_DURATION)) - return TRACE_TYPE_HANDLED; + if (!(flags & TRACE_GRAPH_PRINT_DURATION) || + !(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return TRACE_TYPE_HANDLED; /* No real adata, just filling the column with spaces */ switch (duration) { @@ -879,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return 0; + /* Absolute time */ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); @@ -1346,6 +1352,9 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) { struct trace_iterator *iter = s->private; + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + if (trace_flags & TRACE_ITER_LATENCY_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) -- cgit v1.1 From 22fe9b54d859e53bfbbbdc1a0a77a82bc453927c Mon Sep 17 00:00:00 2001 From: Peter Huewe Date: Tue, 7 Jun 2011 21:58:27 +0200 Subject: tracing: Convert to kstrtoul_from_user This patch replaces the code for getting an unsigned long from a userspace buffer by a simple call to kstroul_from_user. This makes it easier to read and less error prone. Signed-off-by: Peter Huewe Link: http://lkml.kernel.org/r/1307476707-14762-1-git-send-email-peterhuewe@gmx.de Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 13 ++------- kernel/trace/ring_buffer.c | 13 ++------- kernel/trace/trace.c | 65 +++++++-------------------------------------- kernel/trace/trace_events.c | 26 +++--------------- kernel/trace/trace_stack.c | 13 ++------- 5 files changed, 20 insertions(+), 110 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e153807..458018a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -803,19 +803,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; - char buf[64]; /* big enough to hold a number */ int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; val = !!val; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 725153d..f00ede3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3980,20 +3980,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long *p = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; if (val) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cf22b4b..c977018 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2706,20 +2706,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; val = !!val; @@ -3006,20 +2997,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long *ptr = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; *ptr = val * 1000; @@ -3474,19 +3456,10 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; - char buf[64]; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; /* must have at least 1 entry */ @@ -4139,19 +4112,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, { struct trace_option_dentry *topt = filp->private_data; unsigned long val; - char buf[64]; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; if (val != 0 && val != 1) @@ -4199,20 +4163,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { long index = (long)filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; if (val != 0 && val != 1) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 686ec39..4d7e149 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -486,20 +486,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; ret = tracing_update_buffers(); @@ -571,19 +562,10 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, { const char *system = filp->private_data; unsigned long val; - char buf[64]; ssize_t ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; ret = tracing_update_buffers(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b0b53b8..77575b3 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -156,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, { long *ptr = filp->private_data; unsigned long val, flags; - char buf[64]; int ret; int cpu; - if (count >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, count)) - return -EFAULT; - - buf[count] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, count, 10, &val); + if (ret) return ret; local_irq_save(flags); -- cgit v1.1 From d7ec4bfed6c97405c6417970ba06c439e08ab8e3 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 7 Jun 2011 17:01:42 -0700 Subject: ring-buffer: Set __GFP_NORETRY flag for ring buffer allocating process The tracing ring buffer is allocated from kernel memory. While allocating a large chunk of memory, OOM might happen which destabilizes the system. Thus random processes might get killed during the allocation. This patch adds __GFP_NORETRY flag to the ring buffer allocation calls to make it fail more gracefully if the system will not be able to complete the allocation request. Acked-by: David Rientjes Signed-off-by: Vaibhav Nagarnaik Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Michael Rubin Cc: David Sharp Link: http://lkml.kernel.org/r/1307491302-9236-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f00ede3..731201b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1004,9 +1004,14 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, for (i = 0; i < nr_pages; i++) { struct page *page; - + /* + * __GFP_NORETRY flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is + * not destabilized. + */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); + GFP_KERNEL | __GFP_NORETRY, + cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; @@ -1015,7 +1020,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, &pages); page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - GFP_KERNEL, 0); + GFP_KERNEL | __GFP_NORETRY, 0); if (!page) goto free_pages; bpage->page = page_address(page); @@ -1377,13 +1382,20 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { struct page *page; + /* + * __GFP_NORETRY flag makes sure that the allocation + * fails gracefully without invoking oom-killer and + * the system is not destabilized. + */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); + GFP_KERNEL | __GFP_NORETRY, + cpu_to_node(cpu)); if (!bpage) goto free_pages; list_add(&bpage->list, &pages); - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); if (!page) goto free_pages; bpage->page = page_address(page); @@ -3737,7 +3749,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) struct buffer_data_page *bpage; struct page *page; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); if (!page) return NULL; -- cgit v1.1 From 1fd8df2c3970c9e7e4e262354154ee39e58bdd7c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 8 Jun 2011 16:09:34 +0900 Subject: tracing/kprobes: Fix kprobe-tracer to support stack trace Fix to support kernel stack trace correctly on kprobe-tracer. Since the execution path of kprobe-based dynamic events is different from other tracepoint-based events, normal ftrace_trace_stack() doesn't work correctly. To fix that, this introduces ftrace_trace_stack_regs() which traces stack via pt_regs instead of current stack register. e.g. # echo p schedule+4 > /sys/kernel/debug/tracing/kprobe_events # echo 1 > /sys/kernel/debug/tracing/options/stacktrace # echo 1 > /sys/kernel/debug/tracing/events/kprobes/enable # head -n 20 /sys/kernel/debug/tracing/trace bash-2968 [000] 10297.050245: p_schedule_4: (schedule+0x4/0x4ca) bash-2968 [000] 10297.050247: => schedule_timeout => n_tty_read => tty_read => vfs_read => sys_read => system_call_fastpath kworker/0:1-2940 [000] 10297.050265: p_schedule_4: (schedule+0x4/0x4ca) kworker/0:1-2940 [000] 10297.050266: => worker_thread => kthread => kernel_thread_helper sshd-1132 [000] 10297.050365: p_schedule_4: (schedule+0x4/0x4ca) sshd-1132 [000] 10297.050365: => sysret_careful Note: Even with this fix, the first entry will be skipped if the probe is put on the function entry area before the frame pointer is set up (usually, that is 4 bytes (push %bp; mov %sp %bp) on x86), because stack unwinder depends on the frame pointer. Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: yrl.pp-manager.tt@hitachi.com Cc: Peter Zijlstra Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20110608070934.17777.17116.stgit@fedora15 Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 34 +++++++++++++++++++++++++++++----- kernel/trace/trace.h | 9 +++++++++ kernel/trace/trace_kprobe.c | 6 ++++-- 3 files changed, 42 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c977018..d9c1612 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1193,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); +void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs) +{ + ring_buffer_unlock_commit(buffer, event); + + ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); + ftrace_trace_userstack(buffer, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); + void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { @@ -1238,7 +1250,7 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, #ifdef CONFIG_STACKTRACE static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc) + int skip, int pc, struct pt_regs *regs) { struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; @@ -1257,24 +1269,36 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.skip = skip; trace.entries = entry->caller; - save_stack_trace(&trace); + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); } +void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc, struct pt_regs *regs) +{ + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + __ftrace_trace_stack(buffer, flags, skip, pc, regs); +} + void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(buffer, flags, skip, pc); + __ftrace_trace_stack(buffer, flags, skip, pc, NULL); } void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr->buffer, flags, skip, pc); + __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); } /** @@ -1290,7 +1314,7 @@ void trace_dump_stack(void) local_save_flags(flags); /* skipping 3 traces, seems to get us at the caller of this function */ - __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); + __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); } static DEFINE_PER_CPU(int, user_stack_count); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 742f545..a3e2db7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -389,6 +389,9 @@ void update_max_tr_single(struct trace_array *tr, void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc); +void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc, struct pt_regs *regs); + void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc); @@ -400,6 +403,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer, { } +static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer, + unsigned long flags, int skip, + int pc, struct pt_regs *regs) +{ +} + static inline void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f925c45..7053ef3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1397,7 +1397,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + trace_nowake_buffer_unlock_commit_regs(buffer, event, + irq_flags, pc, regs); } /* Kretprobe handler */ @@ -1429,7 +1430,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + trace_nowake_buffer_unlock_commit_regs(buffer, event, + irq_flags, pc, regs); } /* Event entry printers */ -- cgit v1.1 From e9dbfae53eeb9fc3d4bb7da3df87fa9875f5da02 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jul 2011 11:36:06 -0400 Subject: tracing: Fix bug when reading system filters on module removal The event system is freed when its nr_events is set to zero. This happens when a module created an event system and then later the module is removed. Modules may share systems, so the system is allocated when it is created and freed when the modules are unloaded and all the events under the system are removed (nr_events set to zero). The problem arises when a task opened the "filter" file for the system. If the module is unloaded and it removed the last event for that system, the system structure is freed. If the task that opened the filter file accesses the "filter" file after the system has been freed, the system will access an invalid pointer. By adding a ref_count, and using it to keep track of what is using the event system, we can free it after all users are finished with the event system. Cc: Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 86 +++++++++++++++++++++++++++++++++----- kernel/trace/trace_events_filter.c | 6 +++ 3 files changed, 82 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 229f859..f807407 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,6 +677,7 @@ struct event_subsystem { struct dentry *entry; struct event_filter *filter; int nr_events; + int ref_count; }; #define FILTER_PRED_INVALID ((unsigned short)-1) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 686ec39..ffc5b28 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -244,6 +244,35 @@ static void ftrace_clear_events(void) mutex_unlock(&event_mutex); } +static void __put_system(struct event_subsystem *system) +{ + struct event_filter *filter = system->filter; + + WARN_ON_ONCE(system->ref_count == 0); + if (--system->ref_count) + return; + + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + kfree(system); +} + +static void __get_system(struct event_subsystem *system) +{ + WARN_ON_ONCE(system->ref_count == 0); + system->ref_count++; +} + +static void put_system(struct event_subsystem *system) +{ + mutex_lock(&event_mutex); + __put_system(system); + mutex_unlock(&event_mutex); +} + /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ @@ -826,6 +855,47 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static LIST_HEAD(event_subsystems); + +static int subsystem_open(struct inode *inode, struct file *filp) +{ + struct event_subsystem *system = NULL; + int ret; + + /* Make sure the system still exists */ + mutex_lock(&event_mutex); + list_for_each_entry(system, &event_subsystems, list) { + if (system == inode->i_private) { + /* Don't open systems with no events */ + if (!system->nr_events) { + system = NULL; + break; + } + __get_system(system); + break; + } + } + mutex_unlock(&event_mutex); + + if (system != inode->i_private) + return -ENODEV; + + ret = tracing_open_generic(inode, filp); + if (ret < 0) + put_system(system); + + return ret; +} + +static int subsystem_release(struct inode *inode, struct file *file) +{ + struct event_subsystem *system = inode->i_private; + + put_system(system); + + return 0; +} + static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -963,10 +1033,11 @@ static const struct file_operations ftrace_event_filter_fops = { }; static const struct file_operations ftrace_subsystem_filter_fops = { - .open = tracing_open_generic, + .open = subsystem_open, .read = subsystem_filter_read, .write = subsystem_filter_write, .llseek = default_llseek, + .release = subsystem_release, }; static const struct file_operations ftrace_system_enable_fops = { @@ -1002,8 +1073,6 @@ static struct dentry *event_trace_events_dir(void) return d_events; } -static LIST_HEAD(event_subsystems); - static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { @@ -1013,6 +1082,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { + __get_system(system); system->nr_events++; return system->entry; } @@ -1035,6 +1105,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) } system->nr_events = 1; + system->ref_count = 1; system->name = kstrdup(name, GFP_KERNEL); if (!system->name) { debugfs_remove(system->entry); @@ -1184,16 +1255,9 @@ static void remove_subsystem_dir(const char *name) list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { if (!--system->nr_events) { - struct event_filter *filter = system->filter; - debugfs_remove_recursive(system->entry); list_del(&system->list); - if (filter) { - kfree(filter->filter_string); - kfree(filter); - } - kfree(system->name); - kfree(system); + __put_system(system); } break; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8008ddc..256764ec 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, mutex_lock(&event_mutex); + /* Make sure the system still has events */ + if (!system->nr_events) { + err = -ENODEV; + goto out_unlock; + } + if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); -- cgit v1.1 From 40ee4dffff061399eb9358e0c8fcfbaf8de4c8fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jul 2011 14:32:51 -0400 Subject: tracing: Have "enable" file use refcounts like the "filter" file The "enable" file for the event system can be removed when a module is unloaded and the event system only has events from that module. As the event system nr_events count goes to zero, it may be freed if its ref_count is also set to zero. Like the "filter" file, the "enable" file may be opened by a task and referenced later, after a module has been unloaded and the events for that event system have been removed. Although the "filter" file referenced the event system structure, the "enable" file only references a pointer to the event system name. Since the name is freed when the event system is removed, it is possible that an access to the "enable" file may reference a freed pointer. Update the "enable" file to use the subsystem_open() routine that the "filter" file uses, to keep a reference to the event system structure while the "enable" file is opened. Cc: Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ffc5b28..3e2a7c9 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -557,7 +557,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; - const char *system = filp->private_data; + struct event_subsystem *system = filp->private_data; struct ftrace_event_call *call; char buf[2]; int set = 0; @@ -568,7 +568,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (!call->name || !call->class || !call->class->reg) continue; - if (system && strcmp(call->class->system, system) != 0) + if (system && strcmp(call->class->system, system->name) != 0) continue; /* @@ -598,7 +598,8 @@ static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - const char *system = filp->private_data; + struct event_subsystem *system = filp->private_data; + const char *name = NULL; unsigned long val; char buf[64]; ssize_t ret; @@ -622,7 +623,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; - ret = __ftrace_set_clr_event(NULL, system, NULL, val); + /* + * Opening of "enable" adds a ref count to system, + * so the name is safe to use. + */ + if (system) + name = system->name; + + ret = __ftrace_set_clr_event(NULL, name, NULL, val); if (ret) goto out; @@ -862,6 +870,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) struct event_subsystem *system = NULL; int ret; + if (!inode->i_private) + goto skip_search; + /* Make sure the system still exists */ mutex_lock(&event_mutex); list_for_each_entry(system, &event_subsystems, list) { @@ -880,8 +891,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) if (system != inode->i_private) return -ENODEV; + skip_search: ret = tracing_open_generic(inode, filp); - if (ret < 0) + if (ret < 0 && system) put_system(system); return ret; @@ -891,7 +903,8 @@ static int subsystem_release(struct inode *inode, struct file *file) { struct event_subsystem *system = inode->i_private; - put_system(system); + if (system) + put_system(system); return 0; } @@ -1041,10 +1054,11 @@ static const struct file_operations ftrace_subsystem_filter_fops = { }; static const struct file_operations ftrace_system_enable_fops = { - .open = tracing_open_generic, + .open = subsystem_open, .read = system_enable_read, .write = system_enable_write, .llseek = default_llseek, + .release = subsystem_release, }; static const struct file_operations ftrace_show_header_fops = { @@ -1133,8 +1147,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } - trace_create_file("enable", 0644, system->entry, - (void *)system->name, + trace_create_file("enable", 0644, system->entry, system, &ftrace_system_enable_fops); return system->entry; -- cgit v1.1 From 43dd61c9a09bd413e837df829e6bfb42159be52a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 Jul 2011 11:09:22 -0400 Subject: ftrace: Fix regression of :mod:module function enabling The new code that allows different utilities to pick and choose what functions they trace broke the :mod: hook that allows users to trace only functions of a particular module. The reason is that the :mod: hook bypasses the hash that is setup to allow individual users to trace their own functions and uses the global hash directly. But if the global hash has not been set up, it will cause a bug: echo '*:mod:radeon' > /sys/kernel/debug/set_ftrace_filter produces: [drm:drm_mode_getfb] *ERROR* invalid framebuffer id [drm:radeon_crtc_page_flip] *ERROR* failed to reserve new rbo buffer before flip BUG: unable to handle kernel paging request at ffffffff8160ec90 IP: [] add_hash_entry+0x66/0xd0 PGD 1a05067 PUD 1a09063 PMD 80000000016001e1 Oops: 0003 [#1] SMP Jul 7 04:02:28 phyllis kernel: [55303.858604] CPU 1 Modules linked in: cryptd aes_x86_64 aes_generic binfmt_misc rfcomm bnep ip6table_filter hid radeon r8169 ahci libahci mii ttm drm_kms_helper drm video i2c_algo_bit intel_agp intel_gtt Pid: 10344, comm: bash Tainted: G WC 3.0.0-rc5 #1 Dell Inc. Inspiron N5010/0YXXJJ RIP: 0010:[] [] add_hash_entry+0x66/0xd0 RSP: 0018:ffff88003a96bda8 EFLAGS: 00010246 RAX: ffff8801301735c0 RBX: ffffffff8160ec80 RCX: 0000000000306ee0 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880137c92940 RBP: ffff88003a96bdb8 R08: ffff880137c95680 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff81c9df78 R13: ffff8801153d1000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f329c18a700(0000) GS:ffff880137c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffff8160ec90 CR3: 000000003002b000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process bash (pid: 10344, threadinfo ffff88003a96a000, task ffff88012fcfc470) Stack: 0000000000000fd0 00000000000000fc ffff88003a96be38 ffffffff810d92f5 ffff88011c4c4e00 ffff880000000000 000000000b69f4d0 ffffffff8160ec80 ffff8800300e6f06 0000000081130295 0000000000000282 ffff8800300e6f00 Call Trace: [] match_records+0x155/0x1b0 [] ftrace_mod_callback+0xbc/0x100 [] ftrace_regex_write+0x16f/0x210 [] ftrace_filter_write+0xf/0x20 [] vfs_write+0xc8/0x190 [] sys_write+0x51/0x90 [] system_call_fastpath+0x16/0x1b Code: 48 8b 33 31 d2 48 85 f6 75 33 49 89 d4 4c 03 63 08 49 8b 14 24 48 85 d2 48 89 10 74 04 48 89 42 08 49 89 04 24 4c 89 60 08 31 d2 RIP [] add_hash_entry+0x66/0xd0 RSP CR2: ffffffff8160ec90 ---[ end trace a5d031828efdd88e ]--- Reported-by: Brian Marete Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 12 +++--------- kernel/trace/trace_functions.c | 3 ++- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 908038f..1c4c0b0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2407,10 +2407,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) */ static int -ftrace_mod_callback(char *func, char *cmd, char *param, int enable) +ftrace_mod_callback(struct ftrace_hash *hash, + char *func, char *cmd, char *param, int enable) { - struct ftrace_ops *ops = &global_ops; - struct ftrace_hash *hash; char *mod; int ret = -EINVAL; @@ -2430,11 +2429,6 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) if (!strlen(mod)) return ret; - if (enable) - hash = ops->filter_hash; - else - hash = ops->notrace_hash; - ret = ftrace_match_module_records(hash, func, mod); if (!ret) ret = -EINVAL; @@ -2760,7 +2754,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash, mutex_lock(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { if (strcmp(p->name, command) == 0) { - ret = p->func(func, command, next, enable); + ret = p->func(hash, func, command, next, enable); goto out_unlock; } } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8d0e1cc..c7b0c6a 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -324,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) } static int -ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) +ftrace_trace_onoff_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; void *count = (void *)-1; -- cgit v1.1 From e4a3f541f0b67fdad98b326c851dfe7f4b6b6dad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Jun 2011 19:02:29 -0400 Subject: tracing: Still trace filtered irq functions when irq trace is disabled If a function is set to be traced by the set_graph_function, but the option funcgraph-irqs is zero, and the traced function happens to be called from a interrupt, it will not be traced. The point of funcgraph-irqs is to not trace interrupts when we are preempted by an irq, not to not trace functions we want to trace that happen to be *in* a irq. Luckily the current->trace_recursion element is perfect to add a flag to help us be able to trace functions within an interrupt even when we are not tracing interrupts that preempt the trace. Reported-by: Heiko Carstens Tested-by: Heiko Carstens Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 50 ++++++++++++++++++++++++------------ kernel/trace/trace_functions_graph.c | 2 +- 2 files changed, 35 insertions(+), 17 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a3e2db7..651f35b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -278,6 +278,29 @@ struct tracer { }; +/* Only current can touch trace_recursion */ +#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) +#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) + +/* Ring buffer has the 10 LSB bits to count */ +#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) + +/* for function tracing recursion */ +#define TRACE_INTERNAL_BIT (1<<11) +#define TRACE_GLOBAL_BIT (1<<12) +/* + * Abuse of the trace_recursion. + * As we need a way to maintain state if we are tracing the function + * graph in irq because we want to trace a particular function that + * was called in irq context but we have irq tracing off. Since this + * can only be modified by current, we can reuse trace_recursion. + */ +#define TRACE_IRQ_BIT (1<<13) + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) + #define TRACE_PIPE_ALL_CPU -1 int tracer_init(struct tracer *t, struct trace_array *tr); @@ -516,8 +539,18 @@ static inline int ftrace_graph_addr(unsigned long addr) return 1; for (i = 0; i < ftrace_graph_count; i++) { - if (addr == ftrace_graph_funcs[i]) + if (addr == ftrace_graph_funcs[i]) { + /* + * If no irqs are to be traced, but a set_graph_function + * is set, and called by an interrupt handler, we still + * want to trace it. + */ + if (in_irq()) + trace_recursion_set(TRACE_IRQ_BIT); + else + trace_recursion_clear(TRACE_IRQ_BIT); return 1; + } } return 0; @@ -794,19 +827,4 @@ extern const char *__stop___trace_bprintk_fmt[]; FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" -/* Only current can touch trace_recursion */ -#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) -#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) - -/* Ring buffer has the 10 LSB bits to count */ -#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) - -/* for function tracing recursion */ -#define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_GLOBAL_BIT (1<<12) - -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) - #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e8d6bb5..a7d2a4c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -227,7 +227,7 @@ int __trace_graph_entry(struct trace_array *tr, static inline int ftrace_graph_ignore_irqs(void) { - if (!ftrace_graph_skip_irqs) + if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) return 0; return in_irq(); -- cgit v1.1 From 4376cac66778b25e599be3f5d54f33f58ba8ead7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Jun 2011 23:28:13 -0400 Subject: ftrace: Do not disable interrupts for modules in mcount update When I mounted an NFS directory, it caused several modules to be loaded. At the time I was running the preemptirqsoff tracer, and it showed the following output: # tracer: preemptirqsoff # # preemptirqsoff latency trace v1.1.5 on 2.6.33.9-rt30-mrg-test # -------------------------------------------------------------------- # latency: 1177 us, #4/4, CPU#3 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4) # ----------------- # | task: modprobe-19370 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # => started at: ftrace_module_notify # => ended at: ftrace_module_notify # # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| /_--=> lock-depth # |||||/ delay # cmd pid |||||| time | caller # \ / |||||| \ | / modprobe-19370 3d.... 0us!: ftrace_process_locs <-ftrace_module_notify modprobe-19370 3d.... 1176us : ftrace_process_locs <-ftrace_module_notify modprobe-19370 3d.... 1178us : trace_hardirqs_on <-ftrace_module_notify modprobe-19370 3d.... 1178us : => ftrace_process_locs => ftrace_module_notify => notifier_call_chain => __blocking_notifier_call_chain => blocking_notifier_call_chain => sys_init_module => system_call_fastpath That's over 1ms that interrupts are disabled on a Real-Time kernel! Looking at the cause (being the ftrace author helped), I found that the interrupts are disabled before the code modification of mcounts into nops. The interrupts only need to be disabled on start up around this code, not when modules are being loaded. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c997f73..df93392 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3318,7 +3318,7 @@ static int ftrace_process_locs(struct module *mod, { unsigned long *p; unsigned long addr; - unsigned long flags; + unsigned long flags = 0; /* Shut up gcc */ mutex_lock(&ftrace_lock); p = start; @@ -3336,12 +3336,18 @@ static int ftrace_process_locs(struct module *mod, } /* - * Disable interrupts to prevent interrupts from executing - * code that is being modified. + * We only need to disable interrupts on start up + * because we are modifying code that an interrupt + * may execute, and the modification is not atomic. + * But for modules, nothing runs the code we modify + * until we are finished with it, and there's no + * reason to cause large interrupt latencies while we do it. */ - local_irq_save(flags); + if (!mod) + local_irq_save(flags); ftrace_update_code(mod); - local_irq_restore(flags); + if (!mod) + local_irq_restore(flags); mutex_unlock(&ftrace_lock); return 0; -- cgit v1.1 From 41fb61c2d08107ce96a5dcb3a6289b2afd3e135c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Jul 2011 15:03:44 -0400 Subject: ftrace: Balance records when updating the hash Whenever the hash of the ftrace_ops is updated, the record counts must be balance. This requires disabling the records that are set in the original hash, and then enabling the records that are set in the updated hash. Moving the update into ftrace_hash_move() removes the bug where the hash was updated but the records were not, which results in ftrace triggering a warning and disabling itself because the ftrace_ops filter is updated while the ftrace_ops was registered, and then the failure happens when the ftrace_ops is unregistered. The current code will not trigger this bug, but new code will. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index df93392..853f6f0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1170,8 +1170,14 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) return NULL; } +static void +ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); +static void +ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); + static int -ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) { struct ftrace_func_entry *entry; struct hlist_node *tp, *tn; @@ -1181,9 +1187,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) unsigned long key; int size = src->count; int bits = 0; + int ret; int i; /* + * Remove the current set, update the hash and add + * them back. + */ + ftrace_hash_rec_disable(ops, enable); + + /* * If the new source is empty, just free dst and assign it * the empty_hash. */ @@ -1203,9 +1216,10 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) if (bits > FTRACE_HASH_MAX_BITS) bits = FTRACE_HASH_MAX_BITS; + ret = -ENOMEM; new_hash = alloc_ftrace_hash(bits); if (!new_hash) - return -ENOMEM; + goto out; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1224,7 +1238,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) rcu_assign_pointer(*dst, new_hash); free_ftrace_hash_rcu(old_hash); - return 0; + ret = 0; + out: + /* + * Enable regardless of ret: + * On success, we enable the new hash. + * On failure, we re-enable the original hash. + */ + ftrace_hash_rec_enable(ops, enable); + + return ret; } /* @@ -2845,7 +2868,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, ftrace_match_records(hash, buf, len); mutex_lock(&ftrace_lock); - ret = ftrace_hash_move(orig_hash, hash); + ret = ftrace_hash_move(ops, enable, orig_hash, hash); mutex_unlock(&ftrace_lock); mutex_unlock(&ftrace_regex_lock); @@ -3028,18 +3051,12 @@ ftrace_regex_release(struct inode *inode, struct file *file) orig_hash = &iter->ops->notrace_hash; mutex_lock(&ftrace_lock); - /* - * Remove the current set, update the hash and add - * them back. - */ - ftrace_hash_rec_disable(iter->ops, filter_hash); - ret = ftrace_hash_move(orig_hash, iter->hash); - if (!ret) { - ftrace_hash_rec_enable(iter->ops, filter_hash); - if (iter->ops->flags & FTRACE_OPS_FL_ENABLED - && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); - } + ret = ftrace_hash_move(iter->ops, filter_hash, + orig_hash, iter->hash); + if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) + && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftrace_lock); } free_ftrace_hash(iter->hash); -- cgit v1.1 From 072126f4529196f71a97960248bca54fd4554c2d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Jul 2011 15:08:31 -0400 Subject: ftrace: Update filter when tracing enabled in set_ftrace_filter() Currently, if set_ftrace_filter() is called when the ftrace_ops is active, the function filters will not be updated. They will only be updated when tracing is disabled and re-enabled. Update the functions immediately during set_ftrace_filter(). Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 853f6f0..a0dc0de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2869,6 +2869,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); + if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED + && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftrace_lock); mutex_unlock(&ftrace_regex_lock); -- cgit v1.1 From 6331c28c962561aee59e5a493b7556a4bb585957 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Jul 2011 15:11:02 -0400 Subject: ftrace: Fix dynamic selftest failure on some archs Archs that do not implement CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST, will fail the dynamic ftrace selftest. The function tracer has a quick 'off' variable that will prevent the call back functions from being called. This variable is called function_trace_stop. In x86, this is implemented directly in the mcount assembly, but for other archs, an intermediate function is used called ftrace_test_stop_func(). In dynamic ftrace, the function pointer variable ftrace_trace_function is used to update the caller code in the mcount caller. But for archs that do not have CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST set, it only calls ftrace_test_stop_func() instead, which in turn calls __ftrace_trace_function. When more than one ftrace_ops is registered, the function it calls is ftrace_ops_list_func(), which will iterate over all registered ftrace_ops and call the callbacks that have their hash matching. The issue happens when two ftrace_ops are registered for different functions and one is then unregistered. The __ftrace_trace_function is then pointed to the remaining ftrace_ops callback function directly. This mean it will be called for all functions that were registered to trace by both ftrace_ops that were registered. This is not an issue for archs with CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST, because the update of ftrace_trace_function doesn't happen until after all functions have been updated, and then the mcount caller is updated. But for those archs that do use the ftrace_test_stop_func(), the update is immediate. The dynamic selftest fails because it hits this situation, and the ftrace_ops that it registers fails to only trace what it was suppose to and instead traces all other functions. The solution is to delay the setting of __ftrace_trace_function until after all the functions have been updated according to the registered ftrace_ops. Also, function_trace_stop is set during the update to prevent function tracing from calling code that is caused by the function tracer itself. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a0dc0de..62e26d9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -88,6 +88,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; +static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; @@ -146,9 +147,11 @@ void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; __ftrace_trace_function = ftrace_stub; + __ftrace_trace_function_delay = ftrace_stub; ftrace_pid_function = ftrace_stub; } +#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST /* * For those archs that do not test ftrace_trace_stop in their @@ -208,7 +211,12 @@ static void update_ftrace_function(void) #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; #else +#ifdef CONFIG_DYNAMIC_FTRACE + /* do not update till all functions have been modified */ + __ftrace_trace_function_delay = func; +#else __ftrace_trace_function = func; +#endif ftrace_trace_function = ftrace_test_stop_func; #endif } @@ -1607,6 +1615,12 @@ static int __ftrace_modify_code(void *data) { int *command = data; + /* + * Do not call function tracer while we update the code. + * We are in stop machine, no worrying about races. + */ + function_trace_stop++; + if (*command & FTRACE_ENABLE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) @@ -1620,6 +1634,18 @@ static int __ftrace_modify_code(void *data) else if (*command & FTRACE_STOP_FUNC_RET) ftrace_disable_ftrace_graph_caller(); +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + /* + * For archs that call ftrace_test_stop_func(), we must + * wait till after we update all the function callers + * before we update the callback. This keeps different + * ops that record different functions from corrupting + * each other. + */ + __ftrace_trace_function = __ftrace_trace_function_delay; +#endif + function_trace_stop--; + return 0; } -- cgit v1.1 From 4a9bd3f134decd6d16ead8d288342d57aad486be Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Jul 2011 16:36:53 -0400 Subject: tracing: Have dynamic size event stack traces Currently the stack trace per event in ftace is only 8 frames. This can be quite limiting and sometimes useless. Especially when the "ignore frames" is wrong and we also use up stack frames for the event processing itself. Change this to be dynamic by adding a percpu buffer that we can write a large stack frame into and then copy into the ring buffer. For interrupts and NMIs that come in while another event is being process, will only get to use the 8 frame stack. That should be enough as the task that it interrupted will have the full stack frame anyway. Requested-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 92 +++++++++++++++++++++++++++++++++++++------- kernel/trace/trace_entries.h | 3 +- kernel/trace/trace_output.c | 11 +++--- 3 files changed, 87 insertions(+), 19 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9c1612..e5df02c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1248,6 +1248,15 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, } #ifdef CONFIG_STACKTRACE + +#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) +struct ftrace_stack { + unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; +}; + +static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); +static DEFINE_PER_CPU(int, ftrace_stack_reserve); + static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) @@ -1256,25 +1265,77 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; + int use_stack; + int size = FTRACE_STACK_ENTRIES; + + trace.nr_entries = 0; + trace.skip = skip; + + /* + * Since events can happen in NMIs there's no safe way to + * use the per cpu ftrace_stacks. We reserve it and if an interrupt + * or NMI comes in, it will just have to use the default + * FTRACE_STACK_SIZE. + */ + preempt_disable_notrace(); + + use_stack = ++__get_cpu_var(ftrace_stack_reserve); + /* + * We don't need any atomic variables, just a barrier. + * If an interrupt comes in, we don't care, because it would + * have exited and put the counter back to what we want. + * We just need a barrier to keep gcc from moving things + * around. + */ + barrier(); + if (use_stack == 1) { + trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; + trace.max_entries = FTRACE_STACK_MAX_ENTRIES; + + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); + + if (trace.nr_entries > size) + size = trace.nr_entries; + } else + /* From now on, use_stack is a boolean */ + use_stack = 0; + + size *= sizeof(unsigned long); event = trace_buffer_lock_reserve(buffer, TRACE_STACK, - sizeof(*entry), flags, pc); + sizeof(*entry) + size, flags, pc); if (!event) - return; - entry = ring_buffer_event_data(event); - memset(&entry->caller, 0, sizeof(entry->caller)); + goto out; + entry = ring_buffer_event_data(event); - trace.nr_entries = 0; - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.skip = skip; - trace.entries = entry->caller; + memset(&entry->caller, 0, size); + + if (use_stack) + memcpy(&entry->caller, trace.entries, + trace.nr_entries * sizeof(unsigned long)); + else { + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.entries = entry->caller; + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); + } + + entry->size = trace.nr_entries; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + + out: + /* Again, don't let gcc optimize things here */ + barrier(); + __get_cpu_var(ftrace_stack_reserve)--; + preempt_enable_notrace(); + } void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, @@ -1562,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, ftrace_enable_cpu(); - return event ? ring_buffer_event_data(event) : NULL; + if (event) { + iter->ent_size = ring_buffer_event_length(event); + return ring_buffer_event_data(event); + } + iter->ent_size = 0; + return NULL; } static struct trace_entry * diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e32744c..9336590 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry, TRACE_STACK, F_STRUCT( - __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + __field( int, size ) + __dynamic_array(unsigned long, caller ) ), F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e37de492..5199930 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1107,19 +1107,20 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, { struct stack_entry *field; struct trace_seq *s = &iter->seq; - int i; + unsigned long *p; + unsigned long *end; trace_assign_type(field, iter->ent); + end = (unsigned long *)((long)iter->ent + iter->ent_size); if (!trace_seq_puts(s, "\n")) goto partial; - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) - break; + + for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { if (!trace_seq_puts(s, " => ")) goto partial; - if (!seq_print_ip_sym(s, field->caller[i], flags)) + if (!seq_print_ip_sym(s, *p, flags)) goto partial; if (!trace_seq_puts(s, "\n")) goto partial; -- cgit v1.1 From 7143f168e2aa4bc7f8a8bcfe46d8dc52f7be869a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:26:36 +0900 Subject: tracing/kprobes: Rename probe_* to trace_probe_* Rename probe_* to trace_probe_* for avoiding namespace confliction. This also fixes improper names of find_probe_event() and cleanup_all_probes() to find_trace_probe() and release_all_trace_probes() respectively. Signed-off-by: Masami Hiramatsu Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20110627072636.6528.60374.stgit@fedora15 Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7db7b68..14b88ed 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -555,12 +555,12 @@ struct trace_probe { (sizeof(struct probe_arg) * (n))) -static __kprobes int probe_is_return(struct trace_probe *tp) +static __kprobes int trace_probe_is_return(struct trace_probe *tp) { return tp->rp.handler != NULL; } -static __kprobes const char *probe_symbol(struct trace_probe *tp) +static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) { return tp->symbol ? tp->symbol : "unknown"; } @@ -671,7 +671,7 @@ static void free_trace_probe(struct trace_probe *tp) kfree(tp); } -static struct trace_probe *find_probe_event(const char *event, +static struct trace_probe *find_trace_probe(const char *event, const char *group) { struct trace_probe *tp; @@ -686,7 +686,7 @@ static struct trace_probe *find_probe_event(const char *event, /* Unregister a trace_probe and probe_event: call with locking probe_lock */ static void unregister_trace_probe(struct trace_probe *tp) { - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) unregister_kretprobe(&tp->rp); else unregister_kprobe(&tp->rp.kp); @@ -703,7 +703,7 @@ static int register_trace_probe(struct trace_probe *tp) mutex_lock(&probe_lock); /* register as an event */ - old_tp = find_probe_event(tp->call.name, tp->call.class->system); + old_tp = find_trace_probe(tp->call.name, tp->call.class->system); if (old_tp) { /* delete old event */ unregister_trace_probe(old_tp); @@ -716,7 +716,7 @@ static int register_trace_probe(struct trace_probe *tp) } tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) ret = register_kretprobe(&tp->rp); else ret = register_kprobe(&tp->rp.kp); @@ -1025,7 +1025,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } mutex_lock(&probe_lock); - tp = find_probe_event(event, group); + tp = find_trace_probe(event, group); if (!tp) { mutex_unlock(&probe_lock); pr_info("Event %s/%s doesn't exist.\n", group, event); @@ -1144,7 +1144,7 @@ error: return ret; } -static void cleanup_all_probes(void) +static void release_all_trace_probes(void) { struct trace_probe *tp; @@ -1181,15 +1181,16 @@ static int probes_seq_show(struct seq_file *m, void *v) struct trace_probe *tp = v; int i; - seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); + seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p'); seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); if (!tp->symbol) seq_printf(m, " 0x%p", tp->rp.kp.addr); else if (tp->rp.kp.offset) - seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); + seq_printf(m, " %s+%u", trace_probe_symbol(tp), + tp->rp.kp.offset); else - seq_printf(m, " %s", probe_symbol(tp)); + seq_printf(m, " %s", trace_probe_symbol(tp)); for (i = 0; i < tp->nr_args; i++) seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); @@ -1209,7 +1210,7 @@ static int probes_open(struct inode *inode, struct file *file) { if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - cleanup_all_probes(); + release_all_trace_probes(); return seq_open(file, &probes_seq_op); } @@ -1518,7 +1519,7 @@ static int probe_event_enable(struct ftrace_event_call *call) struct trace_probe *tp = (struct trace_probe *)call->data; tp->flags |= TP_FLAG_TRACE; - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) return enable_kretprobe(&tp->rp); else return enable_kprobe(&tp->rp.kp); @@ -1530,7 +1531,7 @@ static void probe_event_disable(struct ftrace_event_call *call) tp->flags &= ~TP_FLAG_TRACE; if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) disable_kretprobe(&tp->rp); else disable_kprobe(&tp->rp.kp); @@ -1598,7 +1599,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) const char *fmt, *arg; - if (!probe_is_return(tp)) { + if (!trace_probe_is_return(tp)) { fmt = "(%lx)"; arg = "REC->" FIELD_STRING_IP; } else { @@ -1722,7 +1723,7 @@ static int probe_perf_enable(struct ftrace_event_call *call) tp->flags |= TP_FLAG_PROFILE; - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) return enable_kretprobe(&tp->rp); else return enable_kprobe(&tp->rp.kp); @@ -1735,7 +1736,7 @@ static void probe_perf_disable(struct ftrace_event_call *call) tp->flags &= ~TP_FLAG_PROFILE; if (!(tp->flags & TP_FLAG_TRACE)) { - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) disable_kretprobe(&tp->rp); else disable_kprobe(&tp->rp.kp); @@ -1807,7 +1808,7 @@ static int register_probe_event(struct trace_probe *tp) /* Initialize ftrace_event_call */ INIT_LIST_HEAD(&call->class->fields); - if (probe_is_return(tp)) { + if (trace_probe_is_return(tp)) { call->event.funcs = &kretprobe_funcs; call->class->define_fields = kretprobe_event_define_fields; } else { @@ -1899,7 +1900,7 @@ static __init int kprobe_trace_self_tests_init(void) warn++; } else { /* Enable trace point */ - tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); + tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { pr_warning("error on getting new probe.\n"); warn++; @@ -1914,7 +1915,7 @@ static __init int kprobe_trace_self_tests_init(void) warn++; } else { /* Enable trace point */ - tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); + tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { pr_warning("error on getting new probe.\n"); warn++; @@ -1940,7 +1941,7 @@ static __init int kprobe_trace_self_tests_init(void) } end: - cleanup_all_probes(); + release_all_trace_probes(); if (warn) pr_cont("NG: Some tests are failed. Please check them.\n"); else -- cgit v1.1 From f7bc8b61f65726ff98f52e286b28e294499d7a08 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Jul 2011 23:02:27 -0400 Subject: ftrace: Fix regression where ftrace breaks when modules are loaded Enabling function tracer to trace all functions, then load a module and then disable function tracing will cause ftrace to fail. This can also happen by enabling function tracing on the command line: ftrace=function and during boot up, modules are loaded, then you disable function tracing with 'echo nop > current_tracer' you will trigger a bug in ftrace that will shut itself down. The reason is, the new ftrace code keeps ref counts of all ftrace_ops that are registered for tracing. When one or more ftrace_ops are registered, all the records that represent the functions that the ftrace_ops will trace have a ref count incremented. If this ref count is not zero, when the code modification runs, that function will be enabled for tracing. If the ref count is zero, that function will be disabled from tracing. To make sure the accounting was working, FTRACE_WARN_ON()s were added to updating of the ref counts. If the ref count hits its max (> 2^30 ftrace_ops added), or if the ref count goes below zero, a FTRACE_WARN_ON() is triggered which disables all modification of code. Since it is common for ftrace_ops to trace all functions in the kernel, instead of creating > 20,000 hash items for the ftrace_ops, the hash count is just set to zero, and it represents that the ftrace_ops is to trace all functions. This is where the issues arrise. If you enable function tracing to trace all functions, and then add a module, the modules function records do not get the ref count updated. When the function tracer is disabled, all function records ref counts are subtracted. Since the modules never had their ref counts incremented, they go below zero and the FTRACE_WARN_ON() is triggered. The solution to this is rather simple. When modules are loaded, and their functions are added to the the ftrace pool, look to see if any ftrace_ops are registered that trace all functions. And for those, update the ref count for the module function records. Reported-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1c4c0b0..ef9271b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1744,10 +1744,36 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; +static int ops_traces_mod(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash; + + hash = ops->filter_hash; + return !!(!hash || !hash->count); +} + static int ftrace_update_code(struct module *mod) { struct dyn_ftrace *p; cycle_t start, stop; + unsigned long ref = 0; + + /* + * When adding a module, we need to check if tracers are + * currently enabled and if they are set to trace all functions. + * If they are, we need to enable the module functions as well + * as update the reference counts for those function records. + */ + if (mod) { + struct ftrace_ops *ops; + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) { + if (ops->flags & FTRACE_OPS_FL_ENABLED && + ops_traces_mod(ops)) + ref++; + } + } start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; @@ -1760,7 +1786,7 @@ static int ftrace_update_code(struct module *mod) p = ftrace_new_addrs; ftrace_new_addrs = p->newlist; - p->flags = 0L; + p->flags = ref; /* * Do the initial record conversion from mcount jump @@ -1783,7 +1809,7 @@ static int ftrace_update_code(struct module *mod) * conversion puts the module to the correct state, thus * passing the ftrace_make_call check. */ - if (ftrace_start_up) { + if (ftrace_start_up && ref) { int failed = __ftrace_replace_code(p, 1); if (failed) { ftrace_bug(failed, p->ip); -- cgit v1.1 From 1538f888f1e793de04e0f90372352ac1b05833cf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:26:44 +0900 Subject: tracing/kprobes: Merge trace probe enable/disable functions Merge redundant enable/disable functions into enable_trace_probe() and disable_trace_probe(). Signed-off-by: Masami Hiramatsu Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: yrl.pp-manager.tt@hitachi.com Cc: Frederic Weisbecker Cc: Ingo Molnar Link: http://lkml.kernel.org/r/20110627072644.6528.26910.stgit@fedora15 [ converted kprobe selftest to use enable_trace_probe ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 92 ++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 56 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 14b88ed..3be50c5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -683,6 +683,34 @@ static struct trace_probe *find_trace_probe(const char *event, return NULL; } +/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ +static int enable_trace_probe(struct trace_probe *tp, int flag) +{ + int ret = 0; + + tp->flags |= flag; + if (tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)) { + if (trace_probe_is_return(tp)) + ret = enable_kretprobe(&tp->rp); + else + ret = enable_kprobe(&tp->rp.kp); + } + + return ret; +} + +/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ +static void disable_trace_probe(struct trace_probe *tp, int flag) +{ + tp->flags &= ~flag; + if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { + if (trace_probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} + /* Unregister a trace_probe and probe_event: call with locking probe_lock */ static void unregister_trace_probe(struct trace_probe *tp) { @@ -1514,30 +1542,6 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -static int probe_event_enable(struct ftrace_event_call *call) -{ - struct trace_probe *tp = (struct trace_probe *)call->data; - - tp->flags |= TP_FLAG_TRACE; - if (trace_probe_is_return(tp)) - return enable_kretprobe(&tp->rp); - else - return enable_kprobe(&tp->rp.kp); -} - -static void probe_event_disable(struct ftrace_event_call *call) -{ - struct trace_probe *tp = (struct trace_probe *)call->data; - - tp->flags &= ~TP_FLAG_TRACE; - if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { - if (trace_probe_is_return(tp)) - disable_kretprobe(&tp->rp); - else - disable_kprobe(&tp->rp.kp); - } -} - #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ do { \ @@ -1716,49 +1720,25 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); } - -static int probe_perf_enable(struct ftrace_event_call *call) -{ - struct trace_probe *tp = (struct trace_probe *)call->data; - - tp->flags |= TP_FLAG_PROFILE; - - if (trace_probe_is_return(tp)) - return enable_kretprobe(&tp->rp); - else - return enable_kprobe(&tp->rp.kp); -} - -static void probe_perf_disable(struct ftrace_event_call *call) -{ - struct trace_probe *tp = (struct trace_probe *)call->data; - - tp->flags &= ~TP_FLAG_PROFILE; - - if (!(tp->flags & TP_FLAG_TRACE)) { - if (trace_probe_is_return(tp)) - disable_kretprobe(&tp->rp); - else - disable_kprobe(&tp->rp.kp); - } -} #endif /* CONFIG_PERF_EVENTS */ static __kprobes int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) { + struct trace_probe *tp = (struct trace_probe *)event->data; + switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(event); + return enable_trace_probe(tp, TP_FLAG_TRACE); case TRACE_REG_UNREGISTER: - probe_event_disable(event); + disable_trace_probe(tp, TP_FLAG_TRACE); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_perf_enable(event); + return enable_trace_probe(tp, TP_FLAG_PROFILE); case TRACE_REG_PERF_UNREGISTER: - probe_perf_disable(event); + disable_trace_probe(tp, TP_FLAG_PROFILE); return 0; #endif } @@ -1905,7 +1885,7 @@ static __init int kprobe_trace_self_tests_init(void) pr_warning("error on getting new probe.\n"); warn++; } else - probe_event_enable(&tp->call); + enable_trace_probe(tp, TP_FLAG_TRACE); } ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " @@ -1920,7 +1900,7 @@ static __init int kprobe_trace_self_tests_init(void) pr_warning("error on getting new probe.\n"); warn++; } else - probe_event_enable(&tp->call); + enable_trace_probe(tp, TP_FLAG_TRACE); } if (warn) -- cgit v1.1 From 614243181050436785f5a621749a7da2336a7916 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:26:56 +0900 Subject: tracing/kprobes: Support module init function probing To support probing module init functions, kprobe-tracer allows user to define a probe on non-existed function when it is given with a module name. This also enables user to set a probe on a function on a specific module, even if a same name (but different) function is locally defined in another module. The module name must be in the front of function name and separated by a ':'. e.g. btrfs:btrfs_init_sysfs Signed-off-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20110627072656.6528.89970.stgit@fedora15 Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 164 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 138 insertions(+), 26 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3be50c5..acc6664 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -536,6 +536,7 @@ struct probe_arg { /* Flags for trace_probe */ #define TP_FLAG_TRACE 1 #define TP_FLAG_PROFILE 2 +#define TP_FLAG_REGISTERED 4 struct trace_probe { struct list_head list; @@ -565,6 +566,39 @@ static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) return tp->symbol ? tp->symbol : "unknown"; } +static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) +{ + return tp->rp.kp.offset; +} + +static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) +{ + return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); +} + +static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) +{ + return !!(tp->flags & TP_FLAG_REGISTERED); +} + +static __kprobes bool trace_probe_has_gone(struct trace_probe *tp) +{ + return !!(kprobe_gone(&tp->rp.kp)); +} + +static __kprobes bool trace_probe_within_module(struct trace_probe *tp, + struct module *mod) +{ + int len = strlen(mod->name); + const char *name = trace_probe_symbol(tp); + return strncmp(mod->name, name, len) == 0 && name[len] == ':'; +} + +static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) +{ + return !!strchr(trace_probe_symbol(tp), ':'); +} + static int register_probe_event(struct trace_probe *tp); static void unregister_probe_event(struct trace_probe *tp); @@ -689,7 +723,8 @@ static int enable_trace_probe(struct trace_probe *tp, int flag) int ret = 0; tp->flags |= flag; - if (tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)) { + if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && + !trace_probe_has_gone(tp)) { if (trace_probe_is_return(tp)) ret = enable_kretprobe(&tp->rp); else @@ -703,7 +738,7 @@ static int enable_trace_probe(struct trace_probe *tp, int flag) static void disable_trace_probe(struct trace_probe *tp, int flag) { tp->flags &= ~flag; - if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { + if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { if (trace_probe_is_return(tp)) disable_kretprobe(&tp->rp); else @@ -711,13 +746,64 @@ static void disable_trace_probe(struct trace_probe *tp, int flag) } } -/* Unregister a trace_probe and probe_event: call with locking probe_lock */ -static void unregister_trace_probe(struct trace_probe *tp) +/* Internal register function - just handle k*probes and flags */ +static int __register_trace_probe(struct trace_probe *tp) { + int ret; + + if (trace_probe_is_registered(tp)) + return -EINVAL; + + /* Set/clear disabled flag according to tp->flag */ + if (trace_probe_is_enabled(tp)) + tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; + else + tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; + if (trace_probe_is_return(tp)) - unregister_kretprobe(&tp->rp); + ret = register_kretprobe(&tp->rp); else - unregister_kprobe(&tp->rp.kp); + ret = register_kprobe(&tp->rp.kp); + + if (ret == 0) + tp->flags |= TP_FLAG_REGISTERED; + else { + pr_warning("Could not insert probe at %s+%lu: %d\n", + trace_probe_symbol(tp), trace_probe_offset(tp), ret); + if (ret == -ENOENT && trace_probe_is_on_module(tp)) { + pr_warning("This probe might be able to register after" + "target module is loaded. Continue.\n"); + ret = 0; + } else if (ret == -EILSEQ) { + pr_warning("Probing address(0x%p) is not an " + "instruction boundary.\n", + tp->rp.kp.addr); + ret = -EINVAL; + } + } + + return ret; +} + +/* Internal unregister function - just handle k*probes and flags */ +static void __unregister_trace_probe(struct trace_probe *tp) +{ + if (trace_probe_is_registered(tp)) { + if (trace_probe_is_return(tp)) + unregister_kretprobe(&tp->rp); + else + unregister_kprobe(&tp->rp.kp); + tp->flags &= ~TP_FLAG_REGISTERED; + /* Cleanup kprobe for reuse */ + if (tp->rp.kp.symbol_name) + tp->rp.kp.addr = NULL; + } +} + +/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +static void unregister_trace_probe(struct trace_probe *tp) +{ + __unregister_trace_probe(tp); list_del(&tp->list); unregister_probe_event(tp); } @@ -730,41 +816,65 @@ static int register_trace_probe(struct trace_probe *tp) mutex_lock(&probe_lock); - /* register as an event */ + /* Delete old (same name) event if exist */ old_tp = find_trace_probe(tp->call.name, tp->call.class->system); if (old_tp) { - /* delete old event */ unregister_trace_probe(old_tp); free_trace_probe(old_tp); } + + /* Register new event */ ret = register_probe_event(tp); if (ret) { pr_warning("Failed to register probe event(%d)\n", ret); goto end; } - tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; - if (trace_probe_is_return(tp)) - ret = register_kretprobe(&tp->rp); - else - ret = register_kprobe(&tp->rp.kp); - - if (ret) { - pr_warning("Could not insert probe(%d)\n", ret); - if (ret == -EILSEQ) { - pr_warning("Probing address(0x%p) is not an " - "instruction boundary.\n", - tp->rp.kp.addr); - ret = -EINVAL; - } + /* Register k*probe */ + ret = __register_trace_probe(tp); + if (ret < 0) unregister_probe_event(tp); - } else + else list_add_tail(&tp->list, &probe_list); + end: mutex_unlock(&probe_lock); return ret; } +/* Module notifier call back, checking event on the module */ +static int trace_probe_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + struct trace_probe *tp; + int ret; + + if (val != MODULE_STATE_COMING) + return NOTIFY_DONE; + + /* Update probes on coming module */ + mutex_lock(&probe_lock); + list_for_each_entry(tp, &probe_list, list) { + if (trace_probe_within_module(tp, mod)) { + __unregister_trace_probe(tp); + ret = __register_trace_probe(tp); + if (ret) + pr_warning("Failed to re-register probe %s on" + "%s: %d\n", + tp->call.name, mod->name, ret); + } + } + mutex_unlock(&probe_lock); + + return NOTIFY_DONE; +} + +static struct notifier_block trace_probe_module_nb = { + .notifier_call = trace_probe_module_callback, + .priority = 1 /* Invoked after kprobe module callback */ +}; + /* Split symbol and offset. */ static int split_symbol_offset(char *symbol, unsigned long *offset) { @@ -990,8 +1100,8 @@ static int create_trace_probe(int argc, char **argv) { /* * Argument syntax: - * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] - * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] + * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] * Fetch args: * $retval : fetch return value * $stack : fetch stack address @@ -1186,7 +1296,6 @@ static void release_all_trace_probes(void) mutex_unlock(&probe_lock); } - /* Probes listing interfaces */ static void *probes_seq_start(struct seq_file *m, loff_t *pos) { @@ -1827,6 +1936,9 @@ static __init int init_kprobe_trace(void) struct dentry *d_tracer; struct dentry *entry; + if (register_module_notifier(&trace_probe_module_nb)) + return -EINVAL; + d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; -- cgit v1.1 From 7f6878a3d707b947603e09d95df0c3a98987e3a4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:27:03 +0900 Subject: tracing/kprobe: Update symbol reference when loading module Since the address of a module-local variable can only be solved after the target module is loaded, the symbol fetch-argument should be updated when loading target module. Signed-off-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20110627072703.6528.75042.stgit@fedora15 Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index acc6664..5fb3697 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -343,6 +343,14 @@ DEFINE_BASIC_FETCH_FUNCS(deref) DEFINE_FETCH_deref(string) DEFINE_FETCH_deref(string_size) +static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) +{ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} + static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) @@ -377,6 +385,19 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield) #define fetch_bitfield_string_size NULL static __kprobes void +update_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. + */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} + +static __kprobes void free_bitfield_fetch_param(struct bitfield_fetch_param *data) { /* @@ -389,6 +410,7 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) free_symbol_cache(data->orig.data); kfree(data); } + /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) u##t #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) @@ -680,6 +702,16 @@ error: return ERR_PTR(ret); } +static void update_probe_arg(struct probe_arg *arg) +{ + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + update_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + update_deref_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) + update_symbol_cache(arg->fetch.data); +} + static void free_probe_arg(struct probe_arg *arg) { if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) @@ -749,11 +781,14 @@ static void disable_trace_probe(struct trace_probe *tp, int flag) /* Internal register function - just handle k*probes and flags */ static int __register_trace_probe(struct trace_probe *tp) { - int ret; + int i, ret; if (trace_probe_is_registered(tp)) return -EINVAL; + for (i = 0; i < tp->nr_args; i++) + update_probe_arg(&tp->args[i]); + /* Set/clear disabled flag according to tp->flag */ if (trace_probe_is_enabled(tp)) tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; -- cgit v1.1 From 60063497a95e716c9a689af3be2687d261f115b4 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Tue, 26 Jul 2011 16:09:06 -0700 Subject: atomic: use This allows us to move duplicated code in (atomic_inc_not_zero() for now) to Signed-off-by: Arun Sharma Reviewed-by: Eric Dumazet Cc: Ingo Molnar Cc: David Miller Cc: Eric Dumazet Acked-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace.h | 2 +- kernel/trace/trace_mmiotrace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3f381d0..616846b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2,7 +2,7 @@ #define _LINUX_KERNEL_TRACE_H #include -#include +#include #include #include #include diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 017fa37..fd3c8aa 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include "trace.h" #include "trace_output.h" -- cgit v1.1