From cf6ab6d9143b157786bf29bca5c32e55234bb07d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 15 Dec 2014 20:13:31 -0500 Subject: tracing: Add ref count to tracer for when they are being read by pipe When one of the trace pipe files are being read (by either the trace_pipe or trace_pipe_raw), do not allow the current_trace to change. By adding a ref count that is incremented when the pipe files are opened, will prevent the current_trace from being changed. This will allow for the removal of the global trace_types_lock from reading the pipe buffers (which is currently a bottle neck). Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 16 +++++++++++++++- kernel/trace/trace.h | 1 + 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2e76797..ed3fba1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4140,6 +4140,12 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) goto out; } + /* If trace pipe files are being read, we can't change the tracer */ + if (tr->current_trace->ref) { + ret = -EBUSY; + goto out; + } + trace_branch_disable(); tr->current_trace->enabled--; @@ -4363,6 +4369,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->trace->pipe_open(iter); nonseekable_open(inode, filp); + + tr->current_trace->ref++; out: mutex_unlock(&trace_types_lock); return ret; @@ -4382,6 +4390,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); + tr->current_trace->ref--; + if (iter->trace->pipe_close) iter->trace->pipe_close(iter); @@ -5331,6 +5341,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) filp->private_data = info; + tr->current_trace->ref++; + mutex_unlock(&trace_types_lock); ret = nonseekable_open(inode, filp); @@ -5437,6 +5449,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); + iter->tr->current_trace->ref--; + __trace_array_put(iter->tr); if (info->spare) @@ -6416,7 +6430,7 @@ static int instance_delete(const char *name) goto out_unlock; ret = -EBUSY; - if (tr->ref) + if (tr->ref || (tr->current_trace && tr->current_trace->ref)) goto out_unlock; list_del(&tr->list); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8de48ba..0eddfeb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -388,6 +388,7 @@ struct tracer { struct tracer *next; struct tracer_flags *flags; int enabled; + int ref; bool print_max; bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE -- cgit v1.1 From d716ff71dd12bc6328f84a9ec1c3647daf01c827 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 15 Dec 2014 22:31:07 -0500 Subject: tracing: Remove taking of trace_types_lock in pipe files Taking the global mutex "trace_types_lock" in the trace_pipe files causes a bottle neck as most the pipe files can be read per cpu and there's no reason to serialize them. The current_trace variable was given a ref count and it can not change when the ref count is not zero. Opening the trace_pipe files will up the ref count (and decremented on close), so that the lock no longer needs to be taken when accessing the current_trace variable. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 110 +++++++++++++-------------------------------------- 1 file changed, 28 insertions(+), 82 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ed3fba1..7669b1f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4332,17 +4332,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) } trace_seq_init(&iter->seq); - - /* - * We make a copy of the current tracer to avoid concurrent - * changes on it while we are reading. - */ - iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL); - if (!iter->trace) { - ret = -ENOMEM; - goto fail; - } - *iter->trace = *tr->current_trace; + iter->trace = tr->current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; @@ -4399,7 +4389,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) free_cpumask_var(iter->started); mutex_destroy(&iter->mutex); - kfree(iter->trace); kfree(iter); trace_array_put(tr); @@ -4432,7 +4421,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) return trace_poll(iter, filp, poll_table); } -/* Must be called with trace_types_lock mutex held. */ +/* Must be called with iter->mutex held. */ static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; @@ -4477,7 +4466,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; - struct trace_array *tr = iter->tr; ssize_t sret; /* return any leftover data */ @@ -4487,12 +4475,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, trace_seq_init(&iter->seq); - /* copy the tracer to avoid using a global lock all around */ - mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != tr->current_trace->name)) - *iter->trace = *tr->current_trace; - mutex_unlock(&trace_types_lock); - /* * Avoid more than one consumer on a single file descriptor * This is just a matter of traces coherency, the ring buffer itself @@ -4652,7 +4634,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; - struct trace_array *tr = iter->tr; ssize_t ret; size_t rem; unsigned int i; @@ -4660,12 +4641,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, if (splice_grow_spd(pipe, &spd)) return -ENOMEM; - /* copy the tracer to avoid using a global lock all around */ - mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != tr->current_trace->name)) - *iter->trace = *tr->current_trace; - mutex_unlock(&trace_types_lock); - mutex_lock(&iter->mutex); if (iter->trace->splice_read) { @@ -5373,21 +5348,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (!count) return 0; - mutex_lock(&trace_types_lock); - #ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->tr->current_trace->use_max_tr) { - size = -EBUSY; - goto out_unlock; - } + if (iter->snapshot && iter->tr->current_trace->use_max_tr) + return -EBUSY; #endif if (!info->spare) info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); - size = -ENOMEM; if (!info->spare) - goto out_unlock; + return -ENOMEM; /* Do we have previous read data to read? */ if (info->read < PAGE_SIZE) @@ -5403,21 +5373,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) { if (trace_empty(iter)) { - if ((filp->f_flags & O_NONBLOCK)) { - size = -EAGAIN; - goto out_unlock; - } - mutex_unlock(&trace_types_lock); + if ((filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + ret = wait_on_pipe(iter, false); - mutex_lock(&trace_types_lock); - if (ret) { - size = ret; - goto out_unlock; - } + if (ret) + return ret; + goto again; } - size = 0; - goto out_unlock; + return 0; } info->read = 0; @@ -5427,18 +5392,14 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, size = count; ret = copy_to_user(ubuf, info->spare + info->read, size); - if (ret == size) { - size = -EFAULT; - goto out_unlock; - } + if (ret == size) + return -EFAULT; + size -= ret; *ppos += size; info->read += size; - out_unlock: - mutex_unlock(&trace_types_lock); - return size; } @@ -5536,30 +5497,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int entries, size, i; ssize_t ret = 0; - mutex_lock(&trace_types_lock); - #ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->tr->current_trace->use_max_tr) { - ret = -EBUSY; - goto out; - } + if (iter->snapshot && iter->tr->current_trace->use_max_tr) + return -EBUSY; #endif - if (splice_grow_spd(pipe, &spd)) { - ret = -ENOMEM; - goto out; - } + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; - if (*ppos & (PAGE_SIZE - 1)) { - ret = -EINVAL; - goto out; - } + if (*ppos & (PAGE_SIZE - 1)) + return -EINVAL; if (len & (PAGE_SIZE - 1)) { - if (len < PAGE_SIZE) { - ret = -EINVAL; - goto out; - } + if (len < PAGE_SIZE) + return -EINVAL; len &= PAGE_MASK; } @@ -5620,25 +5571,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { if (ret) - goto out; + return ret; + + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) + return -EAGAIN; - if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { - ret = -EAGAIN; - goto out; - } - mutex_unlock(&trace_types_lock); ret = wait_on_pipe(iter, true); - mutex_lock(&trace_types_lock); if (ret) - goto out; + return ret; goto again; } ret = splice_to_pipe(pipe, &spd); splice_shrink_spd(&spd); -out: - mutex_unlock(&trace_types_lock); return ret; } -- cgit v1.1 From 3efb5f21a36fbddd524cffe36426a84622ce580e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Jan 2015 11:28:28 -0500 Subject: tracing: Remove unneeded includes of debugfs.h and fs.h The creation of tracing files and directories is for the most part encapsulated in helper functions in trace.c. Other files do not need to include debugfs.h or fs.h, as they may have needed to in the past. Remove them from the files that do not need them. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 -- kernel/trace/trace_branch.c | 1 - kernel/trace/trace_export.c | 2 -- kernel/trace/trace_irqsoff.c | 2 -- kernel/trace/trace_nop.c | 2 -- kernel/trace/trace_printk.c | 2 -- kernel/trace/trace_sched_switch.c | 2 -- kernel/trace/trace_sched_wakeup.c | 2 -- kernel/trace/trace_stack.c | 2 -- 9 files changed, 17 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7a4104c..9607918 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include /* for self test */ @@ -23,7 +22,6 @@ #include #include #include -#include #include diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7d6e2af..57cbf1e 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d4ddde2..12e2b99 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -6,12 +6,10 @@ #include #include #include -#include #include #include #include #include -#include #include "trace_output.h" diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 9bb104f..8523ea3 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -10,11 +10,9 @@ * Copyright (C) 2004 Nadia Yvette Chambers */ #include -#include #include #include #include -#include #include "trace.h" diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index fcf0a9e..8bb2071 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -6,8 +6,6 @@ */ #include -#include -#include #include #include "trace.h" diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index c4e70b6..7ee4b5c 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -5,7 +5,6 @@ * */ #include -#include #include #include #include @@ -15,7 +14,6 @@ #include #include #include -#include #include "trace.h" diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 2e293be..419ca37 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -5,8 +5,6 @@ * */ #include -#include -#include #include #include #include diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 8fb84b3..d6e1003 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -10,8 +10,6 @@ * Copyright (C) 2004 Nadia Yvette Chambers */ #include -#include -#include #include #include #include diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 16eddb3..e80927b 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -7,12 +7,10 @@ #include #include #include -#include #include #include #include #include -#include #include -- cgit v1.1 From 14a5ae40f0def33a422a45b2ed09198adb7bf11c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Jan 2015 11:14:16 -0500 Subject: tracing: Use IS_ERR() check for return value of tracing_init_dentry() tracing_init_dentry() will soon return NULL as a valid pointer for the top level tracing directroy. NULL can not be used as an error value. Instead, switch to ERR_PTR() and check the return status with IS_ERR(). Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_printk.c | 2 +- kernel/trace/trace_stack.c | 2 +- kernel/trace/trace_stat.c | 2 +- kernel/trace/trace_uprobe.c | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 929a733d..80c9d34 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5419,7 +5419,7 @@ static __init int ftrace_init_debugfs(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; ftrace_init_dyn_debugfs(d_tracer); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7669b1f..acd2755 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5820,7 +5820,7 @@ struct dentry *tracing_init_dentry_tr(struct trace_array *tr) return tr->dir; if (!debugfs_initialized()) - return NULL; + return ERR_PTR(-ENODEV); if (tr->flags & TRACE_ARRAY_FL_GLOBAL) tr->dir = debugfs_create_dir("tracing", NULL); @@ -5844,7 +5844,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) return tr->percpu_dir; d_tracer = tracing_init_dentry_tr(tr); - if (!d_tracer) + if (IS_ERR(d_tracer)) return NULL; tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); @@ -6047,7 +6047,7 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) return tr->options; d_tracer = tracing_init_dentry_tr(tr); - if (!d_tracer) + if (IS_ERR(d_tracer)) return NULL; tr->options = debugfs_create_dir("options", d_tracer); @@ -6538,7 +6538,7 @@ static __init int tracer_init_debugfs(void) trace_access_lock_init(); d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; init_tracer_debugfs(&global_trace, d_tracer); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 366a78a..4ff8c13 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2490,7 +2490,7 @@ static __init int event_trace_init(void) return -ENODEV; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; entry = debugfs_create_file("available_events", 0444, d_tracer, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ba47600..2d25ad1 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1437,7 +1437,7 @@ static __init int init_graph_debugfs(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; trace_create_file("max_graph_depth", 0644, d_tracer, diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5edb518..b4a00de 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1320,7 +1320,7 @@ static __init int init_kprobe_trace(void) return -EINVAL; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; entry = debugfs_create_file("kprobe_events", 0644, d_tracer, diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 7ee4b5c..36c1455 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -347,7 +347,7 @@ static __init int init_trace_printk_function_export(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; trace_create_file("printk_formats", 0444, d_tracer, diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e80927b..c3e4fcf 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -460,7 +460,7 @@ static __init int stack_trace_init(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; trace_create_file("stack_max_size", 0644, d_tracer, diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 7af6736..75e19e8 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -276,7 +276,7 @@ static int tracing_stat_init(void) struct dentry *d_tracing; d_tracing = tracing_init_dentry(); - if (!d_tracing) + if (IS_ERR(d_tracing)) return 0; stat_dir = debugfs_create_dir("trace_stat", d_tracing); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8520acc..5f0eba9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1321,7 +1321,7 @@ static __init int init_uprobe_trace(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; trace_create_file("uprobe_events", 0644, d_tracer, -- cgit v1.1 From 69a1c994cc540cc84469acf9952f72b899b38e8b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 27 Jan 2015 17:17:20 +0100 Subject: tracing: Remove newline from trace_printk warning banner Remove the output-confusing newline below: [ 0.191328] ********************************************************** [ 0.191493] ** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE ** [ 0.191586] ** ** ... Link: http://lkml.kernel.org/r/1422375440-31970-1-git-send-email-bp@alien8.de Signed-off-by: Borislav Petkov [ added an extra '\n' by itself, to keep what it was suppose to do ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index acd2755..f82e53b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2036,7 +2036,8 @@ void trace_printk_init_buffers(void) /* trace_printk() is for debug use only. Don't use it in production. */ - pr_warning("\n**********************************************************\n"); + pr_warning("\n"); + pr_warning("**********************************************************\n"); pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warning("** **\n"); pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); -- cgit v1.1 From 6ea22486ba46bcb665de36514094d74575cd1330 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Wed, 28 Jan 2015 12:48:53 +0000 Subject: tracing: Add array printing helper If a trace event contains an array, there is currently no standard way to format this for text output. Drivers are currently hacking around this by a) local hacks that use the trace_seq functionailty directly, or b) just not printing that information. For fixed size arrays, formatting of the elements can be open-coded, but this gets cumbersome for arrays of non-trivial size. These approaches result in non-standard content of the event format description delivered to userspace, so userland tools needs to be taught to understand and parse each array printing method individually. This patch implements a __print_array() helper that tracepoint implementations can use instead of reinventing it. A simple C-style syntax is used to delimit the array and its elements {like,this}. So that the helpers can be used with large static arrays as well as dynamic arrays, they take a pointer and element count: they can be used with __get_dynamic_array() for use with dynamic arrays. Link: http://lkml.kernel.org/r/1422449335-8289-2-git-send-email-javi.merino@arm.com Cc: Ingo Molnar Signed-off-by: Dave Martin Signed-off-by: Javi Merino Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 4 ++++ include/trace/ftrace.h | 9 +++++++++ kernel/trace/trace_output.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0bebb5c..5aa4a92 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -44,6 +44,10 @@ const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, const char *ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len); +const char *ftrace_print_array_seq(struct trace_seq *p, + const void *buf, int buf_len, + size_t el_size); + struct trace_iterator; struct trace_event; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 139b506..304901f 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -263,6 +263,14 @@ #undef __print_hex #define __print_hex(buf, buf_len) ftrace_print_hex_seq(p, buf, buf_len) +#undef __print_array +#define __print_array(array, count, el_size) \ + ({ \ + BUILD_BUG_ON(el_size != 1 && el_size != 2 && \ + el_size != 4 && el_size != 8); \ + ftrace_print_array_seq(p, array, count, el_size); \ + }) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace enum print_line_t \ @@ -674,6 +682,7 @@ static inline void ftrace_test_probe_##call(void) \ #undef __get_dynamic_array_len #undef __get_str #undef __get_bitmask +#undef __print_array #undef TP_printk #define TP_printk(fmt, args...) "\"" fmt "\", " __stringify(args) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b77b9a6..692bf71 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -177,6 +177,50 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) } EXPORT_SYMBOL(ftrace_print_hex_seq); +const char * +ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len, + size_t el_size) +{ + const char *ret = trace_seq_buffer_ptr(p); + const char *prefix = ""; + void *ptr = (void *)buf; + + trace_seq_putc(p, '{'); + + while (ptr < buf + buf_len) { + switch (el_size) { + case 1: + trace_seq_printf(p, "%s0x%x", prefix, + *(u8 *)ptr); + break; + case 2: + trace_seq_printf(p, "%s0x%x", prefix, + *(u16 *)ptr); + break; + case 4: + trace_seq_printf(p, "%s0x%x", prefix, + *(u32 *)ptr); + break; + case 8: + trace_seq_printf(p, "%s0x%llx", prefix, + *(u64 *)ptr); + break; + default: + trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size, + *(u8 *)ptr); + el_size = 1; + } + prefix = ","; + ptr += el_size; + } + + trace_seq_putc(p, '}'); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_array_seq); + int ftrace_raw_output_prep(struct trace_iterator *iter, struct trace_event *trace_event) { -- cgit v1.1 From da194930ede6d87945786f72b1c36c7a4ed0f3a6 Mon Sep 17 00:00:00 2001 From: Tina Ruchandani Date: Wed, 28 Jan 2015 19:46:11 +0530 Subject: trace: Use 64-bit timekeeping The ring_buffer_producer uses 'struct timeval' to measure its start and end times. 'struct timeval' on 32-bit systems will have its tv_sec value overflow in year 2038 and beyond. This patch replaces struct timeval with 'ktime_t' which uses 64-bit representation for nanoseconds. Link: http://lkml.kernel.org/r/20150128141611.GA2701@tinar Suggested-by: Arnd Bergmann Suggested-by: Steven Rostedt Signed-off-by: Tina Ruchandani Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 3f9e328..13d945c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include struct rb_page { @@ -17,7 +17,7 @@ struct rb_page { }; /* run time and sleep time in seconds */ -#define RUN_TIME 10 +#define RUN_TIME 10ULL #define SLEEP_TIME 10 /* number of events for writer to wake up the reader */ @@ -212,8 +212,7 @@ static void ring_buffer_consumer(void) static void ring_buffer_producer(void) { - struct timeval start_tv; - struct timeval end_tv; + ktime_t start_time, end_time, timeout; unsigned long long time; unsigned long long entries; unsigned long long overruns; @@ -227,7 +226,8 @@ static void ring_buffer_producer(void) * make the system stall) */ trace_printk("Starting ring buffer hammer\n"); - do_gettimeofday(&start_tv); + start_time = ktime_get(); + timeout = ktime_add_ns(start_time, RUN_TIME * NSEC_PER_SEC); do { struct ring_buffer_event *event; int *entry; @@ -244,7 +244,7 @@ static void ring_buffer_producer(void) ring_buffer_unlock_commit(buffer, event); } } - do_gettimeofday(&end_tv); + end_time = ktime_get(); cnt++; if (consumer && !(cnt % wakeup_interval)) @@ -264,7 +264,7 @@ static void ring_buffer_producer(void) cond_resched(); #endif - } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); + } while (ktime_before(end_time, timeout) && !kill_test); trace_printk("End ring buffer hammer\n"); if (consumer) { @@ -280,9 +280,7 @@ static void ring_buffer_producer(void) wait_for_completion(&read_done); } - time = end_tv.tv_sec - start_tv.tv_sec; - time *= USEC_PER_SEC; - time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); + time = ktime_us_delta(end_time, start_time); entries = ring_buffer_entries(buffer); overruns = ring_buffer_overruns(buffer); -- cgit v1.1 From c602894814adc93589dde028182101818c5f938b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 26 Jan 2015 20:38:39 -0500 Subject: tracing: Make tracing_init_dentry_tr() static tracing_init_dentry_tr() is not used outside of trace.c, it should be static. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f82e53b..2668a0d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5815,7 +5815,7 @@ static __init int register_snapshot_cmd(void) static inline __init int register_snapshot_cmd(void) { return 0; } #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ -struct dentry *tracing_init_dentry_tr(struct trace_array *tr) +static struct dentry *tracing_init_dentry_tr(struct trace_array *tr) { if (tr->dir) return tr->dir; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0eddfeb..dd8205a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -542,7 +542,6 @@ struct dentry *trace_create_file(const char *name, void *data, const struct file_operations *fops); -struct dentry *tracing_init_dentry_tr(struct trace_array *tr); struct dentry *tracing_init_dentry(void); struct ring_buffer_event; -- cgit v1.1 From 7eeafbcab47fe9860e5106286db83d60e3f35644 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 26 Jan 2015 21:00:48 -0500 Subject: tracing: Separate out initializing top level dir from instances The top level trace array is treated a little different than the instances, as it has to deal with more of the general tracing. The tr->dir is the tracing directory, which is an immutable dentry, where as the tr->dir of instances are the dentry that was created, and can be destroyed later. These should have different functions accessing them. As only tracing_init_dentry() deals with the top level array, fold the code for it into that function, and remove the trace_init_dentry_tr() that was also used by the instances to get their directory dentry. Add a tracing_get_dentry() to just get the tracing dir entry for instances as well as the top level array. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2668a0d..5afce60 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5815,28 +5815,11 @@ static __init int register_snapshot_cmd(void) static inline __init int register_snapshot_cmd(void) { return 0; } #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ -static struct dentry *tracing_init_dentry_tr(struct trace_array *tr) +static struct dentry *tracing_get_dentry(struct trace_array *tr) { - if (tr->dir) - return tr->dir; - - if (!debugfs_initialized()) - return ERR_PTR(-ENODEV); - - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) - tr->dir = debugfs_create_dir("tracing", NULL); - - if (!tr->dir) - pr_warn_once("Could not create debugfs directory 'tracing'\n"); - return tr->dir; } -struct dentry *tracing_init_dentry(void) -{ - return tracing_init_dentry_tr(&global_trace); -} - static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) { struct dentry *d_tracer; @@ -5844,7 +5827,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) if (tr->percpu_dir) return tr->percpu_dir; - d_tracer = tracing_init_dentry_tr(tr); + d_tracer = tracing_get_dentry(tr); if (IS_ERR(d_tracer)) return NULL; @@ -6047,7 +6030,7 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) if (tr->options) return tr->options; - d_tracer = tracing_init_dentry_tr(tr); + d_tracer = tracing_get_dentry(tr); if (IS_ERR(d_tracer)) return NULL; @@ -6532,6 +6515,33 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) } +/** + * tracing_init_dentry - initialize top level trace array + * + * This is called when creating files or directories in the tracing + * directory. It is called via fs_initcall() by any of the boot up code + * and expects to return the dentry of the top level tracing directory. + */ +struct dentry *tracing_init_dentry(void) +{ + struct trace_array *tr = &global_trace; + + if (tr->dir) + return tr->dir; + + if (WARN_ON(!debugfs_initialized())) + return ERR_PTR(-ENODEV); + + tr->dir = debugfs_create_dir("tracing", NULL); + + if (!tr->dir) { + pr_warn_once("Could not create debugfs directory 'tracing'\n"); + return ERR_PTR(-ENOMEM); + } + + return tr->dir; +} + static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; @@ -6772,7 +6782,6 @@ __init static int tracer_alloc_buffers(void) int ring_buf_size; int ret = -ENOMEM; - if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; -- cgit v1.1 From 4e20e3a60b57efa1e5c26324ce0260d58be6c81b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 9 Feb 2015 15:27:04 -0500 Subject: tracing: Update the TRACE_EVENT fields available in the sample code The sample code in samples/trace_events/ is extremely out of date and does not show all the new fields that have been added since the sample code was written. As most people are unaware of these new fields, adding sample code and explanations of those fields should help out. Signed-off-by: Steven Rostedt --- samples/trace_events/trace-events-sample.c | 19 +++- samples/trace_events/trace-events-sample.h | 145 ++++++++++++++++++++++++++--- 2 files changed, 150 insertions(+), 14 deletions(-) diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index aabc4e9..16c15c0 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -10,12 +10,29 @@ #define CREATE_TRACE_POINTS #include "trace-events-sample.h" +static const char *random_strings[] = { + "Mother Goose", + "Snoopy", + "Gandalf", + "Frodo", + "One ring to rule them all" +}; static void simple_thread_func(int cnt) { + int array[6]; + int len = cnt % 5; + int i; + set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ); - trace_foo_bar("hello", cnt); + + for (i = 0; i < len; i++) + array[i] = i + 1; + array[i] = 0; + + trace_foo_bar("hello", cnt, array, random_strings[len], + tsk_cpus_allowed(current)); } static int simple_thread(void *arg) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 4764292..dd65f7b 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -1,6 +1,6 @@ /* * If TRACE_SYSTEM is defined, that will be the directory created - * in the ftrace directory under /sys/kernel/debug/tracing/events/ + * in the ftrace directory under /sys/kernel/tracing/events/ * * The define_trace.h below will also look for a file name of * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here. @@ -54,44 +54,163 @@ * Here it is simply "foo, bar". * * struct: This defines the way the data will be stored in the ring buffer. - * There are currently two types of elements. __field and __array. - * a __field is broken up into (type, name). Where type can be any - * primitive type (integer, long or pointer). __field_struct() can - * be any static complex data value (struct, union, but not an array). - * For an array. there are three fields. (type, name, size). The - * type of elements in the array, the name of the field and the size - * of the array. + * The items declared here become part of a special structure + * called "__entry", which can be used in the fast_assign part of the + * TRACE_EVENT macro. + * + * Here are the currently defined types you can use: + * + * __field : Is broken up into type and name. Where type can be any + * primitive type (integer, long or pointer). + * + * __field(int, foo) + * + * __entry->foo = 5; + * + * __field_struct : This can be any static complex data type (struct, union + * but not an array). Be careful using complex types, as each + * event is limited in size, and copying large amounts of data + * into the ring buffer can slow things down. + * + * __field_struct(struct bar, foo) + * + * __entry->bar.x = y; + + * __array: There are three fields (type, name, size). The type is the + * type of elements in teh array, the name is the name of the array. + * size is the number of items in the array (not the total size). + * + * __array( char, foo, 10) is the same as saying: char foo[10]; + * + * Assigning arrays can be done like any array: + * + * __entry->foo[0] = 'a'; + * + * memcpy(__entry->foo, bar, 10); + * + * __dynamic_array: This is similar to array, but can vary is size from + * instance to instance of the tracepoint being called. + * Like __array, this too has three elements (type, name, size); + * type is the type of the element, name is the name of the array. + * The size is different than __array. It is not a static number, + * but the algorithm to figure out the length of the array for the + * specific instance of tracepoint. Again, size is the numebr of + * items in the array, not the total length in bytes. + * + * __dynamic_array( int, foo, bar) is similar to: int foo[bar]; + * + * Note, unlike arrays, you must use the __get_dynamic_array() macro + * to access the array. + * + * memcpy(__get_dynamic_array(foo), bar, 10); + * + * Notice, that "__entry" is not needed here. + * + * __string: This is a special kind of __dynamic_array. It expects to + * have a nul terminated character array passed to it (it allows + * for NULL too, which would be converted into "(null)"). __string + * takes two paramenter (name, src), where name is the name of + * the string saved, and src is the string to copy into the + * ring buffer. + * + * __string(foo, bar) is similar to: strcpy(foo, bar) + * + * To assign a string, use the helper macro __assign_str(). + * + * __assign_str(foo, bar); + * + * In most cases, the __assign_str() macro will take the same + * parameters as the __string() macro had to declare the string. + * + * __bitmask: This is another kind of __dynamic_array, but it expects + * an array of longs, and the number of bits to parse. It takes + * two parameters (name, nr_bits), where name is the name of the + * bitmask to save, and the nr_bits is the number of bits to record. + * + * __bitmask(target_cpu, nr_cpumask_bits) + * + * To assign a bitmask, use the __assign_bitmask() helper macro. + * + * __assign_bitmask(target_cpus, cpumask_bits(bar), nr_cpumask_bits); * - * __array( char, foo, 10) is the same as saying char foo[10]. * * fast_assign: This is a C like function that is used to store the items - * into the ring buffer. + * into the ring buffer. A special variable called "__entry" will be the + * structure that points into the ring buffer and has the same fields as + * described by the struct part of TRACE_EVENT above. * * printk: This is a way to print out the data in pretty print. This is * useful if the system crashes and you are logging via a serial line, * the data can be printed to the console using this "printk" method. + * This is also used to print out the data from the trace files. + * Again, the __entry macro is used to access the data from the ring buffer. + * + * Note, __dynamic_array, __string, and __bitmask require special helpers + * to access the data. + * + * For __dynamic_array(int, foo, bar) use __get_dynamic_array(foo) + * Use __get_dynamic_array_len(foo) to get the length of the array + * saved. + * + * For __string(foo, bar) use __get_str(foo) + * + * For __bitmask(target_cpus, nr_cpumask_bits) use __get_bitmask(target_cpus) + * * * Note, that for both the assign and the printk, __entry is the handler * to the data structure in the ring buffer, and is defined by the * TP_STRUCT__entry. */ + +/* + * It is OK to have helper functions in the file, but they need to be protected + * from being defined more than once. Remember, this file gets included more + * than once. + */ +#ifndef __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS +#define __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS +static inline int __length_of(const int *list) +{ + int i; + + if (!list) + return 0; + + for (i = 0; list[i]; i++) + ; + return i; +} +#endif + TRACE_EVENT(foo_bar, - TP_PROTO(char *foo, int bar), + TP_PROTO(const char *foo, int bar, const int *lst, + const char *string, const struct cpumask *mask), - TP_ARGS(foo, bar), + TP_ARGS(foo, bar, lst, string, mask), TP_STRUCT__entry( __array( char, foo, 10 ) __field( int, bar ) + __dynamic_array(int, list, __length_of(lst)) + __string( str, string ) + __bitmask( cpus, num_possible_cpus() ) ), TP_fast_assign( strlcpy(__entry->foo, foo, 10); __entry->bar = bar; + memcpy(__get_dynamic_array(list), lst, + __length_of(lst) * sizeof(int)); + __assign_str(str, string); + __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus()); ), - TP_printk("foo %s %d", __entry->foo, __entry->bar) + TP_printk("foo %s %d %s %s (%s)", __entry->foo, __entry->bar, + __print_array(__get_dynamic_array(list), + __get_dynamic_array_len(list), + sizeof(int)), + __get_str(str), __get_bitmask(cpus)) ); #endif -- cgit v1.1 From c4c7eb29382c456b9be9858c357a490ae0ccd0f6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 9 Feb 2015 16:05:55 -0500 Subject: tracing: Add TRACE_EVENT_CONDITION sample The sample code lacks an example of TRACE_EVENT_CONDITION, and it has been expressed to me that this feature for TRACE_EVENT is not well known and not used when it could be. Signed-off-by: Steven Rostedt --- samples/trace_events/trace-events-sample.c | 3 ++ samples/trace_events/trace-events-sample.h | 58 ++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index 16c15c0..c396a49 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -31,8 +31,11 @@ static void simple_thread_func(int cnt) array[i] = i + 1; array[i] = 0; + /* Silly tracepoints */ trace_foo_bar("hello", cnt, array, random_strings[len], tsk_cpus_allowed(current)); + + trace_foo_bar_with_cond("Some times print", cnt); } static int simple_thread(void *arg) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index dd65f7b..c323234 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -212,6 +212,64 @@ TRACE_EVENT(foo_bar, sizeof(int)), __get_str(str), __get_bitmask(cpus)) ); + +/* + * There may be a case where a tracepoint should only be called if + * some condition is set. Otherwise the tracepoint should not be called. + * But to do something like: + * + * if (cond) + * trace_foo(); + * + * Would cause a little overhead when tracing is not enabled, and that + * overhead, even if small, is not something we want. As tracepoints + * use static branch (aka jump_labels), where no branch is taken to + * skip the tracepoint when not enabled, and a jmp is placed to jump + * to the tracepoint code when it is enabled, having a if statement + * nullifies that optimization. It would be nice to place that + * condition within the static branch. This is where TRACE_EVENT_CONDITION + * comes in. + * + * TRACE_EVENT_CONDITION() is just like TRACE_EVENT, except it adds another + * parameter just after args. Where TRACE_EVENT has: + * + * TRACE_EVENT(name, proto, args, struct, assign, printk) + * + * the CONDITION version has: + * + * TRACE_EVENT_CONDITION(name, proto, args, cond, struct, assign, printk) + * + * Everything is the same as TRACE_EVENT except for the new cond. Think + * of the cond variable as: + * + * if (cond) + * trace_foo_bar_with_cond(); + * + * Except that the logic for the if branch is placed after the static branch. + * That is, the if statement that processes the condition will not be + * executed unless that traecpoint is enabled. Otherwise it still remains + * a nop. + */ +TRACE_EVENT_CONDITION(foo_bar_with_cond, + + TP_PROTO(const char *foo, int bar), + + TP_ARGS(foo, bar), + + TP_CONDITION(!(bar % 10)), + + TP_STRUCT__entry( + __string( foo, foo ) + __field( int, bar ) + ), + + TP_fast_assign( + __assign_str(foo, foo); + __entry->bar = bar; + ), + + TP_printk("foo %s %d", __get_str(foo), __entry->bar) +); #endif /***** NOTICE! The #if protection ends here. *****/ -- cgit v1.1 From 6adc13f8c096736957444ffa2aa11421b5671aef Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 9 Feb 2015 16:32:19 -0500 Subject: tracing: Add TRACE_EVENT_FN example If a function should be called before a tracepoint is enabled and/or after it is disabled, the TRACE_EVENT_FN() serves this purpose. But it is not well documented. Having it as a sample would help developers to know how to use it. Signed-off-by: Steven Rostedt --- samples/trace_events/trace-events-sample.c | 51 ++++++++++++++++++++++++++++++ samples/trace_events/trace-events-sample.h | 44 ++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index c396a49..39d4484 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -49,6 +49,52 @@ static int simple_thread(void *arg) } static struct task_struct *simple_tsk; +static struct task_struct *simple_tsk_fn; + +static void simple_thread_func_fn(int cnt) +{ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + + /* More silly tracepoints */ + trace_foo_bar_with_fn("Look at me", cnt); +} + +static int simple_thread_fn(void *arg) +{ + int cnt = 0; + + while (!kthread_should_stop()) + simple_thread_func_fn(cnt++); + + return 0; +} + +static DEFINE_MUTEX(thread_mutex); + +void foo_bar_reg(void) +{ + pr_info("Starting thread for foo_bar_fn\n"); + /* + * We shouldn't be able to start a trace when the module is + * unloading (there's other locks to prevent that). But + * for consistency sake, we still take the thread_mutex. + */ + mutex_lock(&thread_mutex); + simple_tsk_fn = kthread_run(simple_thread_fn, NULL, "event-sample-fn"); + mutex_unlock(&thread_mutex); +} + +void foo_bar_unreg(void) +{ + pr_info("Killing thread for foo_bar_fn\n"); + /* protect against module unloading */ + mutex_lock(&thread_mutex); + if (simple_tsk_fn) + kthread_stop(simple_tsk_fn); + simple_tsk_fn = NULL; + mutex_unlock(&thread_mutex); +} static int __init trace_event_init(void) { @@ -62,6 +108,11 @@ static int __init trace_event_init(void) static void __exit trace_event_exit(void) { kthread_stop(simple_tsk); + mutex_lock(&thread_mutex); + if (simple_tsk_fn) + kthread_stop(simple_tsk_fn); + simple_tsk_fn = NULL; + mutex_unlock(&thread_mutex); } module_init(trace_event_init); diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index c323234..d0be841 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -270,6 +270,50 @@ TRACE_EVENT_CONDITION(foo_bar_with_cond, TP_printk("foo %s %d", __get_str(foo), __entry->bar) ); + +void foo_bar_reg(void); +void foo_bar_unreg(void); + +/* + * Now in the case that some function needs to be called when the + * tracepoint is enabled and/or when it is disabled, the + * TRACE_EVENT_FN() serves this purpose. This is just like TRACE_EVENT() + * but adds two more parameters at the end: + * + * TRACE_EVENT_FN( name, proto, args, struct, assign, printk, reg, unreg) + * + * reg and unreg are functions with the prototype of: + * + * void reg(void) + * + * The reg function gets called before the tracepoint is enabled, and + * the unreg function gets called after the tracepoint is disabled. + * + * Note, reg and unreg are allowed to be NULL. If you only need to + * call a function before enabling, or after disabling, just set one + * function and pass in NULL for the other parameter. + */ +TRACE_EVENT_FN(foo_bar_with_fn, + + TP_PROTO(const char *foo, int bar), + + TP_ARGS(foo, bar), + + TP_STRUCT__entry( + __string( foo, foo ) + __field( int, bar ) + ), + + TP_fast_assign( + __assign_str(foo, foo); + __entry->bar = bar; + ), + + TP_printk("foo %s %d", __get_str(foo), __entry->bar), + + foo_bar_reg, foo_bar_unreg +); + #endif /***** NOTICE! The #if protection ends here. *****/ -- cgit v1.1 From 7496946a88ab48830f3101c08f8e770cc0902bbb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 9 Feb 2015 17:14:04 -0500 Subject: tracing: Add samples of DECLARE_EVENT_CLASS() and DEFINE_EVENT() Add to samples/trace_events/ the macros DECLARE_EVENT_CLASS() and DEFINE_EVENT() and recommend using them over multiple TRACE_EVENT() macros if the multiple events have the same format. Signed-off-by: Steven Rostedt --- samples/trace_events/trace-events-sample.c | 7 +++ samples/trace_events/trace-events-sample.h | 81 ++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index 39d4484..880a7d1 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -35,7 +35,13 @@ static void simple_thread_func(int cnt) trace_foo_bar("hello", cnt, array, random_strings[len], tsk_cpus_allowed(current)); + trace_foo_with_template_simple("HELLO", cnt); + trace_foo_bar_with_cond("Some times print", cnt); + + trace_foo_with_template_cond("prints other times", cnt); + + trace_foo_with_template_print("I have to be different", cnt); } static int simple_thread(void *arg) @@ -58,6 +64,7 @@ static void simple_thread_func_fn(int cnt) /* More silly tracepoints */ trace_foo_bar_with_fn("Look at me", cnt); + trace_foo_with_template_fn("Look at me too", cnt); } static int simple_thread_fn(void *arg) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index d0be841..a2c8b02 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -314,6 +314,87 @@ TRACE_EVENT_FN(foo_bar_with_fn, foo_bar_reg, foo_bar_unreg ); +/* + * Each TRACE_EVENT macro creates several helper functions to produce + * the code to add the tracepoint, create the files in the trace + * directory, hook it to perf, assign the values and to print out + * the raw data from the ring buffer. To prevent too much bloat, + * if there are more than one tracepoint that uses the same format + * for the proto, args, struct, assign and printk, and only the name + * is different, it is highly recommended to use the DECLARE_EVENT_CLASS + * + * DECLARE_EVENT_CLASS() macro creates most of the functions for the + * tracepoint. Then DEFINE_EVENT() is use to hook a tracepoint to those + * functions. This DEFINE_EVENT() is an instance of the class and can + * be enabled and disabled separately from other events (either TRACE_EVENT + * or other DEFINE_EVENT()s). + * + * Note, TRACE_EVENT() itself is simply defined as: + * + * #define TRACE_EVENT(name, proto, args, tstruct, assign, printk) \ + * DEFINE_EVENT_CLASS(name, proto, args, tstruct, assign, printk); \ + * DEFINE_EVENT(name, name, proto, args) + * + * The DEFINE_EVENT() also can be declared with conditions and reg functions: + * + * DEFINE_EVENT_CONDITION(template, name, proto, args, cond); + * DEFINE_EVENT_FN(template, name, proto, args, reg, unreg); + */ +DECLARE_EVENT_CLASS(foo_template, + + TP_PROTO(const char *foo, int bar), + + TP_ARGS(foo, bar), + + TP_STRUCT__entry( + __string( foo, foo ) + __field( int, bar ) + ), + + TP_fast_assign( + __assign_str(foo, foo); + __entry->bar = bar; + ), + + TP_printk("foo %s %d", __get_str(foo), __entry->bar) +); + +/* + * Here's a better way for the previous samples (except, the first + * exmaple had more fields and could not be used here). + */ +DEFINE_EVENT(foo_template, foo_with_template_simple, + TP_PROTO(const char *foo, int bar), + TP_ARGS(foo, bar)); + +DEFINE_EVENT_CONDITION(foo_template, foo_with_template_cond, + TP_PROTO(const char *foo, int bar), + TP_ARGS(foo, bar), + TP_CONDITION(!(bar % 8))); + + +DEFINE_EVENT_FN(foo_template, foo_with_template_fn, + TP_PROTO(const char *foo, int bar), + TP_ARGS(foo, bar), + foo_bar_reg, foo_bar_unreg); + +/* + * Anytime two events share basically the same values and have + * the same output, use the DECLARE_EVENT_CLASS() and DEFINE_EVENT() + * when ever possible. + */ + +/* + * If the event is similar to the DECLARE_EVENT_CLASS, but you need + * to have a different output, then use DEFINE_EVENT_PRINT() which + * lets you override the TP_printk() of the class. + */ + +DEFINE_EVENT_PRINT(foo_template, foo_with_template_print, + TP_PROTO(const char *foo, int bar), + TP_ARGS(foo, bar), + TP_printk("bar %s %d", __get_str(foo), __entry->bar)); + #endif /***** NOTICE! The #if protection ends here. *****/ -- cgit v1.1 From 7215853e985a4bef1a6c14e00e89dfec84f1e457 Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Wed, 17 Dec 2014 18:50:56 -0800 Subject: tracing: Fix unmapping loop in tracing_mark_write Commit 6edb2a8a385f0cdef51dae37ff23e74d76d8a6ce introduced an array map_pages that contains the addresses returned by kmap_atomic. However, when unmapping those pages, map_pages[0] is unmapped before map_pages[1], breaking the nesting requirement as specified in the documentation for kmap_atomic/kunmap_atomic. This was caught by the highmem debug code present in kunmap_atomic. Fix the loop to do the unmapping properly. Link: http://lkml.kernel.org/r/1418871056-6614-1-git-send-email-markivx@codeaurora.org Cc: stable@vger.kernel.org # 3.5+ Reviewed-by: Stephen Boyd Reported-by: Lime Yang Signed-off-by: Vikram Mulukutla Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5afce60..2078b86 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4928,7 +4928,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, *fpos += written; out_unlock: - for (i = 0; i < nr_pages; i++){ + for (i = nr_pages - 1; i >= 0; i--) { kunmap_atomic(map_page[i]); put_page(pages[i]); } -- cgit v1.1 From 1e0d6714aceb770b04161fbedd7765d0e1fc27bd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 10 Feb 2015 22:14:53 -0500 Subject: ring-buffer: Do not wake up a splice waiter when page is not full When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock). Code was added where if the page is not full, to just sleep again. The problem is, it will get woken up again on the next event. That is, when something is written into the ring buffer, if there is a waiter it will wake it up. The waiter would then check the buffer, see that it still does not have enough data to fill a page and go back to sleep. To make matters worse, when the waiter goes back to sleep, it could cause another event, which would wake it back up again to see it doesn't have enough data and sleep again. This produces a tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!! Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there's a full page of data. It does not wake up the waiter if the page is not yet full. After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!), is something to take seriously. Cc: stable@vger.kernel.org # 3.16+ Cc: Rabin Vincent Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice" Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9607918..5040d44 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -445,7 +445,10 @@ int ring_buffer_print_page_header(struct trace_seq *s) struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; + wait_queue_head_t full_waiters; bool waiters_pending; + bool full_waiters_pending; + bool wakeup_full; }; /* @@ -527,6 +530,10 @@ static void rb_wake_up_waiters(struct irq_work *work) struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); wake_up_all(&rbwork->waiters); + if (rbwork->wakeup_full) { + rbwork->wakeup_full = false; + wake_up_all(&rbwork->full_waiters); + } } /** @@ -551,9 +558,11 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) * data in any cpu buffer, or a specific buffer, put the * caller on the appropriate wait queue. */ - if (cpu == RING_BUFFER_ALL_CPUS) + if (cpu == RING_BUFFER_ALL_CPUS) { work = &buffer->irq_work; - else { + /* Full only makes sense on per cpu reads */ + full = false; + } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -ENODEV; cpu_buffer = buffer->buffers[cpu]; @@ -562,7 +571,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) while (true) { - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + if (full) + prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); + else + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); /* * The events can happen in critical sections where @@ -584,7 +596,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) * that is necessary is that the wake up happens after * a task has been queued. It's OK for spurious wake ups. */ - work->waiters_pending = true; + if (full) + work->full_waiters_pending = true; + else + work->waiters_pending = true; if (signal_pending(current)) { ret = -EINTR; @@ -613,7 +628,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) schedule(); } - finish_wait(&work->waiters, &wait); + if (full) + finish_wait(&work->full_waiters, &wait); + else + finish_wait(&work->waiters, &wait); return ret; } @@ -1228,6 +1246,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) init_completion(&cpu_buffer->update_done); init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); + init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -2799,6 +2818,8 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, static __always_inline void rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { + bool pagebusy; + if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ @@ -2810,6 +2831,15 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) /* irq_work_queue() supplies it's own memory barriers */ irq_work_queue(&cpu_buffer->irq_work.work); } + + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } } /** -- cgit v1.1