From 9b33fa6ba0e2f90fdf407501db801c2511121564 Mon Sep 17 00:00:00 2001 From: "eranian@google.com" Date: Wed, 10 Mar 2010 22:26:05 -0800 Subject: perf_events: Improve task_sched_in() This patch is an optimization in perf_event_task_sched_in() to avoid scheduling the events twice in a row. Without it, the perf_disable()/perf_enable() pair is invoked twice, thereby pinned events counts while scheduling flexible events and we go throuh hw_perf_enable() twice. By encapsulating, the whole sequence into perf_disable()/perf_enable() we ensure, hw_perf_enable() is going to be invoked only once because of the refcount protection. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1268288765-5326-1-git-send-email-eranian@google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 52c69a3..3853d49 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1368,6 +1368,8 @@ void perf_event_task_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) return; + perf_disable(); + /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -1380,6 +1382,8 @@ void perf_event_task_sched_in(struct task_struct *task) ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); cpuctx->task_ctx = ctx; + + perf_enable(); } #define MAX_INTERRUPTS (~0ULL) -- cgit v1.1 From 1d199b1ad606ae8b88acebd295b101c4e1cf2a57 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 16 Mar 2010 01:05:02 +0100 Subject: perf: Fix unexported generic perf_arch_fetch_caller_regs perf_arch_fetch_caller_regs() is exported for the overriden x86 version, but not for the generic weak version. As a general rule, weak functions should not have their symbol exported in the same file they are defined. So let's export it on trace_event_perf.c as it is used by trace events only. This fixes: ERROR: ".perf_arch_fetch_caller_regs" [fs/xfs/xfs.ko] undefined! ERROR: ".perf_arch_fetch_caller_regs" [arch/powerpc/platforms/cell/spufs/spufs.ko] undefined! -v2: And also only build it if trace events are enabled. -v3: Fix changelog mistake Reported-by: Stephen Rothwell Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Paul Mackerras LKML-Reference: <1268697902-9518-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 ++ kernel/trace/trace_event_perf.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8bf6127..455393e 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2790,10 +2790,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } +#ifdef CONFIG_EVENT_TRACING __weak void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { } +#endif /* * Output diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0709e4f..7d79a10 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -12,6 +12,8 @@ DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs); +EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); + static char *perf_trace_buf; static char *perf_trace_buf_nmi; -- cgit v1.1 From faa4602e47690fb11221e00f9b9697c8dc0d4b19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Mar 2010 14:51:50 +0100 Subject: x86, perf, bts, mm: Delete the never used BTS-ptrace code Support for the PMU's BTS features has been upstreamed in v2.6.32, but we still have the old and disabled ptrace-BTS, as Linus noticed it not so long ago. It's buggy: TIF_DEBUGCTLMSR is trampling all over that MSR without regard for other uses (perf) and doesn't provide the flexibility needed for perf either. Its users are ptrace-block-step and ptrace-bts, since ptrace-bts was never used and ptrace-block-step can be implemented using a much simpler approach. So axe all 3000 lines of it. That includes the *locked_memory*() APIs in mm/mlock.c as well. Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra Cc: Roland McGrath Cc: Oleg Nesterov Cc: Markus Metzger Cc: Steven Rostedt Cc: Andrew Morton LKML-Reference: <20100325135413.938004390@chello.nl> Signed-off-by: Ingo Molnar --- kernel/fork.c | 3 - kernel/ptrace.c | 1 - kernel/sched.c | 43 ------ kernel/trace/Kconfig | 11 -- kernel/trace/Makefile | 1 - kernel/trace/trace.h | 4 - kernel/trace/trace_entries.h | 12 -- kernel/trace/trace_hw_branches.c | 312 --------------------------------------- kernel/trace/trace_selftest.c | 57 ------- 9 files changed, 444 deletions(-) delete mode 100644 kernel/trace/trace_hw_branches.c (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4799c5f..d67f1db 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1108,9 +1108,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif - - p->bts = NULL; - p->stack_start = stack_start; /* Perform scheduler related setup. Assign this task to a CPU. */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 42ad8ae..9fb5123 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -76,7 +76,6 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); - arch_ptrace_untrace(child); if (task_is_traced(child)) ptrace_untrace(child); } diff --git a/kernel/sched.c b/kernel/sched.c index 9ab3cd7..117b7ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2077,49 +2077,6 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) } /* - * wait_task_context_switch - wait for a thread to complete at least one - * context switch. - * - * @p must not be current. - */ -void wait_task_context_switch(struct task_struct *p) -{ - unsigned long nvcsw, nivcsw, flags; - int running; - struct rq *rq; - - nvcsw = p->nvcsw; - nivcsw = p->nivcsw; - for (;;) { - /* - * The runqueue is assigned before the actual context - * switch. We need to take the runqueue lock. - * - * We could check initially without the lock but it is - * very likely that we need to take the lock in every - * iteration. - */ - rq = task_rq_lock(p, &flags); - running = task_running(rq, p); - task_rq_unlock(rq, &flags); - - if (likely(!running)) - break; - /* - * The switch count is incremented before the actual - * context switch. We thus wait for two switches to be - * sure at least one completed. - */ - if ((p->nvcsw - nvcsw) > 1) - break; - if ((p->nivcsw - nivcsw) > 1) - break; - - cpu_relax(); - } -} - -/* * wait_task_inactive - wait for a thread to unschedule. * * If @match_state is nonzero, it's the @p->state value just checked and diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 13e13d4..8b1797c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD help See Documentation/trace/ftrace-design.txt -config HAVE_HW_BRANCH_TRACER - bool - config HAVE_SYSCALL_TRACEPOINTS bool help @@ -374,14 +371,6 @@ config STACK_TRACER Say N if unsure. -config HW_BRANCH_TRACER - depends on HAVE_HW_BRANCH_TRACER - bool "Trace hw branches" - select GENERIC_TRACER - help - This tracer records all branches on the system in a circular - buffer, giving access to the last N branches for each cpu. - config KMEMTRACE bool "Trace SLAB allocations" select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 78edc64..ffb1a5b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2825ef2..bec2c97 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -34,7 +34,6 @@ enum trace_type { TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_HW_BRANCHES, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_BLK, @@ -229,7 +228,6 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ @@ -467,8 +465,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr); extern int trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c16a08f..dc008c1 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch, __entry->func, __entry->file, __entry->correct) ); -FTRACE_ENTRY(hw_branch, hw_branch_entry, - - TRACE_HW_BRANCHES, - - F_STRUCT( - __field( u64, from ) - __field( u64, to ) - ), - - F_printk("from: %llx to: %llx", __entry->from, __entry->to) -); - FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, TRACE_KMEM_ALLOC, diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c deleted file mode 100644 index 7b97000..0000000 --- a/kernel/trace/trace_hw_branches.c +++ /dev/null @@ -1,312 +0,0 @@ -/* - * h/w branch tracer for x86 based on BTS - * - * Copyright (C) 2008-2009 Intel Corporation. - * Markus Metzger , 2008-2009 - */ -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "trace_output.h" -#include "trace.h" - - -#define BTS_BUFFER_SIZE (1 << 13) - -static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer); -static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer); - -#define this_tracer per_cpu(hwb_tracer, smp_processor_id()) - -static int trace_hw_branches_enabled __read_mostly; -static int trace_hw_branches_suspended __read_mostly; -static struct trace_array *hw_branch_trace __read_mostly; - - -static void bts_trace_init_cpu(int cpu) -{ - per_cpu(hwb_tracer, cpu) = - ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu), - BTS_BUFFER_SIZE, NULL, (size_t)-1, - BTS_KERNEL); - - if (IS_ERR(per_cpu(hwb_tracer, cpu))) - per_cpu(hwb_tracer, cpu) = NULL; -} - -static int bts_trace_init(struct trace_array *tr) -{ - int cpu; - - hw_branch_trace = tr; - trace_hw_branches_enabled = 0; - - get_online_cpus(); - for_each_online_cpu(cpu) { - bts_trace_init_cpu(cpu); - - if (likely(per_cpu(hwb_tracer, cpu))) - trace_hw_branches_enabled = 1; - } - trace_hw_branches_suspended = 0; - put_online_cpus(); - - /* If we could not enable tracing on a single cpu, we fail. */ - return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP; -} - -static void bts_trace_reset(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - if (likely(per_cpu(hwb_tracer, cpu))) { - ds_release_bts(per_cpu(hwb_tracer, cpu)); - per_cpu(hwb_tracer, cpu) = NULL; - } - } - trace_hw_branches_enabled = 0; - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_start(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_resume_bts(per_cpu(hwb_tracer, cpu)); - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_stop(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - trace_hw_branches_suspended = 1; - put_online_cpus(); -} - -static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - /* The notification is sent with interrupts enabled. */ - if (trace_hw_branches_enabled) { - bts_trace_init_cpu(cpu); - - if (trace_hw_branches_suspended && - likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - } - break; - - case CPU_DOWN_PREPARE: - /* The notification is sent with interrupts enabled. */ - if (likely(per_cpu(hwb_tracer, cpu))) { - ds_release_bts(per_cpu(hwb_tracer, cpu)); - per_cpu(hwb_tracer, cpu) = NULL; - } - } - - return NOTIFY_DONE; -} - -static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { - .notifier_call = bts_hotcpu_handler -}; - -static void bts_trace_print_header(struct seq_file *m) -{ - seq_puts(m, "# CPU# TO <- FROM\n"); -} - -static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) -{ - unsigned long symflags = TRACE_ITER_SYM_OFFSET; - struct trace_entry *entry = iter->ent; - struct trace_seq *seq = &iter->seq; - struct hw_branch_entry *it; - - trace_assign_type(it, entry); - - if (entry->type == TRACE_HW_BRANCHES) { - if (trace_seq_printf(seq, "%4d ", iter->cpu) && - seq_print_ip_sym(seq, it->to, symflags) && - trace_seq_printf(seq, "\t <- ") && - seq_print_ip_sym(seq, it->from, symflags) && - trace_seq_printf(seq, "\n")) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_UNHANDLED; -} - -void trace_hw_branch(u64 from, u64 to) -{ - struct ftrace_event_call *call = &event_hw_branch; - struct trace_array *tr = hw_branch_trace; - struct ring_buffer_event *event; - struct ring_buffer *buf; - struct hw_branch_entry *entry; - unsigned long irq1; - int cpu; - - if (unlikely(!tr)) - return; - - if (unlikely(!trace_hw_branches_enabled)) - return; - - local_irq_save(irq1); - cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) - goto out; - - buf = tr->buffer; - event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, from); - entry->ent.type = TRACE_HW_BRANCHES; - entry->from = from; - entry->to = to; - if (!filter_check_discard(call, entry, buf, event)) - trace_buffer_unlock_commit(buf, event, 0, 0); - - out: - atomic_dec(&tr->data[cpu]->disabled); - local_irq_restore(irq1); -} - -static void trace_bts_at(const struct bts_trace *trace, void *at) -{ - struct bts_struct bts; - int err = 0; - - WARN_ON_ONCE(!trace->read); - if (!trace->read) - return; - - err = trace->read(this_tracer, at, &bts); - if (err < 0) - return; - - switch (bts.qualifier) { - case BTS_BRANCH: - trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); - break; - } -} - -/* - * Collect the trace on the current cpu and write it into the ftrace buffer. - * - * pre: tracing must be suspended on the current cpu - */ -static void trace_bts_cpu(void *arg) -{ - struct trace_array *tr = (struct trace_array *)arg; - const struct bts_trace *trace; - unsigned char *at; - - if (unlikely(!tr)) - return; - - if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) - return; - - if (unlikely(!this_tracer)) - return; - - trace = ds_read_bts(this_tracer); - if (!trace) - return; - - for (at = trace->ds.top; (void *)at < trace->ds.end; - at += trace->ds.size) - trace_bts_at(trace, at); - - for (at = trace->ds.begin; (void *)at < trace->ds.top; - at += trace->ds.size) - trace_bts_at(trace, at); -} - -static void trace_bts_prepare(struct trace_iterator *iter) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - /* - * We need to collect the trace on the respective cpu since ftrace - * implicitly adds the record for the current cpu. - * Once that is more flexible, we could collect the data from any cpu. - */ - on_each_cpu(trace_bts_cpu, iter->tr, 1); - - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_resume_bts(per_cpu(hwb_tracer, cpu)); - put_online_cpus(); -} - -static void trace_bts_close(struct trace_iterator *iter) -{ - tracing_reset_online_cpus(iter->tr); -} - -void trace_hw_branch_oops(void) -{ - if (this_tracer) { - ds_suspend_bts_noirq(this_tracer); - trace_bts_cpu(hw_branch_trace); - ds_resume_bts_noirq(this_tracer); - } -} - -struct tracer bts_tracer __read_mostly = -{ - .name = "hw-branch-tracer", - .init = bts_trace_init, - .reset = bts_trace_reset, - .print_header = bts_trace_print_header, - .print_line = bts_trace_print_line, - .start = bts_trace_start, - .stop = bts_trace_stop, - .open = trace_bts_prepare, - .close = trace_bts_close, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_hw_branches, -#endif /* CONFIG_FTRACE_SELFTEST */ -}; - -__init static int init_bts_trace(void) -{ - register_hotcpu_notifier(&bts_hotcpu_notifier); - return register_tracer(&bts_tracer); -} -device_initcall(init_bts_trace); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 280fea4..a7084e7 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -16,7 +16,6 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: - case TRACE_HW_BRANCHES: case TRACE_KSYM: return 1; } @@ -754,62 +753,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_BRANCH_TRACER */ -#ifdef CONFIG_HW_BRANCH_TRACER -int -trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr) -{ - struct trace_iterator *iter; - struct tracer tracer; - unsigned long count; - int ret; - - if (!trace->open) { - printk(KERN_CONT "missing open function..."); - return -1; - } - - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* - * The hw-branch tracer needs to collect the trace from the various - * cpu trace buffers - before tracing is stopped. - */ - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - memcpy(&tracer, trace, sizeof(tracer)); - - iter->trace = &tracer; - iter->tr = tr; - iter->pos = -1; - mutex_init(&iter->mutex); - - trace->open(iter); - - mutex_destroy(&iter->mutex); - kfree(iter); - - tracing_stop(); - - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT "no entries found.."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_HW_BRANCH_TRACER */ - #ifdef CONFIG_KSYM_TRACER static int ksym_selftest_dummy; -- cgit v1.1 From 3326c1ceee234e63160852720d48be8a8f7a6d08 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Mar 2010 19:09:33 +0100 Subject: perf_event: Make perf fd non seekable Perf_event does not need seeking, so prevent it in order to get rid of default_llseek, which uses the BKL. Signed-off-by: Arnd Bergmann Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar [drop the nonseekable_open, not needed for anon inodes] Signed-off-by: Frederic Weisbecker --- kernel/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 63fbce1..4aa50ff 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2645,6 +2645,7 @@ static int perf_fasync(int fd, struct file *filp, int on) } static const struct file_operations perf_fops = { + .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, -- cgit v1.1 From 76e1d9047e4edefb8ada20aa90d5762306082bd6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 5 Apr 2010 15:35:57 +0200 Subject: perf: Store active software events in a hashlist Each time a software event triggers, we need to walk through the entire list of events from the current cpu and task contexts to retrieve a running perf event that matches. We also need to check a matching perf event is actually counting. This walk is wasteful and makes the event fast path scaling down with a growing number of events running on the same contexts. To solve this, we store the running perf events in a hashlist to get an immediate access to them against their type:event_id when they trigger. v2: - Fix SWEVENT_HLIST_SIZE definition (and re-learn some basic maths along the way) - Only allocate hlist for online cpus, but keep track of the refcount on offline possible cpus too, so that we allocate it if needed when it becomes online. - Drop the kref use as it's not adapted to our tricks anymore. v3: - Fix bad refcount check (address instead of value). Thanks to Eric Dumazet who spotted this. - While exiting cpu, move the hlist release out of the IPI path to lock the hlist mutex sanely. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar --- kernel/perf_event.c | 246 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 183 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index fcf42dcd..9efdfe5 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -3966,36 +3967,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, nmi, data, regs); } -static int perf_swevent_is_counting(struct perf_event *event) -{ - /* - * The event is active, we're good! - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - return 1; - - /* - * The event is off/error, not counting. - */ - if (event->state != PERF_EVENT_STATE_INACTIVE) - return 0; - - /* - * The event is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (event->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. - */ - return 1; -} - static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data); @@ -4019,12 +3990,6 @@ static int perf_swevent_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - if (event->cpu != -1 && event->cpu != smp_processor_id()) - return 0; - - if (!perf_swevent_is_counting(event)) - return 0; - if (event->attr.type != type) return 0; @@ -4041,18 +4006,53 @@ static int perf_swevent_match(struct perf_event *event, return 1; } -static void perf_swevent_ctx_event(struct perf_event_context *ctx, - enum perf_type_id type, - u32 event_id, u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline u64 swevent_hash(u64 type, u32 event_id) +{ + u64 val = event_id | (type << 32); + + return hash_64(val, SWEVENT_HLIST_BITS); +} + +static struct hlist_head * +find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id) +{ + u64 hash; + struct swevent_hlist *hlist; + + hash = swevent_hash(type, event_id); + + hlist = rcu_dereference(ctx->swevent_hlist); + if (!hlist) + return NULL; + + return &hlist->heads[hash]; +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { + struct perf_cpu_context *cpuctx; struct perf_event *event; + struct hlist_node *node; + struct hlist_head *head; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + cpuctx = &__get_cpu_var(perf_cpu_context); + + rcu_read_lock(); + + head = find_swevent_head(cpuctx, type, event_id); + + if (!head) + goto end; + + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } +end: + rcu_read_unlock(); } int perf_swevent_get_recursion_context(void) @@ -4090,27 +4090,6 @@ void perf_swevent_put_recursion_context(int rctx) } EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - - cpuctx = &__get_cpu_var(perf_cpu_context); - rcu_read_lock(); - perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, - nr, nmi, data, regs); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); - rcu_read_unlock(); -} void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) @@ -4136,16 +4115,28 @@ static void perf_swevent_read(struct perf_event *event) static int perf_swevent_enable(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct perf_cpu_context *cpuctx; + struct hlist_head *head; + + cpuctx = &__get_cpu_var(perf_cpu_context); if (hwc->sample_period) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } + + head = find_swevent_head(cpuctx, event->attr.type, event->attr.config); + if (WARN_ON_ONCE(!head)) + return -EINVAL; + + hlist_add_head_rcu(&event->hlist_entry, head); + return 0; } static void perf_swevent_disable(struct perf_event *event) { + hlist_del_rcu(&event->hlist_entry); } static const struct pmu perf_ops_generic = { @@ -4359,13 +4350,115 @@ static int perf_tp_event_match(struct perf_event *event, return 0; } +static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) +{ + struct swevent_hlist *hlist; + + hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); + kfree(hlist); +} + +static void swevent_hlist_release(struct perf_cpu_context *cpuctx) +{ + struct swevent_hlist *hlist; + + if (!cpuctx->swevent_hlist) + return; + + hlist = cpuctx->swevent_hlist; + rcu_assign_pointer(cpuctx->swevent_hlist, NULL); + call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); +} + +static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + + mutex_lock(&cpuctx->hlist_mutex); + + if (!--cpuctx->hlist_refcount) + swevent_hlist_release(cpuctx); + + mutex_unlock(&cpuctx->hlist_mutex); +} + +static void swevent_hlist_put(struct perf_event *event) +{ + int cpu; + + if (event->cpu != -1) { + swevent_hlist_put_cpu(event, event->cpu); + return; + } + + for_each_possible_cpu(cpu) + swevent_hlist_put_cpu(event, cpu); +} + +static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + int err = 0; + + mutex_lock(&cpuctx->hlist_mutex); + + if (!cpuctx->swevent_hlist && cpu_online(cpu)) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + if (!hlist) { + err = -ENOMEM; + goto exit; + } + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + cpuctx->hlist_refcount++; + exit: + mutex_unlock(&cpuctx->hlist_mutex); + + return err; +} + +static int swevent_hlist_get(struct perf_event *event) +{ + int err; + int cpu, failed_cpu; + + if (event->cpu != -1) + return swevent_hlist_get_cpu(event, event->cpu); + + get_online_cpus(); + for_each_possible_cpu(cpu) { + err = swevent_hlist_get_cpu(event, cpu); + if (err) { + failed_cpu = cpu; + goto fail; + } + } + put_online_cpus(); + + return 0; + fail: + for_each_possible_cpu(cpu) { + if (cpu == failed_cpu) + break; + swevent_hlist_put_cpu(event, cpu); + } + + put_online_cpus(); + return err; +} + static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_disable(event->attr.config); + swevent_hlist_put(event); } static const struct pmu *tp_perf_event_init(struct perf_event *event) { + int err; + /* * Raw tracepoint data is a severe data leak, only allow root to * have these. @@ -4379,6 +4472,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) return NULL; event->destroy = tp_perf_event_destroy; + err = swevent_hlist_get(event); + if (err) { + perf_trace_disable(event->attr.config); + return ERR_PTR(err); + } return &perf_ops_generic; } @@ -4479,6 +4577,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); atomic_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); } static const struct pmu *sw_perf_event_init(struct perf_event *event) @@ -4517,6 +4616,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) case PERF_COUNT_SW_ALIGNMENT_FAULTS: case PERF_COUNT_SW_EMULATION_FAULTS: if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return ERR_PTR(err); + atomic_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -5389,6 +5494,7 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); + mutex_init(&cpuctx->hlist_mutex); __perf_event_init_context(&cpuctx->ctx, NULL); } } @@ -5402,6 +5508,16 @@ static void __cpuinit perf_event_init_cpu(int cpu) spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; spin_unlock(&perf_resource_lock); + + mutex_lock(&cpuctx->hlist_mutex); + if (cpuctx->hlist_refcount > 0) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + WARN_ON_ONCE(!hlist); + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + mutex_unlock(&cpuctx->hlist_mutex); } #ifdef CONFIG_HOTPLUG_CPU @@ -5421,6 +5537,10 @@ static void perf_event_exit_cpu(int cpu) struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_event_context *ctx = &cpuctx->ctx; + mutex_lock(&cpuctx->hlist_mutex); + swevent_hlist_release(cpuctx); + mutex_unlock(&cpuctx->hlist_mutex); + mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); mutex_unlock(&ctx->mutex); -- cgit v1.1 From df8290bf7ea6b3051e2f315579a6e829309ec1ed Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 9 Apr 2010 00:28:14 +0200 Subject: perf: Make clock software events consistent with general exclusion rules The cpu/task clock events implement their own version of exclusion on top of exclude_user and exclude_kernel. The result is that when the event triggered in the kernel but we have exclude_kernel set, we try to rewind using task_pt_regs. There are two side effects of this: - we call task_pt_regs even on kernel threads, which doesn't give us the desired result. - if the event occured in the kernel, we shouldn't rewind to the user context. We want to actually ignore the event. get_irq_regs() will always give us the right interrupted context, so use its result and submit it to perf_exclude_context() that knows when an event must be ignored. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar --- kernel/perf_event.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 9efdfe5..095101d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4164,15 +4164,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) perf_sample_data_init(&data, 0); data.period = event->hw.last_period; regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. - */ - if ((event->attr.exclude_kernel || !regs) && - !event->attr.exclude_user) - regs = task_pt_regs(current); - if (regs) { + if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && current->pid == 0)) if (perf_event_overflow(event, 0, &data, regs)) ret = HRTIMER_NORESTART; -- cgit v1.1 From 93ccae7a2227466a0d071fe52c51319f2f34c365 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 12 Apr 2010 13:17:08 -0400 Subject: tracing/kprobes: Support basic types on dynamic events Support basic types of integer (u8, u16, u32, u64, s8, s16, s32, s64) in kprobe tracer. With this patch, users can specify above basic types on each arguments after ':'. If omitted, the argument type is set as unsigned long (u32 or u64, arch-dependent). e.g. echo 'p account_system_time+0 hardirq_offset=%si:s32' > kprobe_events adds a probe recording hardirq_offset in signed-32bits value on the entry of account_system_time. Cc: Ingo Molnar Cc: Steven Rostedt Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Frederic Weisbecker LKML-Reference: <20100412171708.3790.18599.stgit@localhost6.localdomain6> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace.h | 16 +- kernel/trace/trace_kprobe.c | 535 +++++++++++++++++++++++++++----------------- 2 files changed, 331 insertions(+), 220 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bec2c97..3ebdb6b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -102,29 +102,17 @@ struct syscall_trace_exit { long ret; }; -struct kprobe_trace_entry { +struct kprobe_trace_entry_head { struct trace_entry ent; unsigned long ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - -struct kretprobe_trace_entry { +struct kretprobe_trace_entry_head { struct trace_entry ent; unsigned long func; unsigned long ret_ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kretprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1251e36..a751432 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include "trace.h" #include "trace_output.h" @@ -40,7 +42,6 @@ /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" -#define FIELD_STRING_NARGS "__probe_nargs" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" @@ -52,56 +53,102 @@ const char *reserved_field_names[] = { "common_tgid", "common_lock_depth", FIELD_STRING_IP, - FIELD_STRING_NARGS, FIELD_STRING_RETIP, FIELD_STRING_FUNC, }; -struct fetch_func { - unsigned long (*func)(struct pt_regs *, void *); +/* Printing function type */ +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ +static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ + const char *name, void *data)\ +{ \ + return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ +} \ +static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) + +/* Data fetch function type */ +typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); + +struct fetch_param { + fetch_func_t fn; void *data; }; -static __kprobes unsigned long call_fetch(struct fetch_func *f, - struct pt_regs *regs) +static __kprobes void call_fetch(struct fetch_param *fprm, + struct pt_regs *regs, void *dest) { - return f->func(regs, f->data); + return fprm->fn(regs, fprm->data, dest); } -/* fetch handlers */ -static __kprobes unsigned long fetch_register(struct pt_regs *regs, - void *offset) -{ - return regs_get_register(regs, (unsigned int)((unsigned long)offset)); +#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. + */ +#define DEFINE_BASIC_FETCH_FUNCS(kind) \ +DEFINE_FETCH_##kind(u8) \ +DEFINE_FETCH_##kind(u16) \ +DEFINE_FETCH_##kind(u32) \ +DEFINE_FETCH_##kind(u64) + +#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ + ((FETCH_FUNC_NAME(kind, u8) == fn) || \ + (FETCH_FUNC_NAME(kind, u16) == fn) || \ + (FETCH_FUNC_NAME(kind, u32) == fn) || \ + (FETCH_FUNC_NAME(kind, u64) == fn)) + +/* Data fetch function templates */ +#define DEFINE_FETCH_reg(type) \ +static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_register(regs, \ + (unsigned int)((unsigned long)offset)); \ } - -static __kprobes unsigned long fetch_stack(struct pt_regs *regs, - void *num) -{ - return regs_get_kernel_stack_nth(regs, - (unsigned int)((unsigned long)num)); +DEFINE_BASIC_FETCH_FUNCS(reg) + +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ + (unsigned int)((unsigned long)offset)); \ } +DEFINE_BASIC_FETCH_FUNCS(stack) -static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) -{ - unsigned long retval; - - if (probe_kernel_address(addr, retval)) - return 0; - return retval; +#define DEFINE_FETCH_retval(type) \ +static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ + void *dummy, void *dest) \ +{ \ + *(type *)dest = (type)regs_return_value(regs); \ } - -static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, - void *dummy) -{ - return regs_return_value(regs); -} - -static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, - void *dummy) -{ - return kernel_stack_pointer(regs); +DEFINE_BASIC_FETCH_FUNCS(retval) + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + if (probe_kernel_address(addr, retval)) \ + *(type *)dest = 0; \ + else \ + *(type *)dest = retval; \ } +DEFINE_BASIC_FETCH_FUNCS(memory) /* Memory fetching by symbol */ struct symbol_cache { @@ -145,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) return sc; } -static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) -{ - struct symbol_cache *sc = data; - - if (sc->addr) - return fetch_memory(regs, (void *)sc->addr); - else - return 0; +#define DEFINE_FETCH_symbol(type) \ +static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct symbol_cache *sc = data; \ + if (sc->addr) \ + fetch_memory_##type(regs, (void *)sc->addr, dest); \ + else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(symbol) -/* Special indirect memory access interface */ -struct indirect_fetch_data { - struct fetch_func orig; +/* Dereference memory access function */ +struct deref_fetch_param { + struct fetch_param orig; long offset; }; -static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) -{ - struct indirect_fetch_data *ind = data; - unsigned long addr; - - addr = call_fetch(&ind->orig, regs); - if (addr) { - addr += ind->offset; - return fetch_memory(regs, (void *)addr); - } else - return 0; +#define DEFINE_FETCH_deref(type) \ +static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct deref_fetch_param *dprm = data; \ + unsigned long addr; \ + call_fetch(&dprm->orig, regs, &addr); \ + if (addr) { \ + addr += dprm->offset; \ + fetch_memory_##type(regs, (void *)addr, dest); \ + } else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(deref) -static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { - if (data->orig.func == fetch_indirect) - free_indirect_fetch_data(data->orig.data); - else if (data->orig.func == fetch_symbol) + if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) free_symbol_cache(data->orig.data); kfree(data); } +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define ASSIGN_FETCH_FUNC(kind, type) \ + .kind = FETCH_FUNC_NAME(kind, type) + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + {.name = #ptype, \ + .size = sizeof(ftype), \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ + } + +/* Fetch type information table */ +static const struct fetch_type { + const char *name; /* Name of type */ + size_t size; /* Byte size of type */ + int is_signed; /* Signed flag */ + print_type_func_t print; /* Print functions */ + const char *fmt; /* Fromat string */ + /* Fetch functions */ + fetch_func_t reg; + fetch_func_t stack; + fetch_func_t retval; + fetch_func_t memory; + fetch_func_t symbol; + fetch_func_t deref; +} fetch_type_table[] = { + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), +}; + +static const struct fetch_type *find_fetch_type(const char *type) +{ + int i; + + if (!type) + type = DEFAULT_FETCH_TYPE_STR; + + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) + if (strcmp(type, fetch_type_table[i].name) == 0) + return &fetch_type_table[i]; + return NULL; +} + +/* Special function : only accept unsigned long */ +static __kprobes void fetch_stack_address(struct pt_regs *regs, + void *dummy, void *dest) +{ + *(unsigned long *)dest = kernel_stack_pointer(regs); +} + /** * Kprobe event core functions */ struct probe_arg { - struct fetch_func fetch; - const char *name; + struct fetch_param fetch; + unsigned int offset; /* Offset from argument entry */ + const char *name; /* Name of this argument */ + const char *comm; /* Command of this argument */ + const struct fetch_type *type; /* Type of this argument */ }; /* Flags for trace_probe */ @@ -204,6 +326,7 @@ struct trace_probe { const char *symbol; /* symbol name */ struct ftrace_event_call call; struct trace_event event; + ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; }; @@ -212,6 +335,7 @@ struct trace_probe { (offsetof(struct trace_probe, args) + \ (sizeof(struct probe_arg) * (n))) + static __kprobes int probe_is_return(struct trace_probe *tp) { return tp->rp.handler != NULL; @@ -222,49 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp) return tp->symbol ? tp->symbol : "unknown"; } -static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) -{ - int ret = -EINVAL; - - if (ff->func == fetch_register) { - const char *name; - name = regs_query_register_name((unsigned int)((long)ff->data)); - ret = snprintf(buf, n, "%%%s", name); - } else if (ff->func == fetch_stack) - ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); - else if (ff->func == fetch_memory) - ret = snprintf(buf, n, "@0x%p", ff->data); - else if (ff->func == fetch_symbol) { - struct symbol_cache *sc = ff->data; - if (sc->offset) - ret = snprintf(buf, n, "@%s%+ld", sc->symbol, - sc->offset); - else - ret = snprintf(buf, n, "@%s", sc->symbol); - } else if (ff->func == fetch_retvalue) - ret = snprintf(buf, n, "$retval"); - else if (ff->func == fetch_stack_address) - ret = snprintf(buf, n, "$stack"); - else if (ff->func == fetch_indirect) { - struct indirect_fetch_data *id = ff->data; - size_t l = 0; - ret = snprintf(buf, n, "%+ld(", id->offset); - if (ret >= n) - goto end; - l += ret; - ret = probe_arg_string(buf + l, n - l, &id->orig); - if (ret < 0) - goto end; - l += ret; - ret = snprintf(buf + l, n - l, ")"); - ret += l; - } -end: - if (ret >= n) - return -ENOSPC; - return ret; -} - static int register_probe_event(struct trace_probe *tp); static void unregister_probe_event(struct trace_probe *tp); @@ -347,11 +428,12 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (arg->fetch.func == fetch_symbol) + if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) + free_deref_fetch_param(arg->fetch.data); + else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); - else if (arg->fetch.func == fetch_indirect) - free_indirect_fetch_data(arg->fetch.data); kfree(arg->name); + kfree(arg->comm); } static void free_trace_probe(struct trace_probe *tp) @@ -457,28 +539,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset) #define PARAM_MAX_ARGS 16 #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) -static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_vars(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; if (strcmp(arg, "retval") == 0) { - if (is_return) { - ff->func = fetch_retvalue; - ff->data = NULL; - } else + if (is_return) + f->fn = t->retval; + else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - ff->func = fetch_stack_address; - ff->data = NULL; + if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) + f->fn = fetch_stack_address; + else + ret = -EINVAL; } else if (isdigit(arg[5])) { ret = strict_strtoul(arg + 5, 10, ¶m); if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { - ff->func = fetch_stack; - ff->data = (void *)param; + f->fn = t->stack; + f->data = (void *)param; } } else ret = -EINVAL; @@ -488,7 +572,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) } /* Recursive argument parser */ -static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int __parse_probe_arg(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; @@ -497,13 +582,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, ff, is_return); + ret = parse_probe_vars(arg + 1, t, f, is_return); break; case '%': /* named register */ ret = regs_query_register_offset(arg + 1); if (ret >= 0) { - ff->func = fetch_register; - ff->data = (void *)(unsigned long)ret; + f->fn = t->reg; + f->data = (void *)(unsigned long)ret; ret = 0; } break; @@ -512,26 +597,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) ret = strict_strtoul(arg + 1, 0, ¶m); if (ret) break; - ff->func = fetch_memory; - ff->data = (void *)param; + f->fn = t->memory; + f->data = (void *)param; } else { ret = split_symbol_offset(arg + 1, &offset); if (ret) break; - ff->data = alloc_symbol_cache(arg + 1, offset); - if (ff->data) - ff->func = fetch_symbol; - else - ret = -EINVAL; + f->data = alloc_symbol_cache(arg + 1, offset); + if (f->data) + f->fn = t->symbol; } break; - case '+': /* indirect memory */ + case '+': /* deref memory */ case '-': tmp = strchr(arg, '('); - if (!tmp) { - ret = -EINVAL; + if (!tmp) break; - } *tmp = '\0'; ret = strict_strtol(arg + 1, 0, &offset); if (ret) @@ -541,38 +622,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) arg = tmp + 1; tmp = strrchr(arg, ')'); if (tmp) { - struct indirect_fetch_data *id; + struct deref_fetch_param *dprm; + const struct fetch_type *t2 = find_fetch_type(NULL); *tmp = '\0'; - id = kzalloc(sizeof(struct indirect_fetch_data), - GFP_KERNEL); - if (!id) + dprm = kzalloc(sizeof(struct deref_fetch_param), + GFP_KERNEL); + if (!dprm) return -ENOMEM; - id->offset = offset; - ret = __parse_probe_arg(arg, &id->orig, is_return); + dprm->offset = offset; + ret = __parse_probe_arg(arg, t2, &dprm->orig, + is_return); if (ret) - kfree(id); + kfree(dprm); else { - ff->func = fetch_indirect; - ff->data = (void *)id; + f->fn = t->deref; + f->data = (void *)dprm; } - } else - ret = -EINVAL; + } break; - default: - /* TODO: support custom handler */ - ret = -EINVAL; } + if (!ret && !f->fn) + ret = -EINVAL; return ret; } /* String length checking wrapper */ -static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_arg(char *arg, struct trace_probe *tp, + struct probe_arg *parg, int is_return) { + const char *t; + if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; } - return __parse_probe_arg(arg, ff, is_return); + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) { + pr_info("Failed to allocate memory for command '%s'.\n", arg); + return -ENOMEM; + } + t = strchr(parg->comm, ':'); + if (t) { + arg[t - parg->comm] = '\0'; + t++; + } + parg->type = find_fetch_type(t); + if (!parg->type) { + pr_info("Unsupported type: %s\n", t); + return -EINVAL; + } + parg->offset = tp->size; + tp->size += parg->type->size; + return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); } /* Return 1 if name is reserved or already used by another argument */ @@ -602,15 +703,18 @@ static int create_trace_probe(int argc, char **argv) * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG - * Indirect memory fetch: + * Dereferencing memory fetch: * +|-offs(ARG) : fetch memory at ARG +|- offs address. * Alias name of args: * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. */ struct trace_probe *tp; int i, ret = 0; int is_return = 0, is_delete = 0; - char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; + char *symbol = NULL, *event = NULL, *group = NULL; + char *arg, *tmp; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -723,13 +827,6 @@ static int create_trace_probe(int argc, char **argv) else arg = argv[i]; - if (conflict_field_name(argv[i], tp->args, i)) { - pr_info("Argument%d name '%s' conflicts with " - "another field.\n", i, argv[i]); - ret = -EINVAL; - goto error; - } - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); if (!tp->args[i].name) { pr_info("Failed to allocate argument%d name '%s'.\n", @@ -737,9 +834,19 @@ static int create_trace_probe(int argc, char **argv) ret = -ENOMEM; goto error; } + tmp = strchr(tp->args[i].name, ':'); + if (tmp) + *tmp = '_'; /* convert : to _ */ + + if (conflict_field_name(tp->args[i].name, tp->args, i)) { + pr_info("Argument%d name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } /* Parse fetch argument */ - ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); + ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); if (ret) { pr_info("Parse error at argument%d. (%d)\n", i, ret); kfree(tp->args[i].name); @@ -794,8 +901,7 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { struct trace_probe *tp = v; - int i, ret; - char buf[MAX_ARGSTR_LEN + 1]; + int i; seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); @@ -807,15 +913,10 @@ static int probes_seq_show(struct seq_file *m, void *v) else seq_printf(m, " %s", probe_symbol(tp)); - for (i = 0; i < tp->nr_args; i++) { - ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); - if (ret < 0) { - pr_warning("Argument%d decoding error(%d).\n", i, ret); - return ret; - } - seq_printf(m, " %s=%s", tp->args[i].name, buf); - } + for (i = 0; i < tp->nr_args; i++) + seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); seq_printf(m, "\n"); + return 0; } @@ -945,9 +1046,10 @@ static const struct file_operations kprobe_profile_ops = { static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); - struct kprobe_trace_entry *entry; + struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + u8 *data; int size, i, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -957,7 +1059,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + size = sizeof(*entry) + tp->size; event = trace_current_buffer_lock_reserve(&buffer, call->id, size, irq_flags, pc); @@ -965,10 +1067,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -979,9 +1081,10 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); - struct kretprobe_trace_entry *entry; + struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + u8 *data; int size, i, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -989,7 +1092,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + size = sizeof(*entry) + tp->size; event = trace_current_buffer_lock_reserve(&buffer, call->id, size, irq_flags, pc); @@ -997,11 +1100,11 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1011,13 +1114,14 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, enum print_line_t print_kprobe_event(struct trace_iterator *iter, int flags) { - struct kprobe_trace_entry *field; + struct kprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_event *event; struct trace_probe *tp; + u8 *data; int i; - field = (struct kprobe_trace_entry *)iter->ent; + field = (struct kprobe_trace_entry_head *)iter->ent; event = ftrace_find_event(field->ent.type); tp = container_of(event, struct trace_probe, event); @@ -1030,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1046,13 +1151,14 @@ partial: enum print_line_t print_kretprobe_event(struct trace_iterator *iter, int flags) { - struct kretprobe_trace_entry *field; + struct kretprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_event *event; struct trace_probe *tp; + u8 *data; int i; - field = (struct kretprobe_trace_entry *)iter->ent; + field = (struct kretprobe_trace_entry_head *)iter->ent; event = ftrace_find_event(field->ent.type); tp = container_of(event, struct trace_probe, event); @@ -1071,9 +1177,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1129,29 +1236,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call) static int kprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kprobe_trace_entry field; + struct kprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->name, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kretprobe_trace_entry field; + struct kretprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->name, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } @@ -1176,8 +1297,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", - tp->args[i].name); + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tp->args[i].name, tp->args[i].type->fmt); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); @@ -1219,12 +1340,13 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; - struct kprobe_trace_entry *entry; + struct kprobe_trace_entry_head *entry; + u8 *data; int size, __size, i; unsigned long irq_flags; int rctx; - __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + __size = sizeof(*entry) + tp->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1235,10 +1357,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, if (!entry) return; - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); } @@ -1249,12 +1371,13 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; - struct kretprobe_trace_entry *entry; + struct kretprobe_trace_entry_head *entry; + u8 *data; int size, __size, i; unsigned long irq_flags; int rctx; - __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + __size = sizeof(*entry) + tp->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1265,11 +1388,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, if (!entry) return; - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags, regs); -- cgit v1.1 From 95476b64ab11d528de2557366ec584977c215b9e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Apr 2010 23:42:18 +0200 Subject: perf: Fix hlist related build error hlist helpers need to be available for all software events, not only trace events. Pull them out outside the ifdef CONFIG_EVENT_TRACING section. Fixes: kernel/perf_event.c:4573: error: implicit declaration of function 'swevent_hlist_put' kernel/perf_event.c:4614: error: implicit declaration of function 'swevent_hlist_get' kernel/perf_event.c:5534: error: implicit declaration of function 'swevent_hlist_release Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1271281338-23491-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 60 ++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 095101d..07b7a43 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4313,36 +4313,6 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; -#ifdef CONFIG_EVENT_TRACING - -void perf_tp_event(int event_id, u64 addr, u64 count, void *record, - int entry_size, struct pt_regs *regs) -{ - struct perf_sample_data data; - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - perf_sample_data_init(&data, addr); - data.raw = &raw; - - /* Trace events already protected against recursion */ - do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, - &data, regs); -} -EXPORT_SYMBOL_GPL(perf_tp_event); - -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data) -{ - void *record = data->raw->data; - - if (likely(!event->filter) || filter_match_preds(event->filter, record)) - return 1; - return 0; -} - static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) { struct swevent_hlist *hlist; @@ -4442,6 +4412,36 @@ static int swevent_hlist_get(struct perf_event *event) return err; } +#ifdef CONFIG_EVENT_TRACING + +void perf_tp_event(int event_id, u64 addr, u64 count, void *record, + int entry_size, struct pt_regs *regs) +{ + struct perf_sample_data data; + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + perf_sample_data_init(&data, addr); + data.raw = &raw; + + /* Trace events already protected against recursion */ + do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + &data, regs); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} + static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_disable(event->attr.config); -- cgit v1.1 From 39447b386c846bbf1c56f6403c5282837486200f Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Mon, 19 Apr 2010 13:32:41 +0800 Subject: perf: Enhance perf to allow for guest statistic collection from host Below patch introduces perf_guest_info_callbacks and related register/unregister functions. Add more PERF_RECORD_MISC_XXX bits meaning guest kernel and guest user space. Signed-off-by: Zhang Yanmin Signed-off-by: Avi Kivity --- kernel/perf_event.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 07b7a43..9dbe8cd 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2798,6 +2798,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski /* + * We assume there is only KVM supporting the callbacks. + * Later on, we might change it to a list if there is + * another virtualization implementation supporting the callbacks. + */ +struct perf_guest_info_callbacks *perf_guest_cbs; + +int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = cbs; + return 0; +} +EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); + +int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); + +/* * Output */ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, @@ -3749,7 +3770,7 @@ void __perf_event_mmap(struct vm_area_struct *vma) .event_id = { .header = { .type = PERF_RECORD_MMAP, - .misc = 0, + .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ -- cgit v1.1