From 49f474331e563a6ecf3b1e87ec27ec5482b3e4f1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sun, 27 Dec 2009 11:51:52 +0100
Subject: perf events: Remove arg from perf sched hooks

Since we only ever schedule the local cpu, there is no need to pass the
cpu number to the perf sched hooks.

This micro-optimizes things a bit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 27 ++++++++++++++-------------
 kernel/sched.c      |  6 +++---
 2 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'kernel')
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 03cc061..099bd66 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1170,9 +1170,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
  * not restart the event.
  */
 void perf_event_task_sched_out(struct task_struct *task,
-				 struct task_struct *next, int cpu)
+				 struct task_struct *next)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = task->perf_event_ctxp;
 	struct perf_event_context *next_ctx;
 	struct perf_event_context *parent;
@@ -1252,8 +1252,9 @@ static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
 
 static void
 __perf_event_sched_in(struct perf_event_context *ctx,
-			struct perf_cpu_context *cpuctx, int cpu)
+			struct perf_cpu_context *cpuctx)
 {
+	int cpu = smp_processor_id();
 	struct perf_event *event;
 	int can_add_hw = 1;
 
@@ -1326,24 +1327,24 @@ __perf_event_sched_in(struct perf_event_context *ctx,
  * accessing the event control register. If a NMI hits, then it will
  * keep the event running.
  */
-void perf_event_task_sched_in(struct task_struct *task, int cpu)
+void perf_event_task_sched_in(struct task_struct *task)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = task->perf_event_ctxp;
 
 	if (likely(!ctx))
 		return;
 	if (cpuctx->task_ctx == ctx)
 		return;
-	__perf_event_sched_in(ctx, cpuctx, cpu);
+	__perf_event_sched_in(ctx, cpuctx);
 	cpuctx->task_ctx = ctx;
 }
 
-static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	__perf_event_sched_in(ctx, cpuctx, cpu);
+	__perf_event_sched_in(ctx, cpuctx);
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -1461,7 +1462,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
 	raw_spin_unlock(&ctx->lock);
 }
 
-void perf_event_task_tick(struct task_struct *curr, int cpu)
+void perf_event_task_tick(struct task_struct *curr)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
@@ -1469,7 +1470,7 @@ void perf_event_task_tick(struct task_struct *curr, int cpu)
 	if (!atomic_read(&nr_events))
 		return;
 
-	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	cpuctx = &__get_cpu_var(perf_cpu_context);
 	ctx = curr->perf_event_ctxp;
 
 	perf_ctx_adjust_freq(&cpuctx->ctx);
@@ -1484,9 +1485,9 @@ void perf_event_task_tick(struct task_struct *curr, int cpu)
 	if (ctx)
 		rotate_ctx(ctx);
 
-	perf_event_cpu_sched_in(cpuctx, cpu);
+	perf_event_cpu_sched_in(cpuctx);
 	if (ctx)
-		perf_event_task_sched_in(curr, cpu);
+		perf_event_task_sched_in(curr);
 }
 
 /*
@@ -1527,7 +1528,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 
 	raw_spin_unlock(&ctx->lock);
 
-	perf_event_task_sched_in(task, smp_processor_id());
+	perf_event_task_sched_in(task);
  out:
 	local_irq_restore(flags);
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 18cceee..d6527ac 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2752,7 +2752,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
-	perf_event_task_sched_in(current, cpu_of(rq));
+	perf_event_task_sched_in(current);
 	finish_lock_switch(rq, prev);
 
 	fire_sched_in_preempt_notifiers(current);
@@ -5266,7 +5266,7 @@ void scheduler_tick(void)
 	curr->sched_class->task_tick(rq, curr, 0);
 	raw_spin_unlock(&rq->lock);
 
-	perf_event_task_tick(curr, cpu);
+	perf_event_task_tick(curr);
 
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
@@ -5480,7 +5480,7 @@ need_resched_nonpreemptible:
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
-		perf_event_task_sched_out(prev, next, cpu);
+		perf_event_task_sched_out(prev, next);
 
 		rq->nr_switches++;
 		rq->curr = next;
-- 
cgit v1.1


From 07b139c8c81b97bbe55c68daf0cbeca8b1c609ca Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 21 Dec 2009 14:27:35 +0800
Subject: perf events: Remove CONFIG_EVENT_PROFILE

Quoted from Ingo:

| This reminds me - i think we should eliminate CONFIG_EVENT_PROFILE -
| it's an unnecessary Kconfig complication. If both PERF_EVENTS and
| EVENT_TRACING is enabled we should expose generic tracepoints.
|
| Nor is it limited to event 'profiling', so it has become a misnomer as
| well.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <4B2F1557.2050705@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c                |  4 ++--
 kernel/trace/Makefile              |  4 +++-
 kernel/trace/trace_events_filter.c |  4 ++--
 kernel/trace/trace_kprobe.c        | 14 +++++++-------
 kernel/trace/trace_syscalls.c      |  5 ++---
 5 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 099bd66..5b987b4 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4177,7 +4177,7 @@ static const struct pmu perf_ops_task_clock = {
 	.read		= task_clock_perf_event_read,
 };
 
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_EVENT_TRACING
 
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 			  int entry_size)
@@ -4282,7 +4282,7 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
-#endif /* CONFIG_EVENT_PROFILE */
+#endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 static void bp_perf_event_destroy(struct perf_event *event)
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cd9ecd8..d00c6fe 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -51,7 +51,9 @@ endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
-obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
+ifeq ($(CONFIG_PERF_EVENTS),y)
+obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
+endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 50504cb..74563d7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1360,7 +1360,7 @@ out_unlock:
 	return err;
 }
 
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_PERF_EVENTS
 
 void ftrace_profile_free_filter(struct perf_event *event)
 {
@@ -1428,5 +1428,5 @@ out_unlock:
 	return err;
 }
 
-#endif /* CONFIG_EVENT_PROFILE */
+#endif /* CONFIG_PERF_EVENTS */
 
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 375f81a..75d75de 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1249,7 +1249,7 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call,
 					 ", REC->" FIELD_STRING_RETIP);
 }
 
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_PERF_EVENTS
 
 /* Kprobe profile handler */
 static __kprobes int kprobe_profile_func(struct kprobe *kp,
@@ -1407,7 +1407,7 @@ static void probe_profile_disable(struct ftrace_event_call *call)
 			disable_kprobe(&tp->rp.kp);
 	}
 }
-#endif	/* CONFIG_EVENT_PROFILE */
+#endif	/* CONFIG_PERF_EVENTS */
 
 
 static __kprobes
@@ -1417,10 +1417,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 
 	if (tp->flags & TP_FLAG_TRACE)
 		kprobe_trace_func(kp, regs);
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_PERF_EVENTS
 	if (tp->flags & TP_FLAG_PROFILE)
 		kprobe_profile_func(kp, regs);
-#endif	/* CONFIG_EVENT_PROFILE */
+#endif
 	return 0;	/* We don't tweek kernel, so just return 0 */
 }
 
@@ -1431,10 +1431,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
 
 	if (tp->flags & TP_FLAG_TRACE)
 		kretprobe_trace_func(ri, regs);
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_PERF_EVENTS
 	if (tp->flags & TP_FLAG_PROFILE)
 		kretprobe_profile_func(ri, regs);
-#endif	/* CONFIG_EVENT_PROFILE */
+#endif
 	return 0;	/* We don't tweek kernel, so just return 0 */
 }
 
@@ -1463,7 +1463,7 @@ static int register_probe_event(struct trace_probe *tp)
 	call->regfunc = probe_event_enable;
 	call->unregfunc = probe_event_disable;
 
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_PERF_EVENTS
 	call->profile_enable = probe_profile_enable;
 	call->profile_disable = probe_profile_disable;
 #endif
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 75289f3..f694f66 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -421,7 +421,7 @@ int __init init_ftrace_syscalls(void)
 }
 core_initcall(init_ftrace_syscalls);
 
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_PERF_EVENTS
 
 static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
 static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
@@ -626,6 +626,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call)
 	mutex_unlock(&syscall_trace_lock);
 }
 
-#endif
-
+#endif /* CONFIG_PERF_EVENTS */
 
-- 
cgit v1.1


From 14640106f243a3b29944d7198569090fa6546f2d Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 5 Jan 2010 17:46:48 -0500
Subject: tracing/kprobe: Drop function argument access syntax

Drop function argument access syntax, because the function
arguments depend on not only architecture but also
compile-options and function API. And now, we have perf-probe
for finding register/memory assigned to each argument.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Neuling <mikey@neuling.org>
Cc: linuxppc-dev@ozlabs.org
LKML-Reference: <20100105224648.19431.52309.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_kprobe.c | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 47f54ab..7ac728d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -91,11 +91,6 @@ static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
 	return retval;
 }
 
-static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
-{
-	return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
-}
-
 static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
 					      void *dummy)
 {
@@ -231,9 +226,7 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
 {
 	int ret = -EINVAL;
 
-	if (ff->func == fetch_argument)
-		ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
-	else if (ff->func == fetch_register) {
+	if (ff->func == fetch_register) {
 		const char *name;
 		name = regs_query_register_name((unsigned int)((long)ff->data));
 		ret = snprintf(buf, n, "%%%s", name);
@@ -489,14 +482,6 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
 			}
 		} else
 			ret = -EINVAL;
-	} else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
-		ret = strict_strtoul(arg + 3, 10, &param);
-		if (ret || param > PARAM_MAX_ARGS)
-			ret = -EINVAL;
-		else {
-			ff->func = fetch_argument;
-			ff->data = (void *)param;
-		}
 	} else
 		ret = -EINVAL;
 	return ret;
@@ -611,7 +596,6 @@ static int create_trace_probe(int argc, char **argv)
 	 *  - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
 	 *  - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
 	 * Fetch args:
-	 *  $argN	: fetch Nth of function argument. (N:0-)
 	 *  $retval	: fetch return value
 	 *  $stack	: fetch stack address
 	 *  $stackN	: fetch Nth of stack (N:0-)
-- 
cgit v1.1


From 8381f65d097dad90416808314737dd7d3ae38ea9 Mon Sep 17 00:00:00 2001
From: Jamie Iles <jamie.iles@picochip.com>
Date: Fri, 8 Jan 2010 15:27:33 +0000
Subject: sched/perf: Make sure irqs are disabled for
 perf_event_task_sched_in()

perf_event_task_sched_in() expects interrupts to be disabled,
but on architectures with __ARCH_WANT_INTERRUPTS_ON_CTXSW
defined, this isn't true. If this is defined, disable irqs
around the call in finish_task_switch().

Signed-off-by: Jamie Iles <jamie.iles@picochip.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Russell King - ARM Linux <linux@arm.linux.org.uk>
LKML-Reference: <1262964453-27370-1-git-send-email-jamie.iles@picochip.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e507af0..c3ad342 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2783,7 +2783,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_disable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
 	perf_event_task_sched_in(current);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_enable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
 	finish_lock_switch(rq, prev);
 
 	fire_sched_in_preempt_notifiers(current);
-- 
cgit v1.1


From 889ff0150661512d79484219612b7e2e024b6c07 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 9 Jan 2010 20:04:47 +0100
Subject: perf/core: Split context's event group list into pinned and
 non-pinned lists

Split-up struct perf_event_context::group_list into pinned_groups
and flexible_groups (non-pinned).

This first appears to be useless as it duplicates various loops around
the group list handlings.

But it scales better in the fast-path in perf_sched_in(). We don't
anymore iterate twice through the entire list to separate pinned and
non-pinned scheduling. Instead we interate through two distinct lists.

The another desired effect is that it makes easier to define distinct
scheduling rules on both.

Changes in v2:
- Respectively rename pinned_grp_list and
  volatile_grp_list into pinned_groups and flexible_groups as per
  Ingo suggestion.
- Various cleanups

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
---
 kernel/perf_event.c | 227 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 151 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 27f69a04..c9f8a75 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -289,6 +289,15 @@ static void update_event_times(struct perf_event *event)
 	event->total_time_running = run_end - event->tstamp_running;
 }
 
+static struct list_head *
+ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+{
+	if (event->attr.pinned)
+		return &ctx->pinned_groups;
+	else
+		return &ctx->flexible_groups;
+}
+
 /*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -303,9 +312,12 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * add it straight to the context's event list, or to the group
 	 * leader's sibling list:
 	 */
-	if (group_leader == event)
-		list_add_tail(&event->group_entry, &ctx->group_list);
-	else {
+	if (group_leader == event) {
+		struct list_head *list;
+
+		list = ctx_group_list(event, ctx);
+		list_add_tail(&event->group_entry, list);
+	} else {
 		list_add_tail(&event->group_entry, &group_leader->sibling_list);
 		group_leader->nr_siblings++;
 	}
@@ -355,8 +367,10 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * to the context list directly:
 	 */
 	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+		struct list_head *list;
 
-		list_move_tail(&sibling->group_entry, &ctx->group_list);
+		list = ctx_group_list(event, ctx);
+		list_move_tail(&sibling->group_entry, list);
 		sibling->group_leader = sibling;
 	}
 }
@@ -1056,7 +1070,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
 
 	perf_disable();
 	if (ctx->nr_active) {
-		list_for_each_entry(event, &ctx->group_list, group_entry)
+		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
+			group_sched_out(event, cpuctx, ctx);
+
+		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 	}
 	perf_enable();
@@ -1271,9 +1288,8 @@ __perf_event_sched_in(struct perf_event_context *ctx,
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
 	 */
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		if (event->state <= PERF_EVENT_STATE_OFF ||
-		    !event->attr.pinned)
+	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
 		if (event->cpu != -1 && event->cpu != cpu)
 			continue;
@@ -1291,15 +1307,10 @@ __perf_event_sched_in(struct perf_event_context *ctx,
 		}
 	}
 
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		/*
-		 * Ignore events in OFF or ERROR state, and
-		 * ignore pinned events since we did them already.
-		 */
-		if (event->state <= PERF_EVENT_STATE_OFF ||
-		    event->attr.pinned)
+	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+		/* Ignore events in OFF or ERROR state */
+		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
-
 		/*
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of events:
@@ -1453,8 +1464,13 @@ static void rotate_ctx(struct perf_event_context *ctx)
 	 * Rotate the first entry last (works just fine for group events too):
 	 */
 	perf_disable();
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		list_move_tail(&event->group_entry, &ctx->group_list);
+	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+		list_move_tail(&event->group_entry, &ctx->pinned_groups);
+		break;
+	}
+
+	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+		list_move_tail(&event->group_entry, &ctx->flexible_groups);
 		break;
 	}
 	perf_enable();
@@ -1490,6 +1506,21 @@ void perf_event_task_tick(struct task_struct *curr)
 		perf_event_task_sched_in(curr);
 }
 
+static int event_enable_on_exec(struct perf_event *event,
+				struct perf_event_context *ctx)
+{
+	if (!event->attr.enable_on_exec)
+		return 0;
+
+	event->attr.enable_on_exec = 0;
+	if (event->state >= PERF_EVENT_STATE_INACTIVE)
+		return 0;
+
+	__perf_event_mark_enabled(event, ctx);
+
+	return 1;
+}
+
 /*
  * Enable all of a task's events that have been marked enable-on-exec.
  * This expects task == current.
@@ -1500,6 +1531,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 	struct perf_event *event;
 	unsigned long flags;
 	int enabled = 0;
+	int ret;
 
 	local_irq_save(flags);
 	ctx = task->perf_event_ctxp;
@@ -1510,14 +1542,16 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 
 	raw_spin_lock(&ctx->lock);
 
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		if (!event->attr.enable_on_exec)
-			continue;
-		event->attr.enable_on_exec = 0;
-		if (event->state >= PERF_EVENT_STATE_INACTIVE)
-			continue;
-		__perf_event_mark_enabled(event, ctx);
-		enabled = 1;
+	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+		ret = event_enable_on_exec(event, ctx);
+		if (ret)
+			enabled = 1;
+	}
+
+	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+		ret = event_enable_on_exec(event, ctx);
+		if (ret)
+			enabled = 1;
 	}
 
 	/*
@@ -1591,7 +1625,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
 {
 	raw_spin_lock_init(&ctx->lock);
 	mutex_init(&ctx->mutex);
-	INIT_LIST_HEAD(&ctx->group_list);
+	INIT_LIST_HEAD(&ctx->pinned_groups);
+	INIT_LIST_HEAD(&ctx->flexible_groups);
 	INIT_LIST_HEAD(&ctx->event_list);
 	atomic_set(&ctx->refcount, 1);
 	ctx->task = task;
@@ -5032,7 +5067,11 @@ void perf_event_exit_task(struct task_struct *child)
 	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
 
 again:
-	list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
+	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
+				 group_entry)
+		__perf_event_exit_task(child_event, child_ctx, child);
+
+	list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
 				 group_entry)
 		__perf_event_exit_task(child_event, child_ctx, child);
 
@@ -5041,7 +5080,8 @@ again:
 	 * its siblings to the list, but we obtained 'tmp' before that which
 	 * will still point to the list head terminating the iteration.
 	 */
-	if (!list_empty(&child_ctx->group_list))
+	if (!list_empty(&child_ctx->pinned_groups) ||
+	    !list_empty(&child_ctx->flexible_groups))
 		goto again;
 
 	mutex_unlock(&child_ctx->mutex);
@@ -5049,6 +5089,24 @@ again:
 	put_ctx(child_ctx);
 }
 
+static void perf_free_event(struct perf_event *event,
+			    struct perf_event_context *ctx)
+{
+	struct perf_event *parent = event->parent;
+
+	if (WARN_ON_ONCE(!parent))
+		return;
+
+	mutex_lock(&parent->child_mutex);
+	list_del_init(&event->child_list);
+	mutex_unlock(&parent->child_mutex);
+
+	fput(parent->filp);
+
+	list_del_event(event, ctx);
+	free_event(event);
+}
+
 /*
  * free an unexposed, unused context as created by inheritance by
  * init_task below, used by fork() in case of fail.
@@ -5063,36 +5121,70 @@ void perf_event_free_task(struct task_struct *task)
 
 	mutex_lock(&ctx->mutex);
 again:
-	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
-		struct perf_event *parent = event->parent;
+	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
+		perf_free_event(event, ctx);
 
-		if (WARN_ON_ONCE(!parent))
-			continue;
+	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
+				 group_entry)
+		perf_free_event(event, ctx);
 
-		mutex_lock(&parent->child_mutex);
-		list_del_init(&event->child_list);
-		mutex_unlock(&parent->child_mutex);
+	if (!list_empty(&ctx->pinned_groups) ||
+	    !list_empty(&ctx->flexible_groups))
+		goto again;
 
-		fput(parent->filp);
+	mutex_unlock(&ctx->mutex);
 
-		list_del_event(event, ctx);
-		free_event(event);
+	put_ctx(ctx);
+}
+
+static int
+inherit_task_group(struct perf_event *event, struct task_struct *parent,
+		   struct perf_event_context *parent_ctx,
+		   struct task_struct *child,
+		   int *inherited_all)
+{
+	int ret;
+	struct perf_event_context *child_ctx = child->perf_event_ctxp;
+
+	if (!event->attr.inherit) {
+		*inherited_all = 0;
+		return 0;
 	}
 
-	if (!list_empty(&ctx->group_list))
-		goto again;
+	if (!child_ctx) {
+		/*
+		 * This is executed from the parent task context, so
+		 * inherit events that have been marked for cloning.
+		 * First allocate and initialize a context for the
+		 * child.
+		 */
 
-	mutex_unlock(&ctx->mutex);
+		child_ctx = kzalloc(sizeof(struct perf_event_context),
+				    GFP_KERNEL);
+		if (!child_ctx)
+			return -ENOMEM;
 
-	put_ctx(ctx);
+		__perf_event_init_context(child_ctx, child);
+		child->perf_event_ctxp = child_ctx;
+		get_task_struct(child);
+	}
+
+	ret = inherit_group(event, parent, parent_ctx,
+			    child, child_ctx);
+
+	if (ret)
+		*inherited_all = 0;
+
+	return ret;
 }
 
+
 /*
  * Initialize the perf_event context in task_struct
  */
 int perf_event_init_task(struct task_struct *child)
 {
-	struct perf_event_context *child_ctx = NULL, *parent_ctx;
+	struct perf_event_context *child_ctx, *parent_ctx;
 	struct perf_event_context *cloned_ctx;
 	struct perf_event *event;
 	struct task_struct *parent = current;
@@ -5130,41 +5222,22 @@ int perf_event_init_task(struct task_struct *child)
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
 	 */
-	list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
-
-		if (!event->attr.inherit) {
-			inherited_all = 0;
-			continue;
-		}
-
-		if (!child->perf_event_ctxp) {
-			/*
-			 * This is executed from the parent task context, so
-			 * inherit events that have been marked for cloning.
-			 * First allocate and initialize a context for the
-			 * child.
-			 */
-
-			child_ctx = kzalloc(sizeof(struct perf_event_context),
-					    GFP_KERNEL);
-			if (!child_ctx) {
-				ret = -ENOMEM;
-				break;
-			}
-
-			__perf_event_init_context(child_ctx, child);
-			child->perf_event_ctxp = child_ctx;
-			get_task_struct(child);
-		}
+	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+		ret = inherit_task_group(event, parent, parent_ctx, child,
+					 &inherited_all);
+		if (ret)
+			break;
+	}
 
-		ret = inherit_group(event, parent, parent_ctx,
-					     child, child_ctx);
-		if (ret) {
-			inherited_all = 0;
+	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
+		ret = inherit_task_group(event, parent, parent_ctx, child,
+					 &inherited_all);
+		if (ret)
 			break;
-		}
 	}
 
+	child_ctx = child->perf_event_ctxp;
+
 	if (child_ctx && inherited_all) {
 		/*
 		 * Mark the child context as a clone of the parent
@@ -5213,7 +5286,9 @@ static void __perf_event_exit_cpu(void *info)
 	struct perf_event_context *ctx = &cpuctx->ctx;
 	struct perf_event *event, *tmp;
 
-	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
+	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
+		__perf_event_remove_from_context(event);
+	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
 		__perf_event_remove_from_context(event);
 }
 static void perf_event_exit_cpu(int cpu)
-- 
cgit v1.1


From e286417378b4f9ce6e473b556193465ab22e12ab Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 9 Jan 2010 21:05:28 +0100
Subject: perf: Round robin flexible groups of events using list_rotate_left()

This is more proper that doing it through a list_for_each_entry()
that breaks after the first entry.

v2: Don't rotate pinned groups as its not needed to time share
them.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
---
 kernel/perf_event.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index c9f8a75..bbebe28 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1454,25 +1454,16 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
  */
 static void rotate_ctx(struct perf_event_context *ctx)
 {
-	struct perf_event *event;
-
 	if (!ctx->nr_events)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	/*
-	 * Rotate the first entry last (works just fine for group events too):
-	 */
+
+	/* Rotate the first entry last of non-pinned groups */
 	perf_disable();
-	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-		list_move_tail(&event->group_entry, &ctx->pinned_groups);
-		break;
-	}
 
-	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
-		list_move_tail(&event->group_entry, &ctx->flexible_groups);
-		break;
-	}
+	list_rotate_left(&ctx->flexible_groups);
+
 	perf_enable();
 
 	raw_spin_unlock(&ctx->lock);
-- 
cgit v1.1


From d6f962b57bfaab62891c7abbf1469212a56d6103 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 10 Jan 2010 01:25:51 +0100
Subject: perf: Export software-only event group characteristic as a flag

Before scheduling an event group, we first check if a group can go
on. We first check if the group is made of software only events
first, in which case it is enough to know if the group can be
scheduled in.

For that purpose, we iterate through the whole group, which is
wasteful as we could do this check when we add/delete an event to
a group.

So we create a group_flags field in perf event that can host
characteristics from a group of events, starting with a first
PERF_GROUP_SOFTWARE flag that reduces the check on the fast path.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
---
 kernel/perf_event.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index bbebe28..eae6ff6 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -315,9 +315,16 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (group_leader == event) {
 		struct list_head *list;
 
+		if (is_software_event(event))
+			event->group_flags |= PERF_GROUP_SOFTWARE;
+
 		list = ctx_group_list(event, ctx);
 		list_add_tail(&event->group_entry, list);
 	} else {
+		if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
+		    !is_software_event(event))
+			group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+
 		list_add_tail(&event->group_entry, &group_leader->sibling_list);
 		group_leader->nr_siblings++;
 	}
@@ -372,6 +379,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 		list = ctx_group_list(event, ctx);
 		list_move_tail(&sibling->group_entry, list);
 		sibling->group_leader = sibling;
+
+		/* Inherit group flags from the previous leader */
+		sibling->group_flags = event->group_flags;
 	}
 }
 
@@ -700,24 +710,6 @@ group_error:
 }
 
 /*
- * Return 1 for a group consisting entirely of software events,
- * 0 if the group contains any hardware events.
- */
-static int is_software_only_group(struct perf_event *leader)
-{
-	struct perf_event *event;
-
-	if (!is_software_event(leader))
-		return 0;
-
-	list_for_each_entry(event, &leader->sibling_list, group_entry)
-		if (!is_software_event(event))
-			return 0;
-
-	return 1;
-}
-
-/*
  * Work out whether we can put this event group on the CPU now.
  */
 static int group_can_go_on(struct perf_event *event,
@@ -727,7 +719,7 @@ static int group_can_go_on(struct perf_event *event,
 	/*
 	 * Groups consisting entirely of software events can always go on.
 	 */
-	if (is_software_only_group(event))
+	if (event->group_flags & PERF_GROUP_SOFTWARE)
 		return 1;
 	/*
 	 * If an exclusive group is already on, no other hardware
-- 
cgit v1.1


From 231e36f4d2e63dd770db80b9f5113310c2bcfcfd Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 14 Jan 2010 00:12:12 -0500
Subject: tracing/kprobe: Update kprobe tracing self test for new syntax

Update kprobe tracing self test for new syntax (it supports
deleting individual probes, and drops $argN support)
and behavior change (new probes are disabled in default).

This selftest includes the following checks:

 - Adding function-entry probe and return probe with arguments.
 - Enabling these probes.
 - Deleting it individually.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100114051211.7814.29436.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_kprobe.c | 55 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 47 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 7ac728d..d6266ca 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1507,28 +1507,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3,
 
 static __init int kprobe_trace_self_tests_init(void)
 {
-	int ret;
+	int ret, warn = 0;
 	int (*target)(int, int, int, int, int, int);
+	struct trace_probe *tp;
 
 	target = kprobe_trace_selftest_target;
 
 	pr_info("Testing kprobe tracing: ");
 
 	ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
-				  "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
-	if (WARN_ON_ONCE(ret))
-		pr_warning("error enabling function entry\n");
+				  "$stack $stack0 +0($stack)");
+	if (WARN_ON_ONCE(ret)) {
+		pr_warning("error on probing function entry.\n");
+		warn++;
+	} else {
+		/* Enable trace point */
+		tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
+		if (WARN_ON_ONCE(tp == NULL)) {
+			pr_warning("error on getting new probe.\n");
+			warn++;
+		} else
+			probe_event_enable(&tp->call);
+	}
 
 	ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
 				  "$retval");
-	if (WARN_ON_ONCE(ret))
-		pr_warning("error enabling function return\n");
+	if (WARN_ON_ONCE(ret)) {
+		pr_warning("error on probing function return.\n");
+		warn++;
+	} else {
+		/* Enable trace point */
+		tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
+		if (WARN_ON_ONCE(tp == NULL)) {
+			pr_warning("error on getting new probe.\n");
+			warn++;
+		} else
+			probe_event_enable(&tp->call);
+	}
+
+	if (warn)
+		goto end;
 
 	ret = target(1, 2, 3, 4, 5, 6);
 
-	cleanup_all_probes();
+	ret = command_trace_probe("-:testprobe");
+	if (WARN_ON_ONCE(ret)) {
+		pr_warning("error on deleting a probe.\n");
+		warn++;
+	}
+
+	ret = command_trace_probe("-:testprobe2");
+	if (WARN_ON_ONCE(ret)) {
+		pr_warning("error on deleting a probe.\n");
+		warn++;
+	}
 
-	pr_cont("OK\n");
+end:
+	cleanup_all_probes();
+	if (warn)
+		pr_cont("NG: Some tests are failed. Please check them.\n");
+	else
+		pr_cont("OK\n");
 	return 0;
 }
 
-- 
cgit v1.1


From 42cce92f4ddfa41e2dfe26fdcad4887943c032f2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 17 Jan 2010 10:36:08 +0100
Subject: perf: Make __perf_event_sched_out static

__perf_event_sched_out doesn't need to be globally available, make
it static.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
---
 kernel/perf_event.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index eae6ff6..c4e90b8 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1049,8 +1049,8 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
 	return 0;
 }
 
-void __perf_event_sched_out(struct perf_event_context *ctx,
-			      struct perf_cpu_context *cpuctx)
+static void __perf_event_sched_out(struct perf_event_context *ctx,
+				   struct perf_cpu_context *cpuctx)
 {
 	struct perf_event *event;
 
-- 
cgit v1.1


From 5b0311e1f2464547fc6f17a82d7ea2538c8c7a70 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 17 Jan 2010 11:59:13 +0100
Subject: perf: Allow pinned and flexible groups to be scheduled separately

Tune the scheduling helpers so that we can choose to schedule either
pinned and/or flexible groups from a context.

And while at it, refactor a bit the naming of these helpers to make
these more consistent and flexible.

There is no (intended) change in scheduling behaviour in this
patch.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
---
 kernel/perf_event.c | 137 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 93 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index c4e90b8..bfc4ee0 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1049,8 +1049,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
 	return 0;
 }
 
-static void __perf_event_sched_out(struct perf_event_context *ctx,
-				   struct perf_cpu_context *cpuctx)
+enum event_type_t {
+	EVENT_FLEXIBLE = 0x1,
+	EVENT_PINNED = 0x2,
+	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
+static void ctx_sched_out(struct perf_event_context *ctx,
+			  struct perf_cpu_context *cpuctx,
+			  enum event_type_t event_type)
 {
 	struct perf_event *event;
 
@@ -1061,13 +1068,18 @@ static void __perf_event_sched_out(struct perf_event_context *ctx,
 	update_context_time(ctx);
 
 	perf_disable();
-	if (ctx->nr_active) {
+	if (!ctx->nr_active)
+		goto out_enable;
+
+	if (event_type & EVENT_PINNED)
 		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 
+	if (event_type & EVENT_FLEXIBLE)
 		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
-	}
+
+ out_enable:
 	perf_enable();
  out:
 	raw_spin_unlock(&ctx->lock);
@@ -1229,15 +1241,13 @@ void perf_event_task_sched_out(struct task_struct *task,
 	rcu_read_unlock();
 
 	if (do_switch) {
-		__perf_event_sched_out(ctx, cpuctx);
+		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
 		cpuctx->task_ctx = NULL;
 	}
 }
 
-/*
- * Called with IRQs disabled
- */
-static void __perf_event_task_sched_out(struct perf_event_context *ctx)
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+			       enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 
@@ -1247,39 +1257,34 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
 	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
 		return;
 
-	__perf_event_sched_out(ctx, cpuctx);
+	ctx_sched_out(ctx, cpuctx, event_type);
 	cpuctx->task_ctx = NULL;
 }
 
 /*
  * Called with IRQs disabled
  */
-static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
+static void __perf_event_task_sched_out(struct perf_event_context *ctx)
 {
-	__perf_event_sched_out(&cpuctx->ctx, cpuctx);
+	task_ctx_sched_out(ctx, EVENT_ALL);
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+			      enum event_type_t event_type)
+{
+	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
 }
 
 static void
-__perf_event_sched_in(struct perf_event_context *ctx,
-			struct perf_cpu_context *cpuctx)
+ctx_pinned_sched_in(struct perf_event_context *ctx,
+		    struct perf_cpu_context *cpuctx,
+		    int cpu)
 {
-	int cpu = smp_processor_id();
 	struct perf_event *event;
-	int can_add_hw = 1;
-
-	raw_spin_lock(&ctx->lock);
-	ctx->is_active = 1;
-	if (likely(!ctx->nr_events))
-		goto out;
-
-	ctx->timestamp = perf_clock();
-
-	perf_disable();
 
-	/*
-	 * First go through the list and put on any pinned groups
-	 * in order to give them the best chance of going on.
-	 */
 	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
@@ -1298,6 +1303,15 @@ __perf_event_sched_in(struct perf_event_context *ctx,
 			event->state = PERF_EVENT_STATE_ERROR;
 		}
 	}
+}
+
+static void
+ctx_flexible_sched_in(struct perf_event_context *ctx,
+		      struct perf_cpu_context *cpuctx,
+		      int cpu)
+{
+	struct perf_event *event;
+	int can_add_hw = 1;
 
 	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
 		/* Ignore events in OFF or ERROR state */
@@ -1314,11 +1328,53 @@ __perf_event_sched_in(struct perf_event_context *ctx,
 			if (group_sched_in(event, cpuctx, ctx, cpu))
 				can_add_hw = 0;
 	}
+}
+
+static void
+ctx_sched_in(struct perf_event_context *ctx,
+	     struct perf_cpu_context *cpuctx,
+	     enum event_type_t event_type)
+{
+	int cpu = smp_processor_id();
+
+	raw_spin_lock(&ctx->lock);
+	ctx->is_active = 1;
+	if (likely(!ctx->nr_events))
+		goto out;
+
+	ctx->timestamp = perf_clock();
+
+	perf_disable();
+
+	/*
+	 * First go through the list and put on any pinned groups
+	 * in order to give them the best chance of going on.
+	 */
+	if (event_type & EVENT_PINNED)
+		ctx_pinned_sched_in(ctx, cpuctx, cpu);
+
+	/* Then walk through the lower prio flexible groups */
+	if (event_type & EVENT_FLEXIBLE)
+		ctx_flexible_sched_in(ctx, cpuctx, cpu);
+
 	perf_enable();
  out:
 	raw_spin_unlock(&ctx->lock);
 }
 
+static void task_ctx_sched_in(struct task_struct *task,
+			      enum event_type_t event_type)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_event_context *ctx = task->perf_event_ctxp;
+
+	if (likely(!ctx))
+		return;
+	if (cpuctx->task_ctx == ctx)
+		return;
+	ctx_sched_in(ctx, cpuctx, event_type);
+	cpuctx->task_ctx = ctx;
+}
 /*
  * Called from scheduler to add the events of the current task
  * with interrupts disabled.
@@ -1332,22 +1388,15 @@ __perf_event_sched_in(struct perf_event_context *ctx,
  */
 void perf_event_task_sched_in(struct task_struct *task)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_event_context *ctx = task->perf_event_ctxp;
-
-	if (likely(!ctx))
-		return;
-	if (cpuctx->task_ctx == ctx)
-		return;
-	__perf_event_sched_in(ctx, cpuctx);
-	cpuctx->task_ctx = ctx;
+	task_ctx_sched_in(task, EVENT_ALL);
 }
 
-static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx)
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+			     enum event_type_t event_type)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	__perf_event_sched_in(ctx, cpuctx);
+	ctx_sched_in(ctx, cpuctx, event_type);
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -1476,17 +1525,17 @@ void perf_event_task_tick(struct task_struct *curr)
 	if (ctx)
 		perf_ctx_adjust_freq(ctx);
 
-	perf_event_cpu_sched_out(cpuctx);
+	cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 	if (ctx)
-		__perf_event_task_sched_out(ctx);
+		task_ctx_sched_out(ctx, EVENT_ALL);
 
 	rotate_ctx(&cpuctx->ctx);
 	if (ctx)
 		rotate_ctx(ctx);
 
-	perf_event_cpu_sched_in(cpuctx);
+	cpu_ctx_sched_in(cpuctx, EVENT_ALL);
 	if (ctx)
-		perf_event_task_sched_in(curr);
+		task_ctx_sched_in(curr, EVENT_ALL);
 }
 
 static int event_enable_on_exec(struct perf_event *event,
-- 
cgit v1.1


From 7defb0f879bbcfe29e3c6f29d685d4f29b7a0700 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 17 Jan 2010 12:15:31 +0100
Subject: perf: Don't schedule out/in pinned events on task tick

We don't need to schedule in/out pinned events on task tick,
now that pinned and flexible groups can be scheduled separately.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
---
 kernel/perf_event.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index bfc4ee0..a90ae69 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1525,17 +1525,17 @@ void perf_event_task_tick(struct task_struct *curr)
 	if (ctx)
 		perf_ctx_adjust_freq(ctx);
 
-	cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		task_ctx_sched_out(ctx, EVENT_ALL);
+		task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
 
 	rotate_ctx(&cpuctx->ctx);
 	if (ctx)
 		rotate_ctx(ctx);
 
-	cpu_ctx_sched_in(cpuctx, EVENT_ALL);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		task_ctx_sched_in(curr, EVENT_ALL);
+		task_ctx_sched_in(curr, EVENT_FLEXIBLE);
 }
 
 static int event_enable_on_exec(struct perf_event *event,
-- 
cgit v1.1


From 329c0e012b99fa2325a0be205c052e4aba690f16 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 17 Jan 2010 12:56:05 +0100
Subject: perf: Better order flexible and pinned scheduling

When a task gets scheduled in. We don't touch the cpu bound events
so the priority order becomes:

	cpu pinned, cpu flexible, task pinned, task flexible.

So schedule out cpu flexibles when a new task context gets in
and correctly order the groups to schedule in:

	task pinned, cpu flexible, task flexible.

Cpu pinned groups don't need to be touched at this time.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
---
 kernel/perf_event.c | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a90ae69..edc46b9 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1362,6 +1362,14 @@ ctx_sched_in(struct perf_event_context *ctx,
 	raw_spin_unlock(&ctx->lock);
 }
 
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+			     enum event_type_t event_type)
+{
+	struct perf_event_context *ctx = &cpuctx->ctx;
+
+	ctx_sched_in(ctx, cpuctx, event_type);
+}
+
 static void task_ctx_sched_in(struct task_struct *task,
 			      enum event_type_t event_type)
 {
@@ -1388,15 +1396,27 @@ static void task_ctx_sched_in(struct task_struct *task,
  */
 void perf_event_task_sched_in(struct task_struct *task)
 {
-	task_ctx_sched_in(task, EVENT_ALL);
-}
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_event_context *ctx = task->perf_event_ctxp;
 
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type)
-{
-	struct perf_event_context *ctx = &cpuctx->ctx;
+	if (likely(!ctx))
+		return;
 
-	ctx_sched_in(ctx, cpuctx, event_type);
+	if (cpuctx->task_ctx == ctx)
+		return;
+
+	/*
+	 * We want to keep the following priority order:
+	 * cpu pinned (that don't need to move), task pinned,
+	 * cpu flexible, task flexible.
+	 */
+	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
+	ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+
+	cpuctx->task_ctx = ctx;
 }
 
 #define MAX_INTERRUPTS (~0ULL)
-- 
cgit v1.1


From abd50713944c8ea9e0af5b7bffa0aacae21cc91a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 26 Jan 2010 18:50:16 +0100
Subject: perf: Reimplement frequency driven sampling

There was a bug in the old period code that caused intel_pmu_enable_all()
or native_write_msr_safe() to show up quite high in the profiles.

In staring at that code it made my head hurt, so I rewrote it in a
hopefully simpler fashion. Its now fully symetric between tick and
overflow driven adjustments and uses less data to boot.

The only complication is that it basically wants to do a u128 division.
The code approximates that in a rather simple truncate until it fits
fashion, taking care to balance the terms while truncating.

This version does not generate that sampling artefact.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 132 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 92 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index edc46b9..251fb95 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1423,14 +1423,83 @@ void perf_event_task_sched_in(struct task_struct *task)
 
 static void perf_log_throttle(struct perf_event *event, int enable);
 
-static void perf_adjust_period(struct perf_event *event, u64 events)
+static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
+{
+	u64 frequency = event->attr.sample_freq;
+	u64 sec = NSEC_PER_SEC;
+	u64 divisor, dividend;
+
+	int count_fls, nsec_fls, frequency_fls, sec_fls;
+
+	count_fls = fls64(count);
+	nsec_fls = fls64(nsec);
+	frequency_fls = fls64(frequency);
+	sec_fls = 30;
+
+	/*
+	 * We got @count in @nsec, with a target of sample_freq HZ
+	 * the target period becomes:
+	 *
+	 *             @count * 10^9
+	 * period = -------------------
+	 *          @nsec * sample_freq
+	 *
+	 */
+
+	/*
+	 * Reduce accuracy by one bit such that @a and @b converge
+	 * to a similar magnitude.
+	 */
+#define REDUCE_FLS(a, b) 		\
+do {					\
+	if (a##_fls > b##_fls) {	\
+		a >>= 1;		\
+		a##_fls--;		\
+	} else {			\
+		b >>= 1;		\
+		b##_fls--;		\
+	}				\
+} while (0)
+
+	/*
+	 * Reduce accuracy until either term fits in a u64, then proceed with
+	 * the other, so that finally we can do a u64/u64 division.
+	 */
+	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
+		REDUCE_FLS(nsec, frequency);
+		REDUCE_FLS(sec, count);
+	}
+
+	if (count_fls + sec_fls > 64) {
+		divisor = nsec * frequency;
+
+		while (count_fls + sec_fls > 64) {
+			REDUCE_FLS(count, sec);
+			divisor >>= 1;
+		}
+
+		dividend = count * sec;
+	} else {
+		dividend = count * sec;
+
+		while (nsec_fls + frequency_fls > 64) {
+			REDUCE_FLS(nsec, frequency);
+			dividend >>= 1;
+		}
+
+		divisor = nsec * frequency;
+	}
+
+	return div64_u64(dividend, divisor);
+}
+
+static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	u64 period, sample_period;
 	s64 delta;
 
-	events *= hwc->sample_period;
-	period = div64_u64(events, event->attr.sample_freq);
+	period = perf_calculate_period(event, nsec, count);
 
 	delta = (s64)(period - hwc->sample_period);
 	delta = (delta + 7) / 8; /* low pass filter */
@@ -1441,13 +1510,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
 		sample_period = 1;
 
 	hwc->sample_period = sample_period;
+
+	if (atomic64_read(&hwc->period_left) > 8*sample_period) {
+		perf_disable();
+		event->pmu->disable(event);
+		atomic64_set(&hwc->period_left, 0);
+		event->pmu->enable(event);
+		perf_enable();
+	}
 }
 
 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 {
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
-	u64 interrupts, freq;
+	u64 interrupts, now;
+	s64 delta;
 
 	raw_spin_lock(&ctx->lock);
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
@@ -1468,44 +1546,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(event, 1);
 			event->pmu->unthrottle(event);
-			interrupts = 2*sysctl_perf_event_sample_rate/HZ;
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
 			continue;
 
-		/*
-		 * if the specified freq < HZ then we need to skip ticks
-		 */
-		if (event->attr.sample_freq < HZ) {
-			freq = event->attr.sample_freq;
-
-			hwc->freq_count += freq;
-			hwc->freq_interrupts += interrupts;
-
-			if (hwc->freq_count < HZ)
-				continue;
-
-			interrupts = hwc->freq_interrupts;
-			hwc->freq_interrupts = 0;
-			hwc->freq_count -= HZ;
-		} else
-			freq = HZ;
-
-		perf_adjust_period(event, freq * interrupts);
+		event->pmu->read(event);
+		now = atomic64_read(&event->count);
+		delta = now - hwc->freq_count_stamp;
+		hwc->freq_count_stamp = now;
 
-		/*
-		 * In order to avoid being stalled by an (accidental) huge
-		 * sample period, force reset the sample period if we didn't
-		 * get any events in this freq period.
-		 */
-		if (!interrupts) {
-			perf_disable();
-			event->pmu->disable(event);
-			atomic64_set(&hwc->period_left, 0);
-			event->pmu->enable(event);
-			perf_enable();
-		}
+		if (delta > 0)
+			perf_adjust_period(event, TICK_NSEC, delta);
 	}
 	raw_spin_unlock(&ctx->lock);
 }
@@ -3768,12 +3820,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 
 	if (event->attr.freq) {
 		u64 now = perf_clock();
-		s64 delta = now - hwc->freq_stamp;
+		s64 delta = now - hwc->freq_time_stamp;
 
-		hwc->freq_stamp = now;
+		hwc->freq_time_stamp = now;
 
-		if (delta > 0 && delta < TICK_NSEC)
-			perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
+		if (delta > 0 && delta < 2*TICK_NSEC)
+			perf_adjust_period(event, delta, hwc->last_period);
 	}
 
 	/*
-- 
cgit v1.1


From 430ad5a600a83956749307b13257c464c3826b55 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 28 Jan 2010 09:32:29 +0800
Subject: perf: Factorize trace events raw sample buffer operations

Introduce ftrace_perf_buf_prepare() and ftrace_perf_buf_submit() to
gather the common code that operates on raw events sampling buffer.
This cleans up redundant code between regular trace events, syscall
events and kprobe events.

Changelog v1->v2:
- Rename function name as per Masami and Frederic's suggestion
- Add __kprobes for ftrace_perf_buf_prepare() and make
  ftrace_perf_buf_submit() inline as per Masami's suggestion
- Export ftrace_perf_buf_prepare since modules will use it

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4B60E92D.9000808@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_event_profile.c | 52 ++++++++++++++++++++---
 kernel/trace/trace_kprobe.c        | 86 +++++---------------------------------
 kernel/trace/trace_syscalls.c      | 71 +++++--------------------------
 3 files changed, 67 insertions(+), 142 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 9e25573..f0d6930 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -6,14 +6,12 @@
  */
 
 #include <linux/module.h>
+#include <linux/kprobes.h>
 #include "trace.h"
 
 
-char *perf_trace_buf;
-EXPORT_SYMBOL_GPL(perf_trace_buf);
-
-char *perf_trace_buf_nmi;
-EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
+static char *perf_trace_buf;
+static char *perf_trace_buf_nmi;
 
 typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
 
@@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id)
 	}
 	mutex_unlock(&event_mutex);
 }
+
+__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
+					int *rctxp, unsigned long *irq_flags)
+{
+	struct trace_entry *entry;
+	char *trace_buf, *raw_data;
+	int pc, cpu;
+
+	pc = preempt_count();
+
+	/* Protect the per cpu buffer, begin the rcu read side */
+	local_irq_save(*irq_flags);
+
+	*rctxp = perf_swevent_get_recursion_context();
+	if (*rctxp < 0)
+		goto err_recursion;
+
+	cpu = smp_processor_id();
+
+	if (in_nmi())
+		trace_buf = rcu_dereference(perf_trace_buf_nmi);
+	else
+		trace_buf = rcu_dereference(perf_trace_buf);
+
+	if (!trace_buf)
+		goto err;
+
+	raw_data = per_cpu_ptr(trace_buf, cpu);
+
+	/* zero the dead bytes from align to not leak stack to user */
+	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+	entry = (struct trace_entry *)raw_data;
+	tracing_generic_entry_update(entry, *irq_flags, pc);
+	entry->type = type;
+
+	return raw_data;
+err:
+	perf_swevent_put_recursion_context(*rctxp);
+err_recursion:
+	local_irq_restore(*irq_flags);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d6266ca..2e28ee3 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1243,14 +1243,10 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kprobe_trace_entry *entry;
-	struct trace_entry *ent;
-	int size, __size, i, pc, __cpu;
+	int size, __size, i;
 	unsigned long irq_flags;
-	char *trace_buf;
-	char *raw_data;
 	int rctx;
 
-	pc = preempt_count();
 	__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
@@ -1258,45 +1254,16 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 		     "profile buffer not large enough"))
 		return 0;
 
-	/*
-	 * Protect the non nmi buffer
-	 * This also protects the rcu read side
-	 */
-	local_irq_save(irq_flags);
-
-	rctx = perf_swevent_get_recursion_context();
-	if (rctx < 0)
-		goto end_recursion;
-
-	__cpu = smp_processor_id();
-
-	if (in_nmi())
-		trace_buf = rcu_dereference(perf_trace_buf_nmi);
-	else
-		trace_buf = rcu_dereference(perf_trace_buf);
-
-	if (!trace_buf)
-		goto end;
-
-	raw_data = per_cpu_ptr(trace_buf, __cpu);
-
-	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
-	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-	entry = (struct kprobe_trace_entry *)raw_data;
-	ent = &entry->ent;
+	entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+	if (!entry)
+		return 0;
 
-	tracing_generic_entry_update(ent, irq_flags, pc);
-	ent->type = call->id;
 	entry->nargs = tp->nr_args;
 	entry->ip = (unsigned long)kp->addr;
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
-	perf_tp_event(call->id, entry->ip, 1, entry, size);
 
-end:
-	perf_swevent_put_recursion_context(rctx);
-end_recursion:
-	local_irq_restore(irq_flags);
+	ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
 
 	return 0;
 }
@@ -1308,14 +1275,10 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kretprobe_trace_entry *entry;
-	struct trace_entry *ent;
-	int size, __size, i, pc, __cpu;
+	int size, __size, i;
 	unsigned long irq_flags;
-	char *trace_buf;
-	char *raw_data;
 	int rctx;
 
-	pc = preempt_count();
 	__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
@@ -1323,46 +1286,17 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 		     "profile buffer not large enough"))
 		return 0;
 
-	/*
-	 * Protect the non nmi buffer
-	 * This also protects the rcu read side
-	 */
-	local_irq_save(irq_flags);
-
-	rctx = perf_swevent_get_recursion_context();
-	if (rctx < 0)
-		goto end_recursion;
-
-	__cpu = smp_processor_id();
-
-	if (in_nmi())
-		trace_buf = rcu_dereference(perf_trace_buf_nmi);
-	else
-		trace_buf = rcu_dereference(perf_trace_buf);
-
-	if (!trace_buf)
-		goto end;
-
-	raw_data = per_cpu_ptr(trace_buf, __cpu);
-
-	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
-	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-	entry = (struct kretprobe_trace_entry *)raw_data;
-	ent = &entry->ent;
+	entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+	if (!entry)
+		return 0;
 
-	tracing_generic_entry_update(ent, irq_flags, pc);
-	ent->type = call->id;
 	entry->nargs = tp->nr_args;
 	entry->func = (unsigned long)tp->rp.kp.addr;
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
-	perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
 
-end:
-	perf_swevent_put_recursion_context(rctx);
-end_recursion:
-	local_irq_restore(irq_flags);
+	ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
 
 	return 0;
 }
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f694f66..4e332b9 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -433,12 +433,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
 	unsigned long flags;
-	char *trace_buf;
-	char *raw_data;
 	int syscall_nr;
 	int rctx;
 	int size;
-	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 	if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -457,37 +454,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 		      "profile buffer not large enough"))
 		return;
 
-	/* Protect the per cpu buffer, begin the rcu read side */
-	local_irq_save(flags);
-
-	rctx = perf_swevent_get_recursion_context();
-	if (rctx < 0)
-		goto end_recursion;
-
-	cpu = smp_processor_id();
-
-	trace_buf = rcu_dereference(perf_trace_buf);
-
-	if (!trace_buf)
-		goto end;
-
-	raw_data = per_cpu_ptr(trace_buf, cpu);
-
-	/* zero the dead bytes from align to not leak stack to user */
-	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+	rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
+				sys_data->enter_event->id, &rctx, &flags);
+	if (!rec)
+		return;
 
-	rec = (struct syscall_trace_enter *) raw_data;
-	tracing_generic_entry_update(&rec->ent, 0, 0);
-	rec->ent.type = sys_data->enter_event->id;
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
-	perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
-
-end:
-	perf_swevent_put_recursion_context(rctx);
-end_recursion:
-	local_irq_restore(flags);
+	ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
 }
 
 int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -531,11 +506,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	struct syscall_trace_exit *rec;
 	unsigned long flags;
 	int syscall_nr;
-	char *trace_buf;
-	char *raw_data;
 	int rctx;
 	int size;
-	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 	if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -557,38 +529,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 		"exit event has grown above profile buffer size"))
 		return;
 
-	/* Protect the per cpu buffer, begin the rcu read side */
-	local_irq_save(flags);
-
-	rctx = perf_swevent_get_recursion_context();
-	if (rctx < 0)
-		goto end_recursion;
-
-	cpu = smp_processor_id();
-
-	trace_buf = rcu_dereference(perf_trace_buf);
-
-	if (!trace_buf)
-		goto end;
-
-	raw_data = per_cpu_ptr(trace_buf, cpu);
-
-	/* zero the dead bytes from align to not leak stack to user */
-	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-
-	rec = (struct syscall_trace_exit *)raw_data;
+	rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
+				sys_data->exit_event->id, &rctx, &flags);
+	if (!rec)
+		return;
 
-	tracing_generic_entry_update(&rec->ent, 0, 0);
-	rec->ent.type = sys_data->exit_event->id;
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
-
-end:
-	perf_swevent_put_recursion_context(rctx);
-end_recursion:
-	local_irq_restore(flags);
+	ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
 }
 
 int prof_sysexit_enable(struct ftrace_event_call *call)
-- 
cgit v1.1


From 1e12a4a7a3a78bc9c3aaf3486dde3b8ab1cdf465 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 28 Jan 2010 09:34:27 +0800
Subject: tracing/kprobe: Cleanup unused return value of tracing functions

The return values of the kprobe's tracing functions are meaningless,
lets remove these.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4B60E9A3.2040505@cn.fujitsu.com>
[fweisbec@gmail: whitespace fixes, drop useless void returns in end
of functions]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_kprobe.c | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2e28ee3..6178abf 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -942,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
 };
 
 /* Kprobe handler */
-static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 {
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct kprobe_trace_entry *entry;
@@ -962,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 	event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
 						  irq_flags, pc);
 	if (!event)
-		return 0;
+		return;
 
 	entry = ring_buffer_event_data(event);
 	entry->nargs = tp->nr_args;
@@ -972,11 +972,10 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
 		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
-	return 0;
 }
 
 /* Kretprobe handler */
-static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
 					  struct pt_regs *regs)
 {
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -995,7 +994,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
 	event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
 						  irq_flags, pc);
 	if (!event)
-		return 0;
+		return;
 
 	entry = ring_buffer_event_data(event);
 	entry->nargs = tp->nr_args;
@@ -1006,8 +1005,6 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
 		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
-
-	return 0;
 }
 
 /* Event entry printers */
@@ -1237,7 +1234,7 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call,
 #ifdef CONFIG_PERF_EVENTS
 
 /* Kprobe profile handler */
-static __kprobes int kprobe_profile_func(struct kprobe *kp,
+static __kprobes void kprobe_profile_func(struct kprobe *kp,
 					 struct pt_regs *regs)
 {
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
@@ -1252,11 +1249,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	size -= sizeof(u32);
 	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
 		     "profile buffer not large enough"))
-		return 0;
+		return;
 
 	entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
 	if (!entry)
-		return 0;
+		return;
 
 	entry->nargs = tp->nr_args;
 	entry->ip = (unsigned long)kp->addr;
@@ -1264,12 +1261,10 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 
 	ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
-
-	return 0;
 }
 
 /* Kretprobe profile handler */
-static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
 					    struct pt_regs *regs)
 {
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1284,11 +1279,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	size -= sizeof(u32);
 	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
 		     "profile buffer not large enough"))
-		return 0;
+		return;
 
 	entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
 	if (!entry)
-		return 0;
+		return;
 
 	entry->nargs = tp->nr_args;
 	entry->func = (unsigned long)tp->rp.kp.addr;
@@ -1297,8 +1292,6 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 
 	ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
-
-	return 0;
 }
 
 static int probe_profile_enable(struct ftrace_event_call *call)
-- 
cgit v1.1


From 75c9f3284a7ff957829f44baace82406a6354ceb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 29 Jan 2010 09:04:26 +0100
Subject: perf_events: Fix sample_period transfer on inherit

One problem with frequency driven counters is that we cannot
predict the rate at which they trigger, therefore we have to
start them at period=1, this causes a ramp up effect. However,
if we fail to propagate the stable state on fork each new child
will have to ramp up again. This can lead to significant
artifacts in sample data.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: eranian@google.com
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1264752266.4283.2121.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 251fb95..53dc2a3 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5002,8 +5002,15 @@ inherit_event(struct perf_event *parent_event,
 	else
 		child_event->state = PERF_EVENT_STATE_OFF;
 
-	if (parent_event->attr.freq)
-		child_event->hw.sample_period = parent_event->hw.sample_period;
+	if (parent_event->attr.freq) {
+		u64 sample_period = parent_event->hw.sample_period;
+		struct hw_perf_event *hwc = &child_event->hw;
+
+		hwc->sample_period = sample_period;
+		hwc->last_period   = sample_period;
+
+		atomic64_set(&hwc->period_left, sample_period);
+	}
 
 	child_event->overflow_handler = parent_event->overflow_handler;
 
-- 
cgit v1.1


From 615d0ebbc782b67296e3226c293f520f93f93515 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 2 Feb 2010 16:49:04 -0500
Subject: kprobes: Disable booster when CONFIG_PREEMPT=y

Disable kprobe booster when CONFIG_PREEMPT=y at this time,
because it can't ensure that all kernel threads preempted on
kprobe's boosted slot run out from the slot even using
freeze_processes().

The booster on preemptive kernel will be resumed if
synchronize_tasks() or something like that is introduced.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20100202214904.4694.24330.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kprobes.c | 29 ++---------------------------
 1 file changed, 2 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b7df302..9907a03 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -124,30 +124,6 @@ static LIST_HEAD(kprobe_insn_pages);
 static int kprobe_garbage_slots;
 static int collect_garbage_slots(void);
 
-static int __kprobes check_safety(void)
-{
-	int ret = 0;
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
-	ret = freeze_processes();
-	if (ret == 0) {
-		struct task_struct *p, *q;
-		do_each_thread(p, q) {
-			if (p != current && p->state == TASK_RUNNING &&
-			    p->pid != 0) {
-				printk("Check failed: %s is running\n",p->comm);
-				ret = -1;
-				goto loop_end;
-			}
-		} while_each_thread(p, q);
-	}
-loop_end:
-	thaw_processes();
-#else
-	synchronize_sched();
-#endif
-	return ret;
-}
-
 /**
  * __get_insn_slot() - Find a slot on an executable page for an instruction.
  * We allocate an executable page if there's no room on existing ones.
@@ -235,9 +211,8 @@ static int __kprobes collect_garbage_slots(void)
 {
 	struct kprobe_insn_page *kip, *next;
 
-	/* Ensure no-one is preepmted on the garbages */
-	if (check_safety())
-		return -EAGAIN;
+	/* Ensure no-one is interrupted on the garbages */
+	synchronize_sched();
 
 	list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
 		int i;
-- 
cgit v1.1


From 2cfa19780d61740f65790c5bae363b759d7c96fa Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 2 Feb 2010 16:49:11 -0500
Subject: ftrace/alternatives: Introducing *_text_reserved functions

Introducing *_text_reserved functions for checking the text
address range is partially reserved or not. This patch provides
checking routines for x86 smp alternatives and dynamic ftrace.
Since both functions modify fixed pieces of kernel text, they
should reserve and protect those from other dynamic text
modifier, like kprobes.

This will also be extended when introducing other subsystems
which modify fixed pieces of kernel text. Dynamic text modifiers
should avoid those.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: przemyslaw@pawelczyk.it
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org>
Cc: Jason Baron <jbaron@redhat.com>
LKML-Reference: <20100202214911.4694.16587.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1e6640f..3d90661 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1025,6 +1025,21 @@ static void ftrace_bug(int failed, unsigned long ip)
 }
 
 
+/* Return 1 if the address range is reserved for ftrace */
+int ftrace_text_reserved(void *start, void *end)
+{
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+
+	do_for_each_ftrace_rec(pg, rec) {
+		if (rec->ip <= (unsigned long)end &&
+		    rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
+			return 1;
+	} while_for_each_ftrace_rec();
+	return 0;
+}
+
+
 static int
 __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 {
-- 
cgit v1.1


From 4554dbcb85a4ed2abaa2b6fa15649b796699ec89 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 2 Feb 2010 16:49:18 -0500
Subject: kprobes: Check probe address is reserved

Check whether the address of new probe is already reserved by
ftrace or alternatives (on x86) when registering new probe.
If reserved, it returns an error and not register the probe.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: przemyslaw@pawelczyk.it
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org>
Cc: Jason Baron <jbaron@redhat.com>
LKML-Reference: <20100202214918.4694.94179.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kprobes.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9907a03..c3340e8 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -44,6 +44,7 @@
 #include <linux/debugfs.h>
 #include <linux/kdebug.h>
 #include <linux/memory.h>
+#include <linux/ftrace.h>
 
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
@@ -703,7 +704,8 @@ int __kprobes register_kprobe(struct kprobe *p)
 
 	preempt_disable();
 	if (!kernel_text_address((unsigned long) p->addr) ||
-	    in_kprobes_functions((unsigned long) p->addr)) {
+	    in_kprobes_functions((unsigned long) p->addr) ||
+	    ftrace_text_reserved(p->addr, p->addr)) {
 		preempt_enable();
 		return -EINVAL;
 	}
-- 
cgit v1.1


From f24bb999d2b9f2950e5cac5b69bffedf73c24ea4 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 2 Feb 2010 16:49:25 -0500
Subject: ftrace: Remove record freezing

Remove record freezing. Because kprobes never puts probe on
ftrace's mcount call anymore, it doesn't need ftrace to check
whether kprobes on it.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: przemyslaw@pawelczyk.it
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100202214925.4694.73469.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 39 ---------------------------------------
 1 file changed, 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3d90661..1904797 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,7 +22,6 @@
 #include <linux/hardirq.h>
 #include <linux/kthread.h>
 #include <linux/uaccess.h>
-#include <linux/kprobes.h>
 #include <linux/ftrace.h>
 #include <linux/sysctl.h>
 #include <linux/ctype.h>
@@ -898,36 +897,6 @@ static struct dyn_ftrace *ftrace_free_records;
 		}				\
 	}
 
-#ifdef CONFIG_KPROBES
-
-static int frozen_record_count;
-
-static inline void freeze_record(struct dyn_ftrace *rec)
-{
-	if (!(rec->flags & FTRACE_FL_FROZEN)) {
-		rec->flags |= FTRACE_FL_FROZEN;
-		frozen_record_count++;
-	}
-}
-
-static inline void unfreeze_record(struct dyn_ftrace *rec)
-{
-	if (rec->flags & FTRACE_FL_FROZEN) {
-		rec->flags &= ~FTRACE_FL_FROZEN;
-		frozen_record_count--;
-	}
-}
-
-static inline int record_frozen(struct dyn_ftrace *rec)
-{
-	return rec->flags & FTRACE_FL_FROZEN;
-}
-#else
-# define freeze_record(rec)			({ 0; })
-# define unfreeze_record(rec)			({ 0; })
-# define record_frozen(rec)			({ 0; })
-#endif /* CONFIG_KPROBES */
-
 static void ftrace_free_rec(struct dyn_ftrace *rec)
 {
 	rec->freelist = ftrace_free_records;
@@ -1091,14 +1060,6 @@ static void ftrace_replace_code(int enable)
 		    !(rec->flags & FTRACE_FL_CONVERTED))
 			continue;
 
-		/* ignore updates to this record's mcount site */
-		if (get_kprobe((void *)rec->ip)) {
-			freeze_record(rec);
-			continue;
-		} else {
-			unfreeze_record(rec);
-		}
-
 		failed = __ftrace_replace_code(rec, enable);
 		if (failed) {
 			rec->flags |= FTRACE_FL_FAILED;
-- 
cgit v1.1


From 9717e6cd3db22eade7dbae0fc9235c66325a7132 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 28 Jan 2010 13:57:44 +0100
Subject: perf_events: Optimize perf_event_task_tick()

Pretty much all of the calls do perf_disable/perf_enable cycles, pull
that out to cut back on hardware programming.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 40f8b07..087025f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1573,12 +1573,8 @@ static void rotate_ctx(struct perf_event_context *ctx)
 	raw_spin_lock(&ctx->lock);
 
 	/* Rotate the first entry last of non-pinned groups */
-	perf_disable();
-
 	list_rotate_left(&ctx->flexible_groups);
 
-	perf_enable();
-
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -1593,6 +1589,8 @@ void perf_event_task_tick(struct task_struct *curr)
 	cpuctx = &__get_cpu_var(perf_cpu_context);
 	ctx = curr->perf_event_ctxp;
 
+	perf_disable();
+
 	perf_ctx_adjust_freq(&cpuctx->ctx);
 	if (ctx)
 		perf_ctx_adjust_freq(ctx);
@@ -1608,6 +1606,8 @@ void perf_event_task_tick(struct task_struct *curr)
 	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
 		task_ctx_sched_in(curr, EVENT_FLEXIBLE);
+
+	perf_enable();
 }
 
 static int event_enable_on_exec(struct perf_event *event,
-- 
cgit v1.1


From 5ecaafdbf44b1ba400b746c60c401d54c7ee0863 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 5 Feb 2010 01:24:34 -0500
Subject: kprobes: Add mcount to the kprobes blacklist

Since mcount function can be called from everywhere,
it should be blacklisted. Moreover, the "mcount" symbol
is a special symbol name. So, it is better to put it in
the generic blacklist.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20100205062433.3745.36726.stgit@dhcp-100-2-132.bos.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kprobes.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c3340e8..ccec774 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -94,6 +94,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
 	{"native_get_debugreg",},
 	{"irq_entries_start",},
 	{"common_interrupt",},
+	{"mcount",},	/* mcount can be called from everywhere */
 	{NULL}    /* Terminator */
 };
 
-- 
cgit v1.1


From 3a0304e90aa5a2c0c308a05d28f7d109a48d8539 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 26 Feb 2010 10:33:41 +0100
Subject: perf_events: Report the MMAP pgoff value in bytes

DaveM reported that currently perf interprets the pgoff value reported by
the MMAP events as a byte range, but the kernel reports it as a page
offset.

Since its broken (and unusable) anyway, change the kernel behaviour (ABI)
to report bytes indeed, avoiding the need for userspace to deal with
PAGE_SIZE things.

Reported-by: David Miller <davem@davemloft.net>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 087025f..5a69abb 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3749,7 +3749,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
 			/* .tid */
 			.start  = vma->vm_start,
 			.len    = vma->vm_end - vma->vm_start,
-			.pgoff  = vma->vm_pgoff,
+			.pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
 		},
 	};
 
-- 
cgit v1.1


From d76a0812ac4139ceb54daab3cc70e1bd8bd9d43a Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 8 Feb 2010 17:06:01 +0200
Subject: perf_events: Add new start/stop PMU callbacks

In certain situations, the kernel may need to stop and start the same
event rapidly. The current PMU callbacks do not distinguish between stop
and release (i.e., stop + free the resource). Thus, a counter may be
released, then it will be immediately re-acquired. Event scheduling will
again take place with no guarantee to assign the same counter. On some
processors, this may event yield to failure to assign the event back due
to competion between cores.

This patch is adding a new pair of callback to stop and restart a counter
without actually release the underlying counter resource. On stop, the
counter is stopped, its values saved and that's it. On start, the value
is reloaded and counter is restarted (on x86, actual restart is delayed
until perf_enable()).

Signed-off-by: Stephane Eranian <eranian@google.com>
[ added fallback to ->enable/->disable for all other PMUs
  fixed x86_pmu_start() to call x86_pmu.enable()
  merged __x86_pmu_disable into x86_pmu_stop() ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4b703875.0a04d00a.7896.ffffb824@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 5a69abb..74c6002 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1493,6 +1493,22 @@ do {					\
 	return div64_u64(dividend, divisor);
 }
 
+static void perf_event_stop(struct perf_event *event)
+{
+	if (!event->pmu->stop)
+		return event->pmu->disable(event);
+
+	return event->pmu->stop(event);
+}
+
+static int perf_event_start(struct perf_event *event)
+{
+	if (!event->pmu->start)
+		return event->pmu->enable(event);
+
+	return event->pmu->start(event);
+}
+
 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 {
 	struct hw_perf_event *hwc = &event->hw;
@@ -1513,9 +1529,9 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 
 	if (atomic64_read(&hwc->period_left) > 8*sample_period) {
 		perf_disable();
-		event->pmu->disable(event);
+		perf_event_stop(event);
 		atomic64_set(&hwc->period_left, 0);
-		event->pmu->enable(event);
+		perf_event_start(event);
 		perf_enable();
 	}
 }
-- 
cgit v1.1


From 38331f62c20456454eed9ebea2525f072c6f1d2e Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 8 Feb 2010 17:17:01 +0200
Subject: perf_events, x86: AMD event scheduling

This patch adds correct AMD NorthBridge event scheduling.

NB events are events measuring L3 cache, Hypertransport traffic. They are
identified by an event code >= 0xe0. They measure events on the
Northbride which is shared by all cores on a package. NB events are
counted on a shared set of counters. When a NB event is programmed in a
counter, the data actually comes from a shared counter. Thus, access to
those counters needs to be synchronized.

We implement the synchronization such that no two cores can be measuring
NB events using the same counters. Thus, we maintain a per-NB allocation
table. The available slot is propagated using the event_constraint
structure.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4b703957.0702d00a.6bf2.7b7d@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 74c6002..fb4e56e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -98,6 +98,7 @@ void __weak hw_perf_enable(void)		{ barrier(); }
 
 void __weak hw_perf_event_setup(int cpu)	{ barrier(); }
 void __weak hw_perf_event_setup_online(int cpu)	{ barrier(); }
+void __weak hw_perf_event_setup_offline(int cpu)	{ barrier(); }
 
 int __weak
 hw_perf_group_sched_in(struct perf_event *group_leader,
@@ -5462,6 +5463,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 		perf_event_exit_cpu(cpu);
 		break;
 
+	case CPU_DEAD:
+		hw_perf_event_setup_offline(cpu);
+		break;
+
 	default:
 		break;
 	}
-- 
cgit v1.1


From 6e37738a2fac964583debe91099bc3248554f6e5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Feb 2010 13:21:58 +0100
Subject: perf_events: Simplify code by removing cpu argument to
 hw_perf_group_sched_in()

Since the cpu argument to hw_perf_group_sched_in() is always
smp_processor_id(), simplify the code a little by removing this argument
and using the current cpu where needed.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: David Miller <davem@davemloft.net>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1265890918.5396.3.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 45 ++++++++++++++++++---------------------------
 1 file changed, 18 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index fb4e56e..05b6c6b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -103,7 +103,7 @@ void __weak hw_perf_event_setup_offline(int cpu)	{ barrier(); }
 int __weak
 hw_perf_group_sched_in(struct perf_event *group_leader,
 	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx, int cpu)
+	       struct perf_event_context *ctx)
 {
 	return 0;
 }
@@ -633,14 +633,13 @@ void perf_event_disable(struct perf_event *event)
 static int
 event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
-		 struct perf_event_context *ctx,
-		 int cpu)
+		 struct perf_event_context *ctx)
 {
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
 	event->state = PERF_EVENT_STATE_ACTIVE;
-	event->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
+	event->oncpu = smp_processor_id();
 	/*
 	 * The new state must be visible before we turn it on in the hardware:
 	 */
@@ -667,8 +666,7 @@ event_sched_in(struct perf_event *event,
 static int
 group_sched_in(struct perf_event *group_event,
 	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx,
-	       int cpu)
+	       struct perf_event_context *ctx)
 {
 	struct perf_event *event, *partial_group;
 	int ret;
@@ -676,18 +674,18 @@ group_sched_in(struct perf_event *group_event,
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
 
-	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
+	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
 	if (ret)
 		return ret < 0 ? ret : 0;
 
-	if (event_sched_in(group_event, cpuctx, ctx, cpu))
+	if (event_sched_in(group_event, cpuctx, ctx))
 		return -EAGAIN;
 
 	/*
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
-		if (event_sched_in(event, cpuctx, ctx, cpu)) {
+		if (event_sched_in(event, cpuctx, ctx)) {
 			partial_group = event;
 			goto group_error;
 		}
@@ -761,7 +759,6 @@ static void __perf_install_in_context(void *info)
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *leader = event->group_leader;
-	int cpu = smp_processor_id();
 	int err;
 
 	/*
@@ -808,7 +805,7 @@ static void __perf_install_in_context(void *info)
 	if (!group_can_go_on(event, cpuctx, 1))
 		err = -EEXIST;
 	else
-		err = event_sched_in(event, cpuctx, ctx, cpu);
+		err = event_sched_in(event, cpuctx, ctx);
 
 	if (err) {
 		/*
@@ -950,11 +947,9 @@ static void __perf_event_enable(void *info)
 	} else {
 		perf_disable();
 		if (event == leader)
-			err = group_sched_in(event, cpuctx, ctx,
-					     smp_processor_id());
+			err = group_sched_in(event, cpuctx, ctx);
 		else
-			err = event_sched_in(event, cpuctx, ctx,
-					       smp_processor_id());
+			err = event_sched_in(event, cpuctx, ctx);
 		perf_enable();
 	}
 
@@ -1281,19 +1276,18 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 
 static void
 ctx_pinned_sched_in(struct perf_event_context *ctx,
-		    struct perf_cpu_context *cpuctx,
-		    int cpu)
+		    struct perf_cpu_context *cpuctx)
 {
 	struct perf_event *event;
 
 	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
-		if (event->cpu != -1 && event->cpu != cpu)
+		if (event->cpu != -1 && event->cpu != smp_processor_id())
 			continue;
 
 		if (group_can_go_on(event, cpuctx, 1))
-			group_sched_in(event, cpuctx, ctx, cpu);
+			group_sched_in(event, cpuctx, ctx);
 
 		/*
 		 * If this pinned group hasn't been scheduled,
@@ -1308,8 +1302,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 
 static void
 ctx_flexible_sched_in(struct perf_event_context *ctx,
-		      struct perf_cpu_context *cpuctx,
-		      int cpu)
+		      struct perf_cpu_context *cpuctx)
 {
 	struct perf_event *event;
 	int can_add_hw = 1;
@@ -1322,11 +1315,11 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of events:
 		 */
-		if (event->cpu != -1 && event->cpu != cpu)
+		if (event->cpu != -1 && event->cpu != smp_processor_id())
 			continue;
 
 		if (group_can_go_on(event, cpuctx, can_add_hw))
-			if (group_sched_in(event, cpuctx, ctx, cpu))
+			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
 	}
 }
@@ -1336,8 +1329,6 @@ ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
 	     enum event_type_t event_type)
 {
-	int cpu = smp_processor_id();
-
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	if (likely(!ctx->nr_events))
@@ -1352,11 +1343,11 @@ ctx_sched_in(struct perf_event_context *ctx,
 	 * in order to give them the best chance of going on.
 	 */
 	if (event_type & EVENT_PINNED)
-		ctx_pinned_sched_in(ctx, cpuctx, cpu);
+		ctx_pinned_sched_in(ctx, cpuctx);
 
 	/* Then walk through the lower prio flexible groups */
 	if (event_type & EVENT_FLEXIBLE)
-		ctx_flexible_sched_in(ctx, cpuctx, cpu);
+		ctx_flexible_sched_in(ctx, cpuctx);
 
 	perf_enable();
  out:
-- 
cgit v1.1


From 24691ea964cc0123e386b661e03a86a481c6ee79 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 26 Feb 2010 16:36:23 +0100
Subject: perf_event: Fix preempt warning in perf_clock()

A recent commit introduced a preemption warning for
perf_clock(), use raw_smp_processor_id() to avoid this, it
really doesn't matter which cpu we use here.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1267198583.22519.684.camel@laptop>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 05b6c6b..aa6155b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -249,7 +249,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 
 static inline u64 perf_clock(void)
 {
-	return cpu_clock(smp_processor_id());
+	return cpu_clock(raw_smp_processor_id());
 }
 
 /*
-- 
cgit v1.1