From 49f474331e563a6ecf3b1e87ec27ec5482b3e4f1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 27 Dec 2009 11:51:52 +0100 Subject: perf events: Remove arg from perf sched hooks Since we only ever schedule the local cpu, there is no need to pass the cpu number to the perf sched hooks. This micro-optimizes things a bit. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 03cc061..099bd66 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1170,9 +1170,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, * not restart the event. */ void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) + struct task_struct *next) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent; @@ -1252,8 +1252,9 @@ static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) static void __perf_event_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, int cpu) + struct perf_cpu_context *cpuctx) { + int cpu = smp_processor_id(); struct perf_event *event; int can_add_hw = 1; @@ -1326,24 +1327,24 @@ __perf_event_sched_in(struct perf_event_context *ctx, * accessing the event control register. If a NMI hits, then it will * keep the event running. */ -void perf_event_task_sched_in(struct task_struct *task, int cpu) +void perf_event_task_sched_in(struct task_struct *task) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; if (likely(!ctx)) return; if (cpuctx->task_ctx == ctx) return; - __perf_event_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx); cpuctx->task_ctx = ctx; } -static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = &cpuctx->ctx; - __perf_event_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx); } #define MAX_INTERRUPTS (~0ULL) @@ -1461,7 +1462,7 @@ static void rotate_ctx(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); } -void perf_event_task_tick(struct task_struct *curr, int cpu) +void perf_event_task_tick(struct task_struct *curr) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; @@ -1469,7 +1470,7 @@ void perf_event_task_tick(struct task_struct *curr, int cpu) if (!atomic_read(&nr_events)) return; - cpuctx = &per_cpu(perf_cpu_context, cpu); + cpuctx = &__get_cpu_var(perf_cpu_context); ctx = curr->perf_event_ctxp; perf_ctx_adjust_freq(&cpuctx->ctx); @@ -1484,9 +1485,9 @@ void perf_event_task_tick(struct task_struct *curr, int cpu) if (ctx) rotate_ctx(ctx); - perf_event_cpu_sched_in(cpuctx, cpu); + perf_event_cpu_sched_in(cpuctx); if (ctx) - perf_event_task_sched_in(curr, cpu); + perf_event_task_sched_in(curr); } /* @@ -1527,7 +1528,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_unlock(&ctx->lock); - perf_event_task_sched_in(task, smp_processor_id()); + perf_event_task_sched_in(task); out: local_irq_restore(flags); } -- cgit v1.1 From 07b139c8c81b97bbe55c68daf0cbeca8b1c609ca Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 21 Dec 2009 14:27:35 +0800 Subject: perf events: Remove CONFIG_EVENT_PROFILE Quoted from Ingo: | This reminds me - i think we should eliminate CONFIG_EVENT_PROFILE - | it's an unnecessary Kconfig complication. If both PERF_EVENTS and | EVENT_TRACING is enabled we should expose generic tracepoints. | | Nor is it limited to event 'profiling', so it has become a misnomer as | well. Signed-off-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B2F1557.2050705@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 099bd66..5b987b4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4177,7 +4177,7 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_EVENT_TRACING void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) @@ -4282,7 +4282,7 @@ static void perf_event_free_filter(struct perf_event *event) { } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT static void bp_perf_event_destroy(struct perf_event *event) -- cgit v1.1 From 889ff0150661512d79484219612b7e2e024b6c07 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 9 Jan 2010 20:04:47 +0100 Subject: perf/core: Split context's event group list into pinned and non-pinned lists Split-up struct perf_event_context::group_list into pinned_groups and flexible_groups (non-pinned). This first appears to be useless as it duplicates various loops around the group list handlings. But it scales better in the fast-path in perf_sched_in(). We don't anymore iterate twice through the entire list to separate pinned and non-pinned scheduling. Instead we interate through two distinct lists. The another desired effect is that it makes easier to define distinct scheduling rules on both. Changes in v2: - Respectively rename pinned_grp_list and volatile_grp_list into pinned_groups and flexible_groups as per Ingo suggestion. - Various cleanups Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 227 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 151 insertions(+), 76 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 27f69a04..c9f8a75 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -289,6 +289,15 @@ static void update_event_times(struct perf_event *event) event->total_time_running = run_end - event->tstamp_running; } +static struct list_head * +ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +{ + if (event->attr.pinned) + return &ctx->pinned_groups; + else + return &ctx->flexible_groups; +} + /* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -303,9 +312,12 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) * add it straight to the context's event list, or to the group * leader's sibling list: */ - if (group_leader == event) - list_add_tail(&event->group_entry, &ctx->group_list); - else { + if (group_leader == event) { + struct list_head *list; + + list = ctx_group_list(event, ctx); + list_add_tail(&event->group_entry, list); + } else { list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; } @@ -355,8 +367,10 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) * to the context list directly: */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + struct list_head *list; - list_move_tail(&sibling->group_entry, &ctx->group_list); + list = ctx_group_list(event, ctx); + list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; } } @@ -1056,7 +1070,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx, perf_disable(); if (ctx->nr_active) { - list_for_each_entry(event, &ctx->group_list, group_entry) + list_for_each_entry(event, &ctx->pinned_groups, group_entry) + group_sched_out(event, cpuctx, ctx); + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } perf_enable(); @@ -1271,9 +1288,8 @@ __perf_event_sched_in(struct perf_event_context *ctx, * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF || - !event->attr.pinned) + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) continue; if (event->cpu != -1 && event->cpu != cpu) continue; @@ -1291,15 +1307,10 @@ __perf_event_sched_in(struct perf_event_context *ctx, } } - list_for_each_entry(event, &ctx->group_list, group_entry) { - /* - * Ignore events in OFF or ERROR state, and - * ignore pinned events since we did them already. - */ - if (event->state <= PERF_EVENT_STATE_OFF || - event->attr.pinned) + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + /* Ignore events in OFF or ERROR state */ + if (event->state <= PERF_EVENT_STATE_OFF) continue; - /* * Listen to the 'cpu' scheduling filter constraint * of events: @@ -1453,8 +1464,13 @@ static void rotate_ctx(struct perf_event_context *ctx) * Rotate the first entry last (works just fine for group events too): */ perf_disable(); - list_for_each_entry(event, &ctx->group_list, group_entry) { - list_move_tail(&event->group_entry, &ctx->group_list); + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + list_move_tail(&event->group_entry, &ctx->pinned_groups); + break; + } + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + list_move_tail(&event->group_entry, &ctx->flexible_groups); break; } perf_enable(); @@ -1490,6 +1506,21 @@ void perf_event_task_tick(struct task_struct *curr) perf_event_task_sched_in(curr); } +static int event_enable_on_exec(struct perf_event *event, + struct perf_event_context *ctx) +{ + if (!event->attr.enable_on_exec) + return 0; + + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + return 0; + + __perf_event_mark_enabled(event, ctx); + + return 1; +} + /* * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. @@ -1500,6 +1531,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) struct perf_event *event; unsigned long flags; int enabled = 0; + int ret; local_irq_save(flags); ctx = task->perf_event_ctxp; @@ -1510,14 +1542,16 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_lock(&ctx->lock); - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (!event->attr.enable_on_exec) - continue; - event->attr.enable_on_exec = 0; - if (event->state >= PERF_EVENT_STATE_INACTIVE) - continue; - __perf_event_mark_enabled(event, ctx); - enabled = 1; + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; + } + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; } /* @@ -1591,7 +1625,8 @@ __perf_event_init_context(struct perf_event_context *ctx, { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->group_list); + INIT_LIST_HEAD(&ctx->pinned_groups); + INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); ctx->task = task; @@ -5032,7 +5067,11 @@ void perf_event_exit_task(struct task_struct *child) mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); again: - list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, + list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, group_entry) __perf_event_exit_task(child_event, child_ctx, child); @@ -5041,7 +5080,8 @@ again: * its siblings to the list, but we obtained 'tmp' before that which * will still point to the list head terminating the iteration. */ - if (!list_empty(&child_ctx->group_list)) + if (!list_empty(&child_ctx->pinned_groups) || + !list_empty(&child_ctx->flexible_groups)) goto again; mutex_unlock(&child_ctx->mutex); @@ -5049,6 +5089,24 @@ again: put_ctx(child_ctx); } +static void perf_free_event(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + return; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_event(event, ctx); + free_event(event); +} + /* * free an unexposed, unused context as created by inheritance by * init_task below, used by fork() in case of fail. @@ -5063,36 +5121,70 @@ void perf_event_free_task(struct task_struct *task) mutex_lock(&ctx->mutex); again: - list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { - struct perf_event *parent = event->parent; + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) + perf_free_event(event, ctx); - if (WARN_ON_ONCE(!parent)) - continue; + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, + group_entry) + perf_free_event(event, ctx); - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); + if (!list_empty(&ctx->pinned_groups) || + !list_empty(&ctx->flexible_groups)) + goto again; - fput(parent->filp); + mutex_unlock(&ctx->mutex); - list_del_event(event, ctx); - free_event(event); + put_ctx(ctx); +} + +static int +inherit_task_group(struct perf_event *event, struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + int *inherited_all) +{ + int ret; + struct perf_event_context *child_ctx = child->perf_event_ctxp; + + if (!event->attr.inherit) { + *inherited_all = 0; + return 0; } - if (!list_empty(&ctx->group_list)) - goto again; + if (!child_ctx) { + /* + * This is executed from the parent task context, so + * inherit events that have been marked for cloning. + * First allocate and initialize a context for the + * child. + */ - mutex_unlock(&ctx->mutex); + child_ctx = kzalloc(sizeof(struct perf_event_context), + GFP_KERNEL); + if (!child_ctx) + return -ENOMEM; - put_ctx(ctx); + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + + if (ret) + *inherited_all = 0; + + return ret; } + /* * Initialize the perf_event context in task_struct */ int perf_event_init_task(struct task_struct *child) { - struct perf_event_context *child_ctx = NULL, *parent_ctx; + struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; @@ -5130,41 +5222,22 @@ int perf_event_init_task(struct task_struct *child) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(event, &parent_ctx->group_list, group_entry) { - - if (!event->attr.inherit) { - inherited_all = 0; - continue; - } - - if (!child->perf_event_ctxp) { - /* - * This is executed from the parent task context, so - * inherit events that have been marked for cloning. - * First allocate and initialize a context for the - * child. - */ - - child_ctx = kzalloc(sizeof(struct perf_event_context), - GFP_KERNEL); - if (!child_ctx) { - ret = -ENOMEM; - break; - } - - __perf_event_init_context(child_ctx, child); - child->perf_event_ctxp = child_ctx; - get_task_struct(child); - } + list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, child, + &inherited_all); + if (ret) + break; + } - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); - if (ret) { - inherited_all = 0; + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, child, + &inherited_all); + if (ret) break; - } } + child_ctx = child->perf_event_ctxp; + if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent @@ -5213,7 +5286,9 @@ static void __perf_event_exit_cpu(void *info) struct perf_event_context *ctx = &cpuctx->ctx; struct perf_event *event, *tmp; - list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) + __perf_event_remove_from_context(event); + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) __perf_event_remove_from_context(event); } static void perf_event_exit_cpu(int cpu) -- cgit v1.1 From e286417378b4f9ce6e473b556193465ab22e12ab Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 9 Jan 2010 21:05:28 +0100 Subject: perf: Round robin flexible groups of events using list_rotate_left() This is more proper that doing it through a list_for_each_entry() that breaks after the first entry. v2: Don't rotate pinned groups as its not needed to time share them. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c9f8a75..bbebe28 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1454,25 +1454,16 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ static void rotate_ctx(struct perf_event_context *ctx) { - struct perf_event *event; - if (!ctx->nr_events) return; raw_spin_lock(&ctx->lock); - /* - * Rotate the first entry last (works just fine for group events too): - */ + + /* Rotate the first entry last of non-pinned groups */ perf_disable(); - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - list_move_tail(&event->group_entry, &ctx->pinned_groups); - break; - } - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - list_move_tail(&event->group_entry, &ctx->flexible_groups); - break; - } + list_rotate_left(&ctx->flexible_groups); + perf_enable(); raw_spin_unlock(&ctx->lock); -- cgit v1.1 From d6f962b57bfaab62891c7abbf1469212a56d6103 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 10 Jan 2010 01:25:51 +0100 Subject: perf: Export software-only event group characteristic as a flag Before scheduling an event group, we first check if a group can go on. We first check if the group is made of software only events first, in which case it is enough to know if the group can be scheduled in. For that purpose, we iterate through the whole group, which is wasteful as we could do this check when we add/delete an event to a group. So we create a group_flags field in perf event that can host characteristics from a group of events, starting with a first PERF_GROUP_SOFTWARE flag that reduces the check on the fast path. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index bbebe28..eae6ff6 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -315,9 +315,16 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (group_leader == event) { struct list_head *list; + if (is_software_event(event)) + event->group_flags |= PERF_GROUP_SOFTWARE; + list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); } else { + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; } @@ -372,6 +379,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) list = ctx_group_list(event, ctx); list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; + + /* Inherit group flags from the previous leader */ + sibling->group_flags = event->group_flags; } } @@ -700,24 +710,6 @@ group_error: } /* - * Return 1 for a group consisting entirely of software events, - * 0 if the group contains any hardware events. - */ -static int is_software_only_group(struct perf_event *leader) -{ - struct perf_event *event; - - if (!is_software_event(leader)) - return 0; - - list_for_each_entry(event, &leader->sibling_list, group_entry) - if (!is_software_event(event)) - return 0; - - return 1; -} - -/* * Work out whether we can put this event group on the CPU now. */ static int group_can_go_on(struct perf_event *event, @@ -727,7 +719,7 @@ static int group_can_go_on(struct perf_event *event, /* * Groups consisting entirely of software events can always go on. */ - if (is_software_only_group(event)) + if (event->group_flags & PERF_GROUP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware -- cgit v1.1 From 42cce92f4ddfa41e2dfe26fdcad4887943c032f2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 17 Jan 2010 10:36:08 +0100 Subject: perf: Make __perf_event_sched_out static __perf_event_sched_out doesn't need to be globally available, make it static. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index eae6ff6..c4e90b8 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1049,8 +1049,8 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return 0; } -void __perf_event_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static void __perf_event_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) { struct perf_event *event; -- cgit v1.1 From 5b0311e1f2464547fc6f17a82d7ea2538c8c7a70 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 17 Jan 2010 11:59:13 +0100 Subject: perf: Allow pinned and flexible groups to be scheduled separately Tune the scheduling helpers so that we can choose to schedule either pinned and/or flexible groups from a context. And while at it, refactor a bit the naming of these helpers to make these more consistent and flexible. There is no (intended) change in scheduling behaviour in this patch. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 137 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 93 insertions(+), 44 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c4e90b8..bfc4ee0 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1049,8 +1049,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return 0; } -static void __perf_event_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +enum event_type_t { + EVENT_FLEXIBLE = 0x1, + EVENT_PINNED = 0x2, + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, +}; + +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type) { struct perf_event *event; @@ -1061,13 +1068,18 @@ static void __perf_event_sched_out(struct perf_event_context *ctx, update_context_time(ctx); perf_disable(); - if (ctx->nr_active) { + if (!ctx->nr_active) + goto out_enable; + + if (event_type & EVENT_PINNED) list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); + if (event_type & EVENT_FLEXIBLE) list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); - } + + out_enable: perf_enable(); out: raw_spin_unlock(&ctx->lock); @@ -1229,15 +1241,13 @@ void perf_event_task_sched_out(struct task_struct *task, rcu_read_unlock(); if (do_switch) { - __perf_event_sched_out(ctx, cpuctx); + ctx_sched_out(ctx, cpuctx, EVENT_ALL); cpuctx->task_ctx = NULL; } } -/* - * Called with IRQs disabled - */ -static void __perf_event_task_sched_out(struct perf_event_context *ctx) +static void task_ctx_sched_out(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); @@ -1247,39 +1257,34 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx) if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - __perf_event_sched_out(ctx, cpuctx); + ctx_sched_out(ctx, cpuctx, event_type); cpuctx->task_ctx = NULL; } /* * Called with IRQs disabled */ -static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) +static void __perf_event_task_sched_out(struct perf_event_context *ctx) { - __perf_event_sched_out(&cpuctx->ctx, cpuctx); + task_ctx_sched_out(ctx, EVENT_ALL); +} + +/* + * Called with IRQs disabled + */ +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); } static void -__perf_event_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +ctx_pinned_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + int cpu) { - int cpu = smp_processor_id(); struct perf_event *event; - int can_add_hw = 1; - - raw_spin_lock(&ctx->lock); - ctx->is_active = 1; - if (likely(!ctx->nr_events)) - goto out; - - ctx->timestamp = perf_clock(); - - perf_disable(); - /* - * First go through the list and put on any pinned groups - * in order to give them the best chance of going on. - */ list_for_each_entry(event, &ctx->pinned_groups, group_entry) { if (event->state <= PERF_EVENT_STATE_OFF) continue; @@ -1298,6 +1303,15 @@ __perf_event_sched_in(struct perf_event_context *ctx, event->state = PERF_EVENT_STATE_ERROR; } } +} + +static void +ctx_flexible_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + int cpu) +{ + struct perf_event *event; + int can_add_hw = 1; list_for_each_entry(event, &ctx->flexible_groups, group_entry) { /* Ignore events in OFF or ERROR state */ @@ -1314,11 +1328,53 @@ __perf_event_sched_in(struct perf_event_context *ctx, if (group_sched_in(event, cpuctx, ctx, cpu)) can_add_hw = 0; } +} + +static void +ctx_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + int cpu = smp_processor_id(); + + raw_spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_events)) + goto out; + + ctx->timestamp = perf_clock(); + + perf_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + if (event_type & EVENT_PINNED) + ctx_pinned_sched_in(ctx, cpuctx, cpu); + + /* Then walk through the lower prio flexible groups */ + if (event_type & EVENT_FLEXIBLE) + ctx_flexible_sched_in(ctx, cpuctx, cpu); + perf_enable(); out: raw_spin_unlock(&ctx->lock); } +static void task_ctx_sched_in(struct task_struct *task, + enum event_type_t event_type) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = task->perf_event_ctxp; + + if (likely(!ctx)) + return; + if (cpuctx->task_ctx == ctx) + return; + ctx_sched_in(ctx, cpuctx, event_type); + cpuctx->task_ctx = ctx; +} /* * Called from scheduler to add the events of the current task * with interrupts disabled. @@ -1332,22 +1388,15 @@ __perf_event_sched_in(struct perf_event_context *ctx, */ void perf_event_task_sched_in(struct task_struct *task) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = task->perf_event_ctxp; - - if (likely(!ctx)) - return; - if (cpuctx->task_ctx == ctx) - return; - __perf_event_sched_in(ctx, cpuctx); - cpuctx->task_ctx = ctx; + task_ctx_sched_in(task, EVENT_ALL); } -static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx) +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type) { struct perf_event_context *ctx = &cpuctx->ctx; - __perf_event_sched_in(ctx, cpuctx); + ctx_sched_in(ctx, cpuctx, event_type); } #define MAX_INTERRUPTS (~0ULL) @@ -1476,17 +1525,17 @@ void perf_event_task_tick(struct task_struct *curr) if (ctx) perf_ctx_adjust_freq(ctx); - perf_event_cpu_sched_out(cpuctx); + cpu_ctx_sched_out(cpuctx, EVENT_ALL); if (ctx) - __perf_event_task_sched_out(ctx); + task_ctx_sched_out(ctx, EVENT_ALL); rotate_ctx(&cpuctx->ctx); if (ctx) rotate_ctx(ctx); - perf_event_cpu_sched_in(cpuctx); + cpu_ctx_sched_in(cpuctx, EVENT_ALL); if (ctx) - perf_event_task_sched_in(curr); + task_ctx_sched_in(curr, EVENT_ALL); } static int event_enable_on_exec(struct perf_event *event, -- cgit v1.1 From 7defb0f879bbcfe29e3c6f29d685d4f29b7a0700 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 17 Jan 2010 12:15:31 +0100 Subject: perf: Don't schedule out/in pinned events on task tick We don't need to schedule in/out pinned events on task tick, now that pinned and flexible groups can be scheduled separately. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index bfc4ee0..a90ae69 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1525,17 +1525,17 @@ void perf_event_task_tick(struct task_struct *curr) if (ctx) perf_ctx_adjust_freq(ctx); - cpu_ctx_sched_out(cpuctx, EVENT_ALL); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) - task_ctx_sched_out(ctx, EVENT_ALL); + task_ctx_sched_out(ctx, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx); if (ctx) rotate_ctx(ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) - task_ctx_sched_in(curr, EVENT_ALL); + task_ctx_sched_in(curr, EVENT_FLEXIBLE); } static int event_enable_on_exec(struct perf_event *event, -- cgit v1.1 From 329c0e012b99fa2325a0be205c052e4aba690f16 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 17 Jan 2010 12:56:05 +0100 Subject: perf: Better order flexible and pinned scheduling When a task gets scheduled in. We don't touch the cpu bound events so the priority order becomes: cpu pinned, cpu flexible, task pinned, task flexible. So schedule out cpu flexibles when a new task context gets in and correctly order the groups to schedule in: task pinned, cpu flexible, task flexible. Cpu pinned groups don't need to be touched at this time. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a90ae69..edc46b9 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1362,6 +1362,14 @@ ctx_sched_in(struct perf_event_context *ctx, raw_spin_unlock(&ctx->lock); } +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + struct perf_event_context *ctx = &cpuctx->ctx; + + ctx_sched_in(ctx, cpuctx, event_type); +} + static void task_ctx_sched_in(struct task_struct *task, enum event_type_t event_type) { @@ -1388,15 +1396,27 @@ static void task_ctx_sched_in(struct task_struct *task, */ void perf_event_task_sched_in(struct task_struct *task) { - task_ctx_sched_in(task, EVENT_ALL); -} + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = task->perf_event_ctxp; -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) -{ - struct perf_event_context *ctx = &cpuctx->ctx; + if (likely(!ctx)) + return; - ctx_sched_in(ctx, cpuctx, event_type); + if (cpuctx->task_ctx == ctx) + return; + + /* + * We want to keep the following priority order: + * cpu pinned (that don't need to move), task pinned, + * cpu flexible, task flexible. + */ + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + + ctx_sched_in(ctx, cpuctx, EVENT_PINNED); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + + cpuctx->task_ctx = ctx; } #define MAX_INTERRUPTS (~0ULL) -- cgit v1.1 From abd50713944c8ea9e0af5b7bffa0aacae21cc91a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2010 18:50:16 +0100 Subject: perf: Reimplement frequency driven sampling There was a bug in the old period code that caused intel_pmu_enable_all() or native_write_msr_safe() to show up quite high in the profiles. In staring at that code it made my head hurt, so I rewrote it in a hopefully simpler fashion. Its now fully symetric between tick and overflow driven adjustments and uses less data to boot. The only complication is that it basically wants to do a u128 division. The code approximates that in a rather simple truncate until it fits fashion, taking care to balance the terms while truncating. This version does not generate that sampling artefact. Signed-off-by: Peter Zijlstra LKML-Reference: Cc: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 132 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 92 insertions(+), 40 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index edc46b9..251fb95 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1423,14 +1423,83 @@ void perf_event_task_sched_in(struct task_struct *task) static void perf_log_throttle(struct perf_event *event, int enable); -static void perf_adjust_period(struct perf_event *event, u64 events) +static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) +{ + u64 frequency = event->attr.sample_freq; + u64 sec = NSEC_PER_SEC; + u64 divisor, dividend; + + int count_fls, nsec_fls, frequency_fls, sec_fls; + + count_fls = fls64(count); + nsec_fls = fls64(nsec); + frequency_fls = fls64(frequency); + sec_fls = 30; + + /* + * We got @count in @nsec, with a target of sample_freq HZ + * the target period becomes: + * + * @count * 10^9 + * period = ------------------- + * @nsec * sample_freq + * + */ + + /* + * Reduce accuracy by one bit such that @a and @b converge + * to a similar magnitude. + */ +#define REDUCE_FLS(a, b) \ +do { \ + if (a##_fls > b##_fls) { \ + a >>= 1; \ + a##_fls--; \ + } else { \ + b >>= 1; \ + b##_fls--; \ + } \ +} while (0) + + /* + * Reduce accuracy until either term fits in a u64, then proceed with + * the other, so that finally we can do a u64/u64 division. + */ + while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + REDUCE_FLS(sec, count); + } + + if (count_fls + sec_fls > 64) { + divisor = nsec * frequency; + + while (count_fls + sec_fls > 64) { + REDUCE_FLS(count, sec); + divisor >>= 1; + } + + dividend = count * sec; + } else { + dividend = count * sec; + + while (nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + dividend >>= 1; + } + + divisor = nsec * frequency; + } + + return div64_u64(dividend, divisor); +} + +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; u64 period, sample_period; s64 delta; - events *= hwc->sample_period; - period = div64_u64(events, event->attr.sample_freq); + period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); delta = (delta + 7) / 8; /* low pass filter */ @@ -1441,13 +1510,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events) sample_period = 1; hwc->sample_period = sample_period; + + if (atomic64_read(&hwc->period_left) > 8*sample_period) { + perf_disable(); + event->pmu->disable(event); + atomic64_set(&hwc->period_left, 0); + event->pmu->enable(event); + perf_enable(); + } } static void perf_ctx_adjust_freq(struct perf_event_context *ctx) { struct perf_event *event; struct hw_perf_event *hwc; - u64 interrupts, freq; + u64 interrupts, now; + s64 delta; raw_spin_lock(&ctx->lock); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { @@ -1468,44 +1546,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); event->pmu->unthrottle(event); - interrupts = 2*sysctl_perf_event_sample_rate/HZ; } if (!event->attr.freq || !event->attr.sample_freq) continue; - /* - * if the specified freq < HZ then we need to skip ticks - */ - if (event->attr.sample_freq < HZ) { - freq = event->attr.sample_freq; - - hwc->freq_count += freq; - hwc->freq_interrupts += interrupts; - - if (hwc->freq_count < HZ) - continue; - - interrupts = hwc->freq_interrupts; - hwc->freq_interrupts = 0; - hwc->freq_count -= HZ; - } else - freq = HZ; - - perf_adjust_period(event, freq * interrupts); + event->pmu->read(event); + now = atomic64_read(&event->count); + delta = now - hwc->freq_count_stamp; + hwc->freq_count_stamp = now; - /* - * In order to avoid being stalled by an (accidental) huge - * sample period, force reset the sample period if we didn't - * get any events in this freq period. - */ - if (!interrupts) { - perf_disable(); - event->pmu->disable(event); - atomic64_set(&hwc->period_left, 0); - event->pmu->enable(event); - perf_enable(); - } + if (delta > 0) + perf_adjust_period(event, TICK_NSEC, delta); } raw_spin_unlock(&ctx->lock); } @@ -3768,12 +3820,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (event->attr.freq) { u64 now = perf_clock(); - s64 delta = now - hwc->freq_stamp; + s64 delta = now - hwc->freq_time_stamp; - hwc->freq_stamp = now; + hwc->freq_time_stamp = now; - if (delta > 0 && delta < TICK_NSEC) - perf_adjust_period(event, NSEC_PER_SEC / (int)delta); + if (delta > 0 && delta < 2*TICK_NSEC) + perf_adjust_period(event, delta, hwc->last_period); } /* -- cgit v1.1 From 75c9f3284a7ff957829f44baace82406a6354ceb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Jan 2010 09:04:26 +0100 Subject: perf_events: Fix sample_period transfer on inherit One problem with frequency driven counters is that we cannot predict the rate at which they trigger, therefore we have to start them at period=1, this causes a ramp up effect. However, if we fail to propagate the stable state on fork each new child will have to ramp up again. This can lead to significant artifacts in sample data. Signed-off-by: Peter Zijlstra Cc: eranian@google.com Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <1264752266.4283.2121.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 251fb95..53dc2a3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5002,8 +5002,15 @@ inherit_event(struct perf_event *parent_event, else child_event->state = PERF_EVENT_STATE_OFF; - if (parent_event->attr.freq) - child_event->hw.sample_period = parent_event->hw.sample_period; + if (parent_event->attr.freq) { + u64 sample_period = parent_event->hw.sample_period; + struct hw_perf_event *hwc = &child_event->hw; + + hwc->sample_period = sample_period; + hwc->last_period = sample_period; + + atomic64_set(&hwc->period_left, sample_period); + } child_event->overflow_handler = parent_event->overflow_handler; -- cgit v1.1 From 9717e6cd3db22eade7dbae0fc9235c66325a7132 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Jan 2010 13:57:44 +0100 Subject: perf_events: Optimize perf_event_task_tick() Pretty much all of the calls do perf_disable/perf_enable cycles, pull that out to cut back on hardware programming. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 40f8b07..087025f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1573,12 +1573,8 @@ static void rotate_ctx(struct perf_event_context *ctx) raw_spin_lock(&ctx->lock); /* Rotate the first entry last of non-pinned groups */ - perf_disable(); - list_rotate_left(&ctx->flexible_groups); - perf_enable(); - raw_spin_unlock(&ctx->lock); } @@ -1593,6 +1589,8 @@ void perf_event_task_tick(struct task_struct *curr) cpuctx = &__get_cpu_var(perf_cpu_context); ctx = curr->perf_event_ctxp; + perf_disable(); + perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) perf_ctx_adjust_freq(ctx); @@ -1608,6 +1606,8 @@ void perf_event_task_tick(struct task_struct *curr) cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) task_ctx_sched_in(curr, EVENT_FLEXIBLE); + + perf_enable(); } static int event_enable_on_exec(struct perf_event *event, -- cgit v1.1 From 3a0304e90aa5a2c0c308a05d28f7d109a48d8539 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Feb 2010 10:33:41 +0100 Subject: perf_events: Report the MMAP pgoff value in bytes DaveM reported that currently perf interprets the pgoff value reported by the MMAP events as a byte range, but the kernel reports it as a page offset. Since its broken (and unusable) anyway, change the kernel behaviour (ABI) to report bytes indeed, avoiding the need for userspace to deal with PAGE_SIZE things. Reported-by: David Miller Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 087025f..5a69abb 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3749,7 +3749,7 @@ void __perf_event_mmap(struct vm_area_struct *vma) /* .tid */ .start = vma->vm_start, .len = vma->vm_end - vma->vm_start, - .pgoff = vma->vm_pgoff, + .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, }, }; -- cgit v1.1 From d76a0812ac4139ceb54daab3cc70e1bd8bd9d43a Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 8 Feb 2010 17:06:01 +0200 Subject: perf_events: Add new start/stop PMU callbacks In certain situations, the kernel may need to stop and start the same event rapidly. The current PMU callbacks do not distinguish between stop and release (i.e., stop + free the resource). Thus, a counter may be released, then it will be immediately re-acquired. Event scheduling will again take place with no guarantee to assign the same counter. On some processors, this may event yield to failure to assign the event back due to competion between cores. This patch is adding a new pair of callback to stop and restart a counter without actually release the underlying counter resource. On stop, the counter is stopped, its values saved and that's it. On start, the value is reloaded and counter is restarted (on x86, actual restart is delayed until perf_enable()). Signed-off-by: Stephane Eranian [ added fallback to ->enable/->disable for all other PMUs fixed x86_pmu_start() to call x86_pmu.enable() merged __x86_pmu_disable into x86_pmu_stop() ] Signed-off-by: Peter Zijlstra LKML-Reference: <4b703875.0a04d00a.7896.ffffb824@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 5a69abb..74c6002 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1493,6 +1493,22 @@ do { \ return div64_u64(dividend, divisor); } +static void perf_event_stop(struct perf_event *event) +{ + if (!event->pmu->stop) + return event->pmu->disable(event); + + return event->pmu->stop(event); +} + +static int perf_event_start(struct perf_event *event) +{ + if (!event->pmu->start) + return event->pmu->enable(event); + + return event->pmu->start(event); +} + static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; @@ -1513,9 +1529,9 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) if (atomic64_read(&hwc->period_left) > 8*sample_period) { perf_disable(); - event->pmu->disable(event); + perf_event_stop(event); atomic64_set(&hwc->period_left, 0); - event->pmu->enable(event); + perf_event_start(event); perf_enable(); } } -- cgit v1.1 From 38331f62c20456454eed9ebea2525f072c6f1d2e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 8 Feb 2010 17:17:01 +0200 Subject: perf_events, x86: AMD event scheduling This patch adds correct AMD NorthBridge event scheduling. NB events are events measuring L3 cache, Hypertransport traffic. They are identified by an event code >= 0xe0. They measure events on the Northbride which is shared by all cores on a package. NB events are counted on a shared set of counters. When a NB event is programmed in a counter, the data actually comes from a shared counter. Thus, access to those counters needs to be synchronized. We implement the synchronization such that no two cores can be measuring NB events using the same counters. Thus, we maintain a per-NB allocation table. The available slot is propagated using the event_constraint structure. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4b703957.0702d00a.6bf2.7b7d@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 74c6002..fb4e56e 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -98,6 +98,7 @@ void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_event_setup(int cpu) { barrier(); } void __weak hw_perf_event_setup_online(int cpu) { barrier(); } +void __weak hw_perf_event_setup_offline(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_event *group_leader, @@ -5462,6 +5463,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_event_exit_cpu(cpu); break; + case CPU_DEAD: + hw_perf_event_setup_offline(cpu); + break; + default: break; } -- cgit v1.1 From 6e37738a2fac964583debe91099bc3248554f6e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Feb 2010 13:21:58 +0100 Subject: perf_events: Simplify code by removing cpu argument to hw_perf_group_sched_in() Since the cpu argument to hw_perf_group_sched_in() is always smp_processor_id(), simplify the code a little by removing this argument and using the current cpu where needed. Signed-off-by: Peter Zijlstra Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker LKML-Reference: <1265890918.5396.3.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index fb4e56e..05b6c6b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -103,7 +103,7 @@ void __weak hw_perf_event_setup_offline(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_event *group_leader, struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, int cpu) + struct perf_event_context *ctx) { return 0; } @@ -633,14 +633,13 @@ void perf_event_disable(struct perf_event *event) static int event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - int cpu) + struct perf_event_context *ctx) { if (event->state <= PERF_EVENT_STATE_OFF) return 0; event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + event->oncpu = smp_processor_id(); /* * The new state must be visible before we turn it on in the hardware: */ @@ -667,8 +666,7 @@ event_sched_in(struct perf_event *event, static int group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - int cpu) + struct perf_event_context *ctx) { struct perf_event *event, *partial_group; int ret; @@ -676,18 +674,18 @@ group_sched_in(struct perf_event *group_event, if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); + ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); if (ret) return ret < 0 ? ret : 0; - if (event_sched_in(group_event, cpuctx, ctx, cpu)) + if (event_sched_in(group_event, cpuctx, ctx)) return -EAGAIN; /* * Schedule in siblings as one group (if any): */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event_sched_in(event, cpuctx, ctx, cpu)) { + if (event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; } @@ -761,7 +759,6 @@ static void __perf_install_in_context(void *info) struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; - int cpu = smp_processor_id(); int err; /* @@ -808,7 +805,7 @@ static void __perf_install_in_context(void *info) if (!group_can_go_on(event, cpuctx, 1)) err = -EEXIST; else - err = event_sched_in(event, cpuctx, ctx, cpu); + err = event_sched_in(event, cpuctx, ctx); if (err) { /* @@ -950,11 +947,9 @@ static void __perf_event_enable(void *info) } else { perf_disable(); if (event == leader) - err = group_sched_in(event, cpuctx, ctx, - smp_processor_id()); + err = group_sched_in(event, cpuctx, ctx); else - err = event_sched_in(event, cpuctx, ctx, - smp_processor_id()); + err = event_sched_in(event, cpuctx, ctx); perf_enable(); } @@ -1281,19 +1276,18 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, static void ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - int cpu) + struct perf_cpu_context *cpuctx) { struct perf_event *event; list_for_each_entry(event, &ctx->pinned_groups, group_entry) { if (event->state <= PERF_EVENT_STATE_OFF) continue; - if (event->cpu != -1 && event->cpu != cpu) + if (event->cpu != -1 && event->cpu != smp_processor_id()) continue; if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx, cpu); + group_sched_in(event, cpuctx, ctx); /* * If this pinned group hasn't been scheduled, @@ -1308,8 +1302,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, static void ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - int cpu) + struct perf_cpu_context *cpuctx) { struct perf_event *event; int can_add_hw = 1; @@ -1322,11 +1315,11 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, * Listen to the 'cpu' scheduling filter constraint * of events: */ - if (event->cpu != -1 && event->cpu != cpu) + if (event->cpu != -1 && event->cpu != smp_processor_id()) continue; if (group_can_go_on(event, cpuctx, can_add_hw)) - if (group_sched_in(event, cpuctx, ctx, cpu)) + if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; } } @@ -1336,8 +1329,6 @@ ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { - int cpu = smp_processor_id(); - raw_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_events)) @@ -1352,11 +1343,11 @@ ctx_sched_in(struct perf_event_context *ctx, * in order to give them the best chance of going on. */ if (event_type & EVENT_PINNED) - ctx_pinned_sched_in(ctx, cpuctx, cpu); + ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ if (event_type & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, cpuctx, cpu); + ctx_flexible_sched_in(ctx, cpuctx); perf_enable(); out: -- cgit v1.1 From 24691ea964cc0123e386b661e03a86a481c6ee79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Feb 2010 16:36:23 +0100 Subject: perf_event: Fix preempt warning in perf_clock() A recent commit introduced a preemption warning for perf_clock(), use raw_smp_processor_id() to avoid this, it really doesn't matter which cpu we use here. Signed-off-by: Peter Zijlstra LKML-Reference: <1267198583.22519.684.camel@laptop> Cc: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 05b6c6b..aa6155b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -249,7 +249,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) static inline u64 perf_clock(void) { - return cpu_clock(smp_processor_id()); + return cpu_clock(raw_smp_processor_id()); } /* -- cgit v1.1