Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/events/core.c | 246
-rw-r--r-- | kernel/events/hw_breakpoint.c | 13
-rw-r--r-- | kernel/irq/chip.c | 2
-rw-r--r-- | kernel/jump_label.c | 135
-rw-r--r-- | kernel/printk.c | 5
-rw-r--r-- | kernel/sched/core.c | 18
-rw-r--r-- | kernel/sched/fair.c | 8
-rw-r--r-- | kernel/sched/sched.h | 14
-rw-r--r-- | kernel/signal.c | 28
-rw-r--r-- | kernel/softirq.c | 6
-rw-r--r-- | kernel/trace/ftrace.c | 134
-rw-r--r-- | kernel/trace/trace.c | 6
-rw-r--r-- | kernel/trace/trace.h | 38
-rw-r--r-- | kernel/trace/trace_entries.h | 54
-rw-r--r-- | kernel/trace/trace_event_perf.c | 208
-rw-r--r-- | kernel/trace/trace_events.c | 12
-rw-r--r-- | kernel/trace/trace_events_filter.c | 175
-rw-r--r-- | kernel/trace/trace_export.c | 64
-rw-r--r-- | kernel/trace/trace_kprobe.c | 8
-rw-r--r-- | kernel/trace/trace_output.c | 12
-rw-r--r-- | kernel/trace/trace_syscalls.c | 22
-rw-r--r-- | kernel/tracepoint.c | 20
-rw-r--r-- | kernel/watchdog.c | 24
23 files changed, 993 insertions, 259 deletions
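Most of the churn below is in kernel/events/core.c, which wires up taken-branch sampling (PERF_SAMPLE_BRANCH_STACK): perf_copy_attr() validates and propagates attr->branch_sample_type, system-wide branch-stack events have their stacks flushed on context switch via perf_branch_stack_sched_in(), and perf_prepare_sample()/perf_output_sample() learn to emit the branch entries. As a reading aid, here is a minimal userspace sketch (not part of the patch) of opening such an event; it assumes the matching perf_event.h additions from the same series (this diffstat is limited to kernel/) and a PMU that actually implements branch-stack sampling, and sys_perf_event_open() is a local wrapper, since glibc provides none for the syscall.

/* Sketch only: request taken-branch sampling with the new attribute bits. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_CPU_CYCLES;
        attr.sample_period = 100000;
        attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
        /*
         * At least one non-priv-level branch bit must be set, exactly as
         * perf_copy_attr() checks below; kernel/hv capture would additionally
         * require CAP_SYS_ADMIN under perf_paranoid_kernel().
         */
        attr.branch_sample_type = PERF_SAMPLE_BRANCH_USER |
                                  PERF_SAMPLE_BRANCH_ANY;
        attr.exclude_kernel = 1;

        fd = sys_perf_event_open(&attr, 0 /* self */, -1 /* any cpu */, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }
        printf("branch-stack sampling event opened, fd=%d\n", fd);
        close(fd);
        return 0;
}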
diff --git a/kernel/events/core.c b/kernel/events/core.c index 1b5c081..c61234b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP) +/* + * branch priv levels that need permission checks + */ +#define PERF_SAMPLE_BRANCH_PERM_PLM \ + (PERF_SAMPLE_BRANCH_KERNEL |\ + PERF_SAMPLE_BRANCH_HV) + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, @@ -128,8 +135,9 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct jump_label_key_deferred perf_sched_events __read_mostly; +struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -881,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) ctx->nr_cgroups++; + if (has_branch_stack(event)) + ctx->nr_branch_stack++; + list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) perf_pmu_rotate_start(ctx->pmu); @@ -1020,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) cpuctx->cgrp = NULL; } + if (has_branch_stack(event)) + ctx->nr_branch_stack--; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -2195,6 +2209,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, } /* + * When sampling the branck stack in system-wide, it may be necessary + * to flush the stack on context switch. This happens when the branch + * stack does not tag its entries with the pid of the current task. + * Otherwise it becomes impossible to associate a branch entry with a + * task. This ambiguity is more likely to appear when the branch stack + * supports priv level filtering and the user sets it to monitor only + * at the user level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch stack with + * branch from multiple tasks. Flushing may mean dropping the existing + * entries or stashing them somewhere in the PMU specific code layer. + * + * This function provides the context switch callback to the lower code + * layer. It is invoked ONLY when there is at least one system-wide context + * with at least one active event using taken branch sampling. + */ +static void perf_branch_stack_sched_in(struct task_struct *prev, + struct task_struct *task) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* no need to flush branch stack if not changing task */ + if (prev == task) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + /* + * check if the context has at least one + * event using PERF_SAMPLE_BRANCH_STACK + */ + if (cpuctx->ctx.nr_branch_stack > 0 + && pmu->flush_branch_stack) { + + pmu = cpuctx->ctx.pmu; + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + + perf_pmu_disable(pmu); + + pmu->flush_branch_stack(); + + perf_pmu_enable(pmu); + + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + } + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +/* * Called from scheduler to add the events of the current task * with interrupts disabled. 
* @@ -2225,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev, */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + + /* check for system-wide branch_stack events */ + if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) + perf_branch_stack_sched_in(prev, task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2778,7 +2856,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2789,7 +2867,15 @@ static void free_event(struct perf_event *event) put_callchain_buffers(); if (is_cgroup_event(event)) { atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); + } + + if (has_branch_stack(event)) { + static_key_slow_dec_deferred(&perf_sched_events); + /* is system-wide event */ + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_dec(&per_cpu(perf_branch_stack_events, + event->cpu)); } } @@ -3238,10 +3324,6 @@ int perf_event_task_disable(void) return 0; } -#ifndef PERF_EVENT_INDEX_OFFSET -# define PERF_EVENT_INDEX_OFFSET 0 -#endif - static int perf_event_index(struct perf_event *event) { if (event->hw.state & PERF_HES_STOPPED) @@ -3250,21 +3332,26 @@ static int perf_event_index(struct perf_event *event) if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; - return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; + return event->pmu->event_idx(event); } static void calc_timer_values(struct perf_event *event, + u64 *now, u64 *enabled, u64 *running) { - u64 now, ctx_time; + u64 ctx_time; - now = perf_clock(); - ctx_time = event->shadow_ctx_time + now; + *now = perf_clock(); + ctx_time = event->shadow_ctx_time + *now; *enabled = ctx_time - event->tstamp_enabled; *running = ctx_time - event->tstamp_running; } +void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. 
We can not serialize this because the arch @@ -3274,7 +3361,7 @@ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct ring_buffer *rb; - u64 enabled, running; + u64 enabled, running, now; rcu_read_lock(); /* @@ -3286,7 +3373,7 @@ void perf_event_update_userpage(struct perf_event *event) * because of locking issue as we can be called in * NMI context */ - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); rb = rcu_dereference(event->rb); if (!rb) goto unlock; @@ -3302,7 +3389,7 @@ void perf_event_update_userpage(struct perf_event *event) barrier(); userpg->index = perf_event_index(event); userpg->offset = perf_event_count(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) + if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = enabled + @@ -3311,6 +3398,8 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); + perf_update_user_clock(userpg, now); + barrier(); ++userpg->lock; preempt_enable(); @@ -3568,6 +3657,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) event->mmap_user = get_current_user(); vma->vm_mm->pinned_vm += event->mmap_locked; + perf_event_update_userpage(event); + unlock: if (!ret) atomic_inc(&event->mmap_count); @@ -3799,7 +3890,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { - u64 enabled = 0, running = 0; + u64 enabled = 0, running = 0, now; u64 read_format = event->attr.read_format; /* @@ -3812,7 +3903,7 @@ static void perf_output_read(struct perf_output_handle *handle, * NMI context */ if (read_format & PERF_FORMAT_TOTAL_TIMES) - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); @@ -3902,6 +3993,24 @@ void perf_output_sample(struct perf_output_handle *handle, } } } + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + if (data->br_stack) { + size_t size; + + size = data->br_stack->nr + * sizeof(struct perf_branch_entry); + + perf_output_put(handle, data->br_stack->nr); + perf_output_copy(handle, data->br_stack->entries, size); + } else { + /* + * we always store at least the value of nr + */ + u64 nr = 0; + perf_output_put(handle, nr); + } + } } void perf_prepare_sample(struct perf_event_header *header, @@ -3944,6 +4053,15 @@ void perf_prepare_sample(struct perf_event_header *header, WARN_ON_ONCE(size & (sizeof(u64)-1)); header->size += size; } + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + int size = sizeof(u64); /* nr */ + if (data->br_stack) { + size += data->br_stack->nr + * sizeof(struct perf_branch_entry); + } + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -4986,7 +5104,7 @@ fail: return err; } -struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { @@ -4994,7 +5112,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); - jump_label_dec(&perf_swevent_enabled[event_id]); + static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(event); } @@ -5005,6 +5123,12 @@ static int perf_swevent_init(struct perf_event *event) if 
(event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: case PERF_COUNT_SW_TASK_CLOCK: @@ -5024,13 +5148,18 @@ static int perf_swevent_init(struct perf_event *event) if (err) return err; - jump_label_inc(&perf_swevent_enabled[event_id]); + static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } return 0; } +static int perf_swevent_event_idx(struct perf_event *event) +{ + return 0; +} + static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, @@ -5040,6 +5169,8 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .event_idx = perf_swevent_event_idx, }; #ifdef CONFIG_EVENT_TRACING @@ -5108,6 +5239,12 @@ static int perf_tp_event_init(struct perf_event *event) if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; + /* + * no branch sampling for tracepoint events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + err = perf_trace_init(event); if (err) return err; @@ -5126,6 +5263,8 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .event_idx = perf_swevent_event_idx, }; static inline void perf_tp_register(void) @@ -5331,6 +5470,12 @@ static int cpu_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + perf_swevent_init_hrtimer(event); return 0; @@ -5345,6 +5490,8 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, + + .event_idx = perf_swevent_event_idx, }; /* @@ -5403,6 +5550,12 @@ static int task_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + perf_swevent_init_hrtimer(event); return 0; @@ -5417,6 +5570,8 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, + + .event_idx = perf_swevent_event_idx, }; static void perf_pmu_nop_void(struct pmu *pmu) @@ -5444,6 +5599,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) perf_pmu_enable(pmu); } +static int perf_event_idx_default(struct perf_event *event) +{ + return event->hw.idx + 1; +} + /* * Ensures all contexts with the same task_ctx_nr have the same * pmu_cpu_context too. 
@@ -5530,6 +5690,7 @@ static int pmu_dev_alloc(struct pmu *pmu) if (!pmu->dev) goto out; + pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) @@ -5633,6 +5794,9 @@ got_cpu_context: pmu->pmu_disable = perf_pmu_nop_void; } + if (!pmu->event_idx) + pmu->event_idx = perf_event_idx_default; + list_add_rcu(&pmu->entry, &pmus); ret = 0; unlock: @@ -5825,7 +5989,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -5839,6 +6003,12 @@ done: return ERR_PTR(err); } } + if (has_branch_stack(event)) { + static_key_slow_inc(&perf_sched_events.key); + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_inc(&per_cpu(perf_branch_stack_events, + event->cpu)); + } } return event; @@ -5908,6 +6078,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->read_format & ~(PERF_FORMAT_MAX-1)) return -EINVAL; + if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { + u64 mask = attr->branch_sample_type; + + /* only using defined bits */ + if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) + return -EINVAL; + + /* at least one branch bit must be set */ + if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) + return -EINVAL; + + /* kernel level capture: check permissions */ + if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) + && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* propagate priv level, when not set for branch */ + if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { + + /* exclude_kernel checked on syscall entry */ + if (!attr->exclude_kernel) + mask |= PERF_SAMPLE_BRANCH_KERNEL; + + if (!attr->exclude_user) + mask |= PERF_SAMPLE_BRANCH_USER; + + if (!attr->exclude_hv) + mask |= PERF_SAMPLE_BRANCH_HV; + /* + * adjust user setting (for HW filter setup) + */ + attr->branch_sample_type = mask; + } + } out: return ret; @@ -6063,7 +6267,7 @@ SYSCALL_DEFINE5(perf_event_open, * - that may need work on context switch */ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); } /* diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index ee706ce..bb38c4d 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp) if (bp->attr.type != PERF_TYPE_BREAKPOINT) return -ENOENT; + /* + * no branch sampling for breakpoint events + */ + if (has_branch_stack(bp)) + return -EOPNOTSUPP; + err = register_perf_hw_breakpoint(bp); if (err) return err; @@ -613,6 +619,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) bp->hw.state = PERF_HES_STOPPED; } +static int hw_breakpoint_event_idx(struct perf_event *bp) +{ + return 0; +} + static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ @@ -622,6 +633,8 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, + + .event_idx = hw_breakpoint_event_idx, }; int __init init_hw_breakpoint(void) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 25784d6..6080f6b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -16,6 +16,8 @@ #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <trace/events/irq.h> + #include 
"internals.h" /** diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 01d3b70..4304919 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -12,7 +12,7 @@ #include <linux/slab.h> #include <linux/sort.h> #include <linux/err.h> -#include <linux/jump_label.h> +#include <linux/static_key.h> #ifdef HAVE_JUMP_LABEL @@ -29,11 +29,6 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } -bool jump_label_enabled(struct jump_label_key *key) -{ - return !!atomic_read(&key->enabled); -} - static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; @@ -58,56 +53,66 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static void jump_label_update(struct jump_label_key *key, int enable); +static void jump_label_update(struct static_key *key, int enable); -void jump_label_inc(struct jump_label_key *key) +void static_key_slow_inc(struct static_key *key) { if (atomic_inc_not_zero(&key->enabled)) return; jump_label_lock(); - if (atomic_read(&key->enabled) == 0) - jump_label_update(key, JUMP_LABEL_ENABLE); + if (atomic_read(&key->enabled) == 0) { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_ENABLE); + else + jump_label_update(key, JUMP_LABEL_DISABLE); + } atomic_inc(&key->enabled); jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_inc); +EXPORT_SYMBOL_GPL(static_key_slow_inc); -static void __jump_label_dec(struct jump_label_key *key, +static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { - if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) + if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { + WARN(atomic_read(&key->enabled) < 0, + "jump label: negative count!\n"); return; + } if (rate_limit) { atomic_inc(&key->enabled); schedule_delayed_work(work, rate_limit); - } else - jump_label_update(key, JUMP_LABEL_DISABLE); - + } else { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_DISABLE); + else + jump_label_update(key, JUMP_LABEL_ENABLE); + } jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_dec); static void jump_label_update_timeout(struct work_struct *work) { - struct jump_label_key_deferred *key = - container_of(work, struct jump_label_key_deferred, work.work); - __jump_label_dec(&key->key, 0, NULL); + struct static_key_deferred *key = + container_of(work, struct static_key_deferred, work.work); + __static_key_slow_dec(&key->key, 0, NULL); } -void jump_label_dec(struct jump_label_key *key) +void static_key_slow_dec(struct static_key *key) { - __jump_label_dec(key, 0, NULL); + __static_key_slow_dec(key, 0, NULL); } +EXPORT_SYMBOL_GPL(static_key_slow_dec); -void jump_label_dec_deferred(struct jump_label_key_deferred *key) +void static_key_slow_dec_deferred(struct static_key_deferred *key) { - __jump_label_dec(&key->key, key->timeout, &key->work); + __static_key_slow_dec(&key->key, key->timeout, &key->work); } +EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); - -void jump_label_rate_limit(struct jump_label_key_deferred *key, +void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { key->timeout = rl; @@ -150,7 +155,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry arch_jump_label_transform(entry, type); } -static void __jump_label_update(struct jump_label_key *key, +static void __jump_label_update(struct static_key *key, struct jump_entry 
*entry, struct jump_entry *stop, int enable) { @@ -167,27 +172,40 @@ static void __jump_label_update(struct jump_label_key *key, } } +static enum jump_label_type jump_label_type(struct static_key *key) +{ + bool true_branch = jump_label_get_branch_default(key); + bool state = static_key_enabled(key); + + if ((!true_branch && state) || (true_branch && !state)) + return JUMP_LABEL_ENABLE; + + return JUMP_LABEL_DISABLE; +} + void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; - struct jump_label_key *key = NULL; + struct static_key *key = NULL; struct jump_entry *iter; jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; + struct static_key *iterk; - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + iterk = (struct static_key *)(unsigned long)iter->key; + arch_jump_label_transform_static(iter, jump_label_type(iterk)); if (iterk == key) continue; key = iterk; - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. + */ + *((unsigned long *)&key->entries) += (unsigned long)iter; #ifdef CONFIG_MODULES key->next = NULL; #endif @@ -197,8 +215,8 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES -struct jump_label_mod { - struct jump_label_mod *next; +struct static_key_mod { + struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; @@ -218,9 +236,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end) start, end); } -static void __jump_label_mod_update(struct jump_label_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key, int enable) { - struct jump_label_mod *mod = key->next; + struct static_key_mod *mod = key->next; while (mod) { struct module *m = mod->mod; @@ -251,11 +269,7 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; - - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); } } @@ -264,8 +278,8 @@ static int jump_label_add_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm; + struct static_key *key = NULL; + struct static_key_mod *jlm; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) @@ -274,28 +288,30 @@ static int jump_label_add_module(struct module *mod) jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) - continue; + struct static_key *iterk; - key = (struct jump_label_key *)(unsigned long)iter->key; + iterk = (struct static_key *)(unsigned long)iter->key; + if (iterk == key) + continue; + key = iterk; if (__module_address(iter->key) == mod) { - atomic_set(&key->enabled, 0); - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. 
+ */ + *((unsigned long *)&key->entries) += (unsigned long)iter; key->next = NULL; continue; } - - jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); + jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; - jlm->mod = mod; jlm->entries = iter; jlm->next = key->next; key->next = jlm; - if (jump_label_enabled(key)) + if (jump_label_type(key) == JUMP_LABEL_ENABLE) __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); } @@ -307,14 +323,14 @@ static void jump_label_del_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm, **prev; + struct static_key *key = NULL; + struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (iter->key == (jump_label_t)(unsigned long)key) continue; - key = (struct jump_label_key *)(unsigned long)iter->key; + key = (struct static_key *)(unsigned long)iter->key; if (__module_address(iter->key) == mod) continue; @@ -416,12 +432,13 @@ int jump_label_text_reserved(void *start, void *end) return ret; } -static void jump_label_update(struct jump_label_key *key, int enable) +static void jump_label_update(struct static_key *key, int enable) { - struct jump_entry *entry = key->entries, *stop = __stop___jump_table; + struct jump_entry *stop = __stop___jump_table; + struct jump_entry *entry = jump_label_get_entries(key); #ifdef CONFIG_MODULES - struct module *mod = __module_address((jump_label_t)key); + struct module *mod = __module_address((unsigned long)key); __jump_label_mod_update(key, enable); diff --git a/kernel/printk.c b/kernel/printk.c index 32690a0..0b3ea2c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -44,6 +44,9 @@ #include <asm/uaccess.h> +#define CREATE_TRACE_POINTS +#include <trace/events/printk.h> + /* * Architectures can override it: */ @@ -542,6 +545,8 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" static void _call_console_drivers(unsigned start, unsigned end, int msg_log_level) { + trace_console(&LOG_BUF(0), start, end, log_buf_len); + if ((msg_log_level < console_loglevel || ignore_loglevel) && console_drivers && start != end) { if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b342f57..6c41ba4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -162,13 +162,13 @@ static int sched_feat_show(struct seq_file *m, void *v) #ifdef HAVE_JUMP_LABEL -#define jump_label_key__true jump_label_key_enabled -#define jump_label_key__false jump_label_key_disabled +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE #define SCHED_FEAT(name, enabled) \ jump_label_key__##enabled , -struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { #include "features.h" }; @@ -176,14 +176,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (jump_label_enabled(&sched_feat_keys[i])) - jump_label_dec(&sched_feat_keys[i]); + if (static_key_enabled(&sched_feat_keys[i])) + static_key_slow_dec(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!jump_label_enabled(&sched_feat_keys[i])) - jump_label_inc(&sched_feat_keys[i]); + if (!static_key_enabled(&sched_feat_keys[i])) + static_key_slow_inc(&sched_feat_keys[i]); } #else static void 
sched_feat_disable(int i) { }; @@ -894,7 +894,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) delta -= irq_delta; #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_branch((¶virt_steal_rq_enabled))) { + if (static_key_false((¶virt_steal_rq_enabled))) { u64 st; steal = paravirt_steal_clock(cpu_of(rq)); @@ -2755,7 +2755,7 @@ void account_idle_time(cputime_t cputime) static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT - if (static_branch(¶virt_steal_enabled)) { + if (static_key_false(¶virt_steal_enabled)) { u64 steal, st = 0; steal = paravirt_steal_clock(smp_processor_id()); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aca16b8..fd974fa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1401,20 +1401,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_CFS_BANDWIDTH #ifdef HAVE_JUMP_LABEL -static struct jump_label_key __cfs_bandwidth_used; +static struct static_key __cfs_bandwidth_used; static inline bool cfs_bandwidth_used(void) { - return static_branch(&__cfs_bandwidth_used); + return static_key_false(&__cfs_bandwidth_used); } void account_cfs_bandwidth_used(int enabled, int was_enabled) { /* only need to count groups transitioning between enabled/!enabled */ if (enabled && !was_enabled) - jump_label_inc(&__cfs_bandwidth_used); + static_key_slow_inc(&__cfs_bandwidth_used); else if (!enabled && was_enabled) - jump_label_dec(&__cfs_bandwidth_used); + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c26..b4cd6d8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -611,7 +611,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include <linux/jump_label.h> +# include <linux/static_key.h> # define const_debug __read_mostly #else # define const_debug const @@ -630,18 +630,18 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct jump_label_key *key) +static __always_inline bool static_branch__true(struct static_key *key) { - return likely(static_branch(key)); /* Not out of line branch. */ + return static_key_true(key); /* Not out of line branch. */ } -static __always_inline bool static_branch__false(struct jump_label_key *key) +static __always_inline bool static_branch__false(struct static_key *key) { - return unlikely(static_branch(key)); /* Out of line branch. */ + return static_key_false(key); /* Out of line branch. 
*/ } #define SCHED_FEAT(name, enabled) \ -static __always_inline bool static_branch_##name(struct jump_label_key *key) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ { \ return static_branch__##enabled(key); \ } @@ -650,7 +650,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \ #undef SCHED_FEAT -extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) diff --git a/kernel/signal.c b/kernel/signal.c index c73c428..8511e39 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1054,13 +1054,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigpending *pending; struct sigqueue *q; int override_rlimit; - - trace_signal_generate(sig, info, t); + int ret = 0, result; assert_spin_locked(&t->sighand->siglock); + result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, from_ancestor_ns)) - return 0; + goto ret; pending = group ? &t->signal->shared_pending : &t->pending; /* @@ -1068,8 +1068,11 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, * exactly one non-rt signal, so that we can get more * detailed information about the cause of the signal. */ + result = TRACE_SIGNAL_ALREADY_PENDING; if (legacy_queue(pending, sig)) - return 0; + goto ret; + + result = TRACE_SIGNAL_DELIVERED; /* * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. @@ -1127,14 +1130,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, * signal was rt and sent by user using something * other than kill(). */ - trace_signal_overflow_fail(sig, group, info); - return -EAGAIN; + result = TRACE_SIGNAL_OVERFLOW_FAIL; + ret = -EAGAIN; + goto ret; } else { /* * This is a silent loss of information. We still * send the signal, but the *info bits are lost. 
*/ - trace_signal_lose_info(sig, group, info); + result = TRACE_SIGNAL_LOSE_INFO; } } @@ -1142,7 +1146,9 @@ out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); complete_signal(sig, t, group); - return 0; +ret: + trace_signal_generate(sig, info, t, group, result); + return ret; } static int send_signal(int sig, struct siginfo *info, struct task_struct *t, @@ -1585,7 +1591,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) int sig = q->info.si_signo; struct sigpending *pending; unsigned long flags; - int ret; + int ret, result; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); @@ -1594,6 +1600,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) goto ret; ret = 1; /* the signal is ignored */ + result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, 0)) goto out; @@ -1605,6 +1612,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) */ BUG_ON(q->info.si_code != SI_TIMER); q->info.si_overrun++; + result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } q->info.si_overrun = 0; @@ -1614,7 +1622,9 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); complete_signal(sig, t, group); + result = TRACE_SIGNAL_DELIVERED; out: + trace_signal_generate(sig, &q->info, t, group, result); unlock_task_sighand(t, &flags); ret: return ret; diff --git a/kernel/softirq.c b/kernel/softirq.c index c82d95a..8afc6a8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -375,6 +375,12 @@ void raise_softirq(unsigned int nr) local_irq_restore(flags); } +void __raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise(nr); + or_softirq_pending(1UL << nr); +} + void open_softirq(int nr, void (*action)(struct softirq_action *)) { softirq_vec[nr].action = action; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 683d559..867bd1d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,6 +62,8 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; @@ -89,12 +91,14 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { }; static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; +static struct ftrace_ops control_ops; static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); @@ -168,6 +172,32 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) } #endif +static void control_ops_disable_all(struct ftrace_ops *ops) +{ + int cpu; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(ops->disabled, cpu) = 1; +} + +static int control_ops_alloc(struct ftrace_ops *ops) +{ + int __percpu *disabled; + + disabled = alloc_percpu(int); + if (!disabled) + return -ENOMEM; + + ops->disabled = disabled; + control_ops_disable_all(ops); + return 0; +} + +static void control_ops_free(struct 
ftrace_ops *ops) +{ + free_percpu(ops->disabled); +} + static void update_global_ops(void) { ftrace_func_t func; @@ -259,6 +289,26 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) return 0; } +static void add_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) +{ + int first = *list == &ftrace_list_end; + add_ftrace_ops(list, ops); + if (first) + add_ftrace_ops(&ftrace_ops_list, main_ops); +} + +static int remove_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) +{ + int ret = remove_ftrace_ops(list, ops); + if (!ret && *list == &ftrace_list_end) + ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); + return ret; +} + static int __register_ftrace_function(struct ftrace_ops *ops) { if (ftrace_disabled) @@ -270,15 +320,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EBUSY; + /* We don't support both control and global flags set. */ + if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) + return -EINVAL; + if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - int first = ftrace_global_list == &ftrace_list_end; - add_ftrace_ops(&ftrace_global_list, ops); + add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); ops->flags |= FTRACE_OPS_FL_ENABLED; - if (first) - add_ftrace_ops(&ftrace_ops_list, &global_ops); + } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (control_ops_alloc(ops)) + return -ENOMEM; + add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); } else add_ftrace_ops(&ftrace_ops_list, ops); @@ -302,11 +357,23 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) return -EINVAL; if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ret = remove_ftrace_ops(&ftrace_global_list, ops); - if (!ret && ftrace_global_list == &ftrace_list_end) - ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); + ret = remove_ftrace_list_ops(&ftrace_global_list, + &global_ops, ops); if (!ret) ops->flags &= ~FTRACE_OPS_FL_ENABLED; + } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + ret = remove_ftrace_list_ops(&ftrace_control_list, + &control_ops, ops); + if (!ret) { + /* + * The ftrace_ops is now removed from the list, + * so there'll be no new users. We must ensure + * all current users are done before we free + * the control data. 
+ */ + synchronize_sched(); + control_ops_free(ops); + } } else ret = remove_ftrace_ops(&ftrace_ops_list, ops); @@ -1119,6 +1186,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); } +void ftrace_free_filter(struct ftrace_ops *ops) +{ + free_ftrace_hash(ops->filter_hash); + free_ftrace_hash(ops->notrace_hash); +} + static struct ftrace_hash *alloc_ftrace_hash(int size_bits) { struct ftrace_hash *hash; @@ -1129,7 +1202,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits) return NULL; size = 1 << size_bits; - hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); + hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); if (!hash->buckets) { kfree(hash); @@ -3146,8 +3219,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_lock(&ftrace_regex_lock); if (reset) ftrace_filter_reset(hash); - if (buf) - ftrace_match_records(hash, buf, len); + if (buf && !ftrace_match_records(hash, buf, len)) { + ret = -EINVAL; + goto out_regex_unlock; + } mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); @@ -3157,6 +3232,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_unlock(&ftrace_lock); + out_regex_unlock: mutex_unlock(&ftrace_regex_lock); free_ftrace_hash(hash); @@ -3173,10 +3249,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. */ -void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { - ftrace_set_regex(ops, buf, len, reset, 1); + return ftrace_set_regex(ops, buf, len, reset, 1); } EXPORT_SYMBOL_GPL(ftrace_set_filter); @@ -3191,10 +3267,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); * is enabled. If @buf is NULL and reset is set, all functions will be enabled * for tracing. */ -void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { - ftrace_set_regex(ops, buf, len, reset, 0); + return ftrace_set_regex(ops, buf, len, reset, 0); } EXPORT_SYMBOL_GPL(ftrace_set_notrace); /** @@ -3871,6 +3947,36 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) #endif /* CONFIG_DYNAMIC_FTRACE */ static void +ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op; + + if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) + return; + + /* + * Some of the ops may be dynamically allocated, + * they must be freed after a synchronize_sched(). 
+ */ + preempt_disable_notrace(); + trace_recursion_set(TRACE_CONTROL_BIT); + op = rcu_dereference_raw(ftrace_control_list); + while (op != &ftrace_list_end) { + if (!ftrace_function_local_disabled(op) && + ftrace_ops_test(op, ip)) + op->func(ip, parent_ip); + + op = rcu_dereference_raw(op->next); + }; + trace_recursion_clear(TRACE_CONTROL_BIT); + preempt_enable_notrace(); +} + +static struct ftrace_ops control_ops = { + .func = ftrace_ops_control_func, +}; + +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3f1bc5..10d5503 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2764,12 +2764,12 @@ static const char readme_msg[] = "tracing mini-HOWTO:\n\n" "# mount -t debugfs nodev /sys/kernel/debug\n\n" "# cat /sys/kernel/debug/tracing/available_tracers\n" - "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" + "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" "# cat /sys/kernel/debug/tracing/current_tracer\n" "nop\n" - "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" + "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" "# cat /sys/kernel/debug/tracing/current_tracer\n" - "sched_switch\n" + "wakeup\n" "# cat /sys/kernel/debug/tracing/trace_options\n" "noprint-parent nosym-offset nosym-addr noverbose\n" "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b93ecba..54faec7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -56,17 +56,23 @@ enum trace_type { #define F_STRUCT(args...) args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ - struct struct_name { \ - struct trace_entry ent; \ - tstruct \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ } #undef TP_ARGS #define TP_ARGS(args...) args #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) + +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" @@ -288,6 +294,8 @@ struct tracer { /* for function tracing recursion */ #define TRACE_INTERNAL_BIT (1<<11) #define TRACE_GLOBAL_BIT (1<<12) +#define TRACE_CONTROL_BIT (1<<13) + /* * Abuse of the trace_recursion. 
* As we need a way to maintain state if we are tracing the function @@ -589,6 +597,8 @@ static inline int ftrace_trace_task(struct task_struct *task) static inline int ftrace_is_dead(void) { return 0; } #endif +int ftrace_event_is_function(struct ftrace_event_call *call); + /* * struct trace_parser - servers for reading the user input separated by spaces * @cont: set if the input is not complete - no final space char was found @@ -766,9 +776,7 @@ struct filter_pred { u64 val; struct regex regex; unsigned short *ops; -#ifdef CONFIG_FTRACE_STARTUP_TEST struct ftrace_event_field *field; -#endif int offset; int not; int op; @@ -818,12 +826,22 @@ extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ __attribute__((__aligned__(4))) event_##call; #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" +#ifdef CONFIG_PERF_EVENTS +#ifdef CONFIG_FUNCTION_TRACER +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data); +#else +#define perf_ftrace_event_register NULL +#endif /* CONFIG_FUNCTION_TRACER */ +#endif /* CONFIG_PERF_EVENTS */ + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 9336590..d91eb05 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -55,7 +55,7 @@ /* * Function trace entry - function address and parent function address: */ -FTRACE_ENTRY(function, ftrace_entry, +FTRACE_ENTRY_REG(function, ftrace_entry, TRACE_FN, @@ -64,7 +64,11 @@ FTRACE_ENTRY(function, ftrace_entry, __field( unsigned long, parent_ip ) ), - F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + + FILTER_TRACE_FN, + + perf_ftrace_event_register ); /* Function call entry */ @@ -78,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %lx (%d)", __entry->func, __entry->depth) + F_printk("--> %lx (%d)", __entry->func, __entry->depth), + + FILTER_OTHER ); /* Function return entry */ @@ -98,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", __entry->func, __entry->depth, __entry->calltime, __entry->rettime, - __entry->depth) + __entry->depth), + + FILTER_OTHER ); /* @@ -127,8 +135,9 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry, F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu - ) + __entry->next_cpu), + + FILTER_OTHER ); /* @@ -146,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu - ) + __entry->next_cpu), + + FILTER_OTHER ); /* @@ -169,7 +179,9 @@ FTRACE_ENTRY(kernel_stack, stack_entry, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", 
__entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]) + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER ); FTRACE_ENTRY(user_stack, userstack_entry, @@ -185,7 +197,9 @@ FTRACE_ENTRY(user_stack, userstack_entry, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]) + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER ); /* @@ -202,7 +216,9 @@ FTRACE_ENTRY(bprint, bprint_entry, ), F_printk("%08lx fmt:%p", - __entry->ip, __entry->fmt) + __entry->ip, __entry->fmt), + + FILTER_OTHER ); FTRACE_ENTRY(print, print_entry, @@ -215,7 +231,9 @@ FTRACE_ENTRY(print, print_entry, ), F_printk("%08lx %s", - __entry->ip, __entry->buf) + __entry->ip, __entry->buf), + + FILTER_OTHER ); FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, @@ -234,7 +252,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, F_printk("%lx %lx %lx %d %x %x", (unsigned long)__entry->phys, __entry->value, __entry->pc, - __entry->map_id, __entry->opcode, __entry->width) + __entry->map_id, __entry->opcode, __entry->width), + + FILTER_OTHER ); FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, @@ -252,7 +272,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, F_printk("%lx %lx %lx %d %x", (unsigned long)__entry->phys, __entry->virt, __entry->len, - __entry->map_id, __entry->opcode) + __entry->map_id, __entry->opcode), + + FILTER_OTHER ); @@ -272,6 +294,8 @@ FTRACE_ENTRY(branch, trace_branch, F_printk("%u:%s:%s (%u)", __entry->line, - __entry->func, __entry->file, __entry->correct) + __entry->func, __entry->file, __entry->correct), + + FILTER_OTHER ); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 19a359d..fee3752 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,6 +24,11 @@ static int total_ref_count; static int perf_trace_event_perm(struct ftrace_event_call *tp_event, struct perf_event *p_event) { + /* The ftrace function trace is allowed only for root. 
*/ + if (ftrace_event_is_function(tp_event) && + perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* No tracing, just counting, so no obvious leak */ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) return 0; @@ -44,23 +49,17 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, return 0; } -static int perf_trace_event_init(struct ftrace_event_call *tp_event, - struct perf_event *p_event) +static int perf_trace_event_reg(struct ftrace_event_call *tp_event, + struct perf_event *p_event) { struct hlist_head __percpu *list; - int ret; + int ret = -ENOMEM; int cpu; - ret = perf_trace_event_perm(tp_event, p_event); - if (ret) - return ret; - p_event->tp_event = tp_event; if (tp_event->perf_refcount++ > 0) return 0; - ret = -ENOMEM; - list = alloc_percpu(struct hlist_head); if (!list) goto fail; @@ -83,7 +82,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, } } - ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); if (ret) goto fail; @@ -108,6 +107,69 @@ fail: return ret; } +static void perf_trace_event_unreg(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + int i; + + if (--tp_event->perf_refcount > 0) + goto out; + + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); + + /* + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. + */ + tracepoint_synchronize_unregister(); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } +out: + module_put(tp_event->mod); +} + +static int perf_trace_event_open(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); +} + +static void perf_trace_event_close(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); +} + +static int perf_trace_event_init(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + int ret; + + ret = perf_trace_event_perm(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_reg(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_open(p_event); + if (ret) { + perf_trace_event_unreg(p_event); + return ret; + } + + return 0; +} + int perf_trace_init(struct perf_event *p_event) { struct ftrace_event_call *tp_event; @@ -130,6 +192,14 @@ int perf_trace_init(struct perf_event *p_event) return ret; } +void perf_trace_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); +} + int perf_trace_add(struct perf_event *p_event, int flags) { struct ftrace_event_call *tp_event = p_event->tp_event; @@ -146,43 +216,14 @@ int perf_trace_add(struct perf_event *p_event, int flags) list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); - return 0; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); } void perf_trace_del(struct perf_event *p_event, int flags) { - hlist_del_rcu(&p_event->hlist_entry); -} - -void perf_trace_destroy(struct perf_event *p_event) -{ struct ftrace_event_call *tp_event = p_event->tp_event; - int i; - - mutex_lock(&event_mutex); - if 
(--tp_event->perf_refcount > 0) - goto out; - - tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - - /* - * Ensure our callback won't be called anymore. The buffers - * will be freed after that. - */ - tracepoint_synchronize_unregister(); - - free_percpu(tp_event->perf_events); - tp_event->perf_events = NULL; - - if (!--total_ref_count) { - for (i = 0; i < PERF_NR_CONTEXTS; i++) { - free_percpu(perf_trace_buf[i]); - perf_trace_buf[i] = NULL; - } - } -out: - module_put(tp_event->mod); - mutex_unlock(&event_mutex); + hlist_del_rcu(&p_event->hlist_entry); + tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, @@ -214,3 +255,86 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, return raw_data; } EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); + +#ifdef CONFIG_FUNCTION_TRACER +static void +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_entry *entry; + struct hlist_head *head; + struct pt_regs regs; + int rctx; + +#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ + sizeof(u64)) - sizeof(u32)) + + BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + + perf_fetch_caller_regs(®s); + + entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); + if (!entry) + return; + + entry->ip = ip; + entry->parent_ip = parent_ip; + + head = this_cpu_ptr(event_function.perf_events); + perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, + 1, ®s, head); + +#undef ENTRY_SIZE +} + +static int perf_ftrace_function_register(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + + ops->flags |= FTRACE_OPS_FL_CONTROL; + ops->func = perf_ftrace_function_call; + return register_ftrace_function(ops); +} + +static int perf_ftrace_function_unregister(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + int ret = unregister_ftrace_function(ops); + ftrace_free_filter(ops); + return ret; +} + +static void perf_ftrace_function_enable(struct perf_event *event) +{ + ftrace_function_local_enable(&event->ftrace_ops); +} + +static void perf_ftrace_function_disable(struct perf_event *event) +{ + ftrace_function_local_disable(&event->ftrace_ops); +} + +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data) +{ + switch (type) { + case TRACE_REG_REGISTER: + case TRACE_REG_UNREGISTER: + break; + case TRACE_REG_PERF_REGISTER: + case TRACE_REG_PERF_UNREGISTER: + return 0; + case TRACE_REG_PERF_OPEN: + return perf_ftrace_function_register(data); + case TRACE_REG_PERF_CLOSE: + return perf_ftrace_function_unregister(data); + case TRACE_REG_PERF_ADD: + perf_ftrace_function_enable(data); + return 0; + case TRACE_REG_PERF_DEL: + perf_ftrace_function_disable(data); + return 0; + } + + return -EINVAL; +} +#endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c212a7f..079a93a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -147,7 +147,8 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); -int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +int ftrace_event_reg(struct ftrace_event_call *call, + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -170,6 +171,11 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) call->class->perf_probe, call); return 0; + case TRACE_REG_PERF_OPEN: 
+ case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; #endif } return 0; @@ -209,7 +215,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, tracing_stop_cmdline_record(); call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; } - call->class->reg(call, TRACE_REG_UNREGISTER); + call->class->reg(call, TRACE_REG_UNREGISTER, NULL); } break; case 1: @@ -218,7 +224,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, tracing_start_cmdline_record(); call->flags |= TRACE_EVENT_FL_RECORDED_CMD; } - ret = call->class->reg(call, TRACE_REG_REGISTER); + ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 24aee71..431dba8 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -81,6 +81,7 @@ enum { FILT_ERR_TOO_MANY_PREDS, FILT_ERR_MISSING_FIELD, FILT_ERR_INVALID_FILTER, + FILT_ERR_IP_FIELD_ONLY, }; static char *err_text[] = { @@ -96,6 +97,7 @@ static char *err_text[] = { "Too many terms in predicate expression", "Missing field name and/or value", "Meaningless filter expression", + "Only 'ip' field is supported for function trace", }; struct opstack_op { @@ -685,7 +687,7 @@ find_event_field(struct ftrace_event_call *call, char *name) static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) { - stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); if (!stack->preds) return -ENOMEM; stack->index = n_preds; @@ -826,8 +828,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) if (filter->preds) __free_preds(filter); - filter->preds = - kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); + filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL); if (!filter->preds) return -ENOMEM; @@ -900,6 +901,11 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } +static bool is_function_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_TRACE_FN; +} + static bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || @@ -987,6 +993,11 @@ static int init_pred(struct filter_parse_state *ps, fn = filter_pred_strloc; else fn = filter_pred_pchar; + } else if (is_function_field(field)) { + if (strcmp(field->name, "ip")) { + parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0); + return -EINVAL; + } } else { if (field->is_signed) ret = strict_strtoll(pred->regex.pattern, 0, &val); @@ -1334,10 +1345,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, strcpy(pred.regex.pattern, operand2); pred.regex.len = strlen(pred.regex.pattern); - -#ifdef CONFIG_FTRACE_STARTUP_TEST pred.field = field; -#endif return init_pred(ps, field, &pred) ? 
NULL : &pred; } @@ -1486,7 +1494,7 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) children = count_leafs(preds, &preds[root->left]); children += count_leafs(preds, &preds[root->right]); - root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL); if (!root->ops) return -ENOMEM; @@ -1950,6 +1958,148 @@ void ftrace_profile_free_filter(struct perf_event *event) __free_filter(filter); } +struct function_filter_data { + struct ftrace_ops *ops; + int first_filter; + int first_notrace; +}; + +#ifdef CONFIG_FUNCTION_TRACER +static char ** +ftrace_function_filter_re(char *buf, int len, int *count) +{ + char *str, *sep, **re; + + str = kstrndup(buf, len, GFP_KERNEL); + if (!str) + return NULL; + + /* + * The argv_split function takes white space + * as a separator, so convert ',' into spaces. + */ + while ((sep = strchr(str, ','))) + *sep = ' '; + + re = argv_split(GFP_KERNEL, str, count); + kfree(str); + return re; +} + +static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, + int reset, char *re, int len) +{ + int ret; + + if (filter) + ret = ftrace_set_filter(ops, re, len, reset); + else + ret = ftrace_set_notrace(ops, re, len, reset); + + return ret; +} + +static int __ftrace_function_set_filter(int filter, char *buf, int len, + struct function_filter_data *data) +{ + int i, re_cnt, ret; + int *reset; + char **re; + + reset = filter ? &data->first_filter : &data->first_notrace; + + /* + * The 'ip' field could have multiple filters set, separated + * either by space or comma. We first cut the filter and apply + * all pieces separatelly. + */ + re = ftrace_function_filter_re(buf, len, &re_cnt); + if (!re) + return -EINVAL; + + for (i = 0; i < re_cnt; i++) { + ret = ftrace_function_set_regexp(data->ops, filter, *reset, + re[i], strlen(re[i])); + if (ret) + break; + + if (*reset) + *reset = 0; + } + + argv_free(re); + return ret; +} + +static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) +{ + struct ftrace_event_field *field = pred->field; + + if (leaf) { + /* + * Check the leaf predicate for function trace, verify: + * - only '==' and '!=' is used + * - the 'ip' field is used + */ + if ((pred->op != OP_EQ) && (pred->op != OP_NE)) + return -EINVAL; + + if (strcmp(field->name, "ip")) + return -EINVAL; + } else { + /* + * Check the non leaf predicate for function trace, verify: + * - only '||' is used + */ + if (pred->op != OP_OR) + return -EINVAL; + } + + return 0; +} + +static int ftrace_function_set_filter_cb(enum move_type move, + struct filter_pred *pred, + int *err, void *data) +{ + /* Checking the node is valid for function trace. */ + if ((move != MOVE_DOWN) || + (pred->left != FILTER_PRED_INVALID)) { + *err = ftrace_function_check_pred(pred, 0); + } else { + *err = ftrace_function_check_pred(pred, 1); + if (*err) + return WALK_PRED_ABORT; + + *err = __ftrace_function_set_filter(pred->op == OP_EQ, + pred->regex.pattern, + pred->regex.len, + data); + } + + return (*err) ? 
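
ftrace_function_filter_re() above lets the "ip" predicate carry several function patterns separated by spaces or commas: commas are rewritten to spaces and the string is split with argv_split(), after which each piece is fed to ftrace_set_filter() or ftrace_set_notrace(). A user-space sketch of that normalization step, using strtok_r() in place of argv_split() and an invented pattern string:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *pattern = "sys_read,sys_write schedule*";
	char *str = strdup(pattern);
	char *sep, *tok, *save;
	int count = 0;

	if (!str)
		return 1;

	/* argv_split() separates on whitespace, so turn ',' into ' ' */
	while ((sep = strchr(str, ',')))
		*sep = ' ';

	for (tok = strtok_r(str, " ", &save); tok;
	     tok = strtok_r(NULL, " ", &save))
		printf("pattern %d: %s\n", count++, tok);

	free(str);
	return 0;
}
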
WALK_PRED_ABORT : WALK_PRED_DEFAULT; +} + +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + struct function_filter_data data = { + .first_filter = 1, + .first_notrace = 1, + .ops = &event->ftrace_ops, + }; + + return walk_pred_tree(filter->preds, filter->root, + ftrace_function_set_filter_cb, &data); +} +#else +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + return -ENODEV; +} +#endif /* CONFIG_FUNCTION_TRACER */ + int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str) { @@ -1970,9 +2120,16 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, goto out_unlock; err = create_filter(call, filter_str, false, &filter); - if (!err) - event->filter = filter; + if (err) + goto free_filter; + + if (ftrace_event_is_function(call)) + err = ftrace_function_set_filter(event, filter); else + event->filter = filter; + +free_filter: + if (err || ftrace_event_is_function(call)) __free_filter(filter); out_unlock: diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index bbeec31..7b46c9b 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -18,6 +18,16 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM ftrace +/* + * The FTRACE_ENTRY_REG macro allows ftrace entry to define register + * function and thus become accesible via perf. + */ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) + /* not needed for this file */ #undef __field_struct #define __field_struct(type, item) @@ -44,21 +54,22 @@ #define F_printk(fmt, args...) fmt, args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -struct ____ftrace_##name { \ - tstruct \ -}; \ -static void __always_unused ____ftrace_check_##name(void) \ -{ \ - struct ____ftrace_##name *__entry = NULL; \ - \ - /* force compile-time check on F_printk() */ \ - printk(print); \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __always_unused ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force compile-time check on F_printk() */ \ + printk(print); \ } #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" @@ -67,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -77,7 +88,7 @@ static void __always_unused ____ftrace_check_##name(void) \ offsetof(typeof(field), \ container.item), \ sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -91,7 +102,7 @@ static void __always_unused ____ftrace_check_##name(void) \ ret = trace_define_field(event_call, event_storage, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), 
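
trace_export.c now builds every entry through FTRACE_ENTRY_REG, which threads an optional registration callback into the generated event class, while plain FTRACE_ENTRY forwards with a NULL regfn so existing entries are unaffected; that lets the function entry (declared in trace_entries.h, outside this hunk) hook up perf_ftrace_event_register. The toy translation unit below models just that macro layering; fake_event_class, demo_reg and the entry names are invented and far simpler than the real macros.

#include <stdio.h>

typedef int (*reg_fn)(void *call, int type, void *data);

struct fake_event_class {
	const char	*system;
	reg_fn		reg;
};

#define FTRACE_ENTRY_REG(name, regfn)				\
	struct fake_event_class event_class_##name = {		\
		.system	= "ftrace",				\
		.reg	= regfn,				\
	};

/* default: entries with no perf registration hook */
#define FTRACE_ENTRY(name)	FTRACE_ENTRY_REG(name, NULL)

static int demo_reg(void *call, int type, void *data)
{
	(void)call; (void)type; (void)data;
	return 0;
}

FTRACE_ENTRY(wakeup)			/* .reg stays NULL */
FTRACE_ENTRY_REG(function, demo_reg)	/* .reg wired to a callback */

int main(void)
{
	printf("wakeup reg: %s, function reg: %s\n",
	       event_class_wakeup.reg ? "set" : "NULL",
	       event_class_function.reg ? "set" : "NULL");
	return 0;
}
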
filter_type); \ mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ @@ -104,7 +115,7 @@ static void __always_unused ____ftrace_check_##name(void) \ offsetof(typeof(field), \ container.item), \ sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -112,17 +123,18 @@ static void __always_unused ____ftrace_check_##name(void) \ #define __dynamic_array(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - 0, is_signed_type(type), FILTER_OTHER);\ + 0, is_signed_type(type), filter_type);\ if (ret) \ return ret; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ int \ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ struct struct_name field; \ int ret; \ + int filter_type = filter; \ \ tstruct; \ \ @@ -152,13 +164,15 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #undef F_printk #define F_printk(fmt, args...) #fmt ", " __stringify(args) -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ + regfn) \ \ struct ftrace_event_class event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ + .reg = regfn, \ }; \ \ struct ftrace_event_call __used event_##call = { \ @@ -170,4 +184,14 @@ struct ftrace_event_call __used event_##call = { \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \ + FTRACE_ENTRY_REG(call, struct_name, etype, \ + PARAMS(tstruct), PARAMS(print), filter, NULL) + +int ftrace_event_is_function(struct ftrace_event_call *call) +{ + return call == &event_function; +} + #include "trace_entries.h" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 00d527c..580a05e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1892,7 +1892,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, #endif /* CONFIG_PERF_EVENTS */ static __kprobes -int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) +int kprobe_register(struct ftrace_event_call *event, + enum trace_reg type, void *data) { struct trace_probe *tp = (struct trace_probe *)event->data; @@ -1909,6 +1910,11 @@ int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) case TRACE_REG_PERF_UNREGISTER: disable_trace_probe(tp, TP_FLAG_PROFILE); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; #endif } return 0; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0d6ff35..c5a0187 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -300,7 +300,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long mask; const char *str; const char *ret = p->buffer + p->len; - int i; + int i, first = 1; for (i = 0; flag_array[i].name && flags; i++) { @@ -310,14 +310,16 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, str = flag_array[i].name; flags &= ~mask; - if (p->len && delim) + if (!first 
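
The ftrace_print_flags_seq() change above stops keying the delimiter on p->len (which is non-zero as soon as anything else has been written to the seq buffer) and instead tracks a local first flag, so the flag list never starts with a stray separator. A stand-alone sketch of the fixed loop, with invented flag names and masks:

#include <stdio.h>

struct flag_name {
	unsigned long	mask;
	const char	*name;
};

static void print_flags(unsigned long flags, const char *delim,
			const struct flag_name *tbl)
{
	int i, first = 1;

	for (i = 0; tbl[i].name && flags; i++) {
		if ((flags & tbl[i].mask) != tbl[i].mask)
			continue;
		flags &= ~tbl[i].mask;

		if (!first && delim)
			printf("%s", delim);
		else
			first = 0;
		printf("%s", tbl[i].name);
	}

	if (flags) {				/* leftover unknown bits */
		if (!first && delim)
			printf("%s", delim);
		printf("0x%lx", flags);
	}
	putchar('\n');
}

int main(void)
{
	static const struct flag_name tbl[] = {
		{ 0x1, "IRQS_OFF" },
		{ 0x2, "NEED_RESCHED" },
		{ 0,   NULL },
	};

	print_flags(0x3, "|", tbl);	/* IRQS_OFF|NEED_RESCHED */
	print_flags(0x5, "|", tbl);	/* IRQS_OFF|0x4 */
	return 0;
}
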
&& delim) trace_seq_puts(p, delim); + else + first = 0; trace_seq_puts(p, str); } /* check for left over flags */ if (flags) { - if (p->len && delim) + if (!first && delim) trace_seq_puts(p, delim); trace_seq_printf(p, "0x%lx", flags); } @@ -344,7 +346,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, break; } - if (!p->len) + if (ret == (const char *)(p->buffer + p->len)) trace_seq_printf(p, "0x%lx", val); trace_seq_putc(p, 0); @@ -370,7 +372,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, break; } - if (!p->len) + if (ret == (const char *)(p->buffer + p->len)) trace_seq_printf(p, "0x%llx", val); trace_seq_putc(p, 0); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index cb65454..96fc733 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -17,9 +17,9 @@ static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); @@ -468,8 +468,8 @@ int __init init_ftrace_syscalls(void) unsigned long addr; int i; - syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - NR_syscalls, GFP_KERNEL); + syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), + GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); return -ENOMEM; @@ -649,7 +649,7 @@ void perf_sysexit_disable(struct ftrace_event_call *call) #endif /* CONFIG_PERF_EVENTS */ static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type) + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -664,13 +664,18 @@ static int syscall_enter_register(struct ftrace_event_call *event, case TRACE_REG_PERF_UNREGISTER: perf_sysenter_disable(event); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; #endif } return 0; } static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type) + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -685,6 +690,11 @@ static int syscall_exit_register(struct ftrace_event_call *event, case TRACE_REG_PERF_UNREGISTER: perf_sysexit_disable(event); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; #endif } return 0; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f1539de..d96ba22 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,7 +25,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/sched.h> -#include <linux/jump_label.h> +#include <linux/static_key.h> extern struct tracepoint * const __start___tracepoints_ptrs[]; extern struct tracepoint * const __stop___tracepoints_ptrs[]; @@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); - if (elem->regfunc && !jump_label_enabled(&elem->key) && active) + if (elem->regfunc && !static_key_enabled(&elem->key) && active) elem->regfunc(); - else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) + else if (elem->unregfunc && 
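
The tracepoint.c hunks are an API rename from jump labels to static keys: jump_label_inc()/jump_label_dec()/jump_label_enabled() become static_key_slow_inc()/static_key_slow_dec()/static_key_enabled(), with the same pairing logic in set_tracepoint(). The toy model below mirrors only that refcount semantic; real static keys patch branches in the instruction stream on architectures with jump-label support rather than testing a counter at run time.

#include <stdio.h>

struct fake_static_key {
	int	enabled;	/* refcount: > 0 means the key is on */
};

static void static_key_slow_inc(struct fake_static_key *key)
{
	key->enabled++;
}

static void static_key_slow_dec(struct fake_static_key *key)
{
	key->enabled--;
}

static int static_key_enabled(struct fake_static_key *key)
{
	return key->enabled > 0;
}

/* mirrors the inc/dec pairing done at the end of set_tracepoint() */
static void set_tracepoint_active(struct fake_static_key *key, int active)
{
	if (active && !static_key_enabled(key))
		static_key_slow_inc(key);
	else if (!active && static_key_enabled(key))
		static_key_slow_dec(key);
}

int main(void)
{
	struct fake_static_key key = { 0 };

	set_tracepoint_active(&key, 1);
	printf("after enable:  %d\n", static_key_enabled(&key));
	set_tracepoint_active(&key, 0);
	printf("after disable: %d\n", static_key_enabled(&key));
	return 0;
}
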
static_key_enabled(&elem->key) && !active) elem->unregfunc(); /* @@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (active && !jump_label_enabled(&elem->key)) - jump_label_inc(&elem->key); - else if (!active && jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (active && !static_key_enabled(&elem->key)) + static_key_slow_inc(&elem->key); + else if (!active && static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); } /* @@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { - if (elem->unregfunc && jump_label_enabled(&elem->key)) + if (elem->unregfunc && static_key_enabled(&elem->key)) elem->unregfunc(); - if (jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); rcu_assign_pointer(elem->funcs, NULL); } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d117262..14bc092 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -3,12 +3,9 @@ * * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. * - * this code detects hard lockups: incidents in where on a CPU - * the kernel does not respond to anything except NMI. - * - * Note: Most of this code is borrowed heavily from softlockup.c, - * so thanks to Ingo for the initial implementation. - * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * Note: Most of this code is borrowed heavily from the original softlockup + * detector, so thanks to Ingo for the initial implementation. + * Some chunks also taken from the old x86-specific nmi watchdog code, thanks * to those contributors as well. */ @@ -117,9 +114,10 @@ static unsigned long get_sample_period(void) { /* * convert watchdog_thresh from seconds to ns - * the divide by 5 is to give hrtimer 5 chances to - * increment before the hardlockup detector generates - * a warning + * the divide by 5 is to give hrtimer several chances (two + * or three with the current relation between the soft + * and hard thresholds) to increment before the + * hardlockup detector generates a warning */ return get_softlockup_thresh() * (NSEC_PER_SEC / 5); } @@ -336,9 +334,11 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); /* - * Run briefly once per second to reset the softlockup timestamp. - * If this gets delayed for more than 60 seconds then the - * debug-printout triggers in watchdog_timer_fn(). + * Run briefly (kicked by the hrtimer callback function) once every + * get_sample_period() seconds (4 seconds by default) to reset the + * softlockup timestamp. If this gets delayed for more than + * 2*watchdog_thresh seconds then the debug-printout triggers in + * watchdog_timer_fn(). */ while (!kthread_should_stop()) { __touch_watchdog(); |
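
The reworded watchdog.c comments describe how get_sample_period() derives the hrtimer period: the softlockup threshold (2 * watchdog_thresh) converted to nanoseconds and divided by 5, i.e. 4 seconds with the default watchdog_thresh of 10 seconds. A quick stand-alone check of that arithmetic; the defaults are hard-coded here purely for illustration:

#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL

int main(void)
{
	unsigned int watchdog_thresh = 10;		/* seconds, default */
	unsigned int softlockup_thresh = 2 * watchdog_thresh;

	/*
	 * Divide by 5 so the hrtimer gets several chances to fire
	 * before the hardlockup detector would complain.
	 */
	unsigned long long sample_period =
			softlockup_thresh * (NSEC_PER_SEC / 5);

	printf("sample period = %llu ns (%llu s)\n",
	       sample_period, sample_period / NSEC_PER_SEC);
	return 0;
}
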