From 723478c8a471403c53cf144999701f6e0c4bbd11 Mon Sep 17 00:00:00 2001 From: Knut Petersen Date: Wed, 25 Sep 2013 14:29:37 +0200 Subject: perf: Enforce 1 as lower limit for perf_event_max_sample_rate /proc/sys/kernel/perf_event_max_sample_rate will accept negative values as well as 0. Negative values are unreasonable, and 0 causes a divide by zero exception in perf_proc_update_handler. This patch enforces a lower limit of 1. Signed-off-by: Knut Petersen Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/5242DB0C.4070005@t-online.de Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index d49a9d2..b25d65c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec(table, write, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; -- cgit v1.1 From fdfbbd07e91f8fe387140776f3fd94605f0c89e5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 20 Sep 2013 07:40:39 -0700 Subject: perf: Add generic transaction flags Add a generic qualifier for transaction events, as a new sample type that returns a flag word. This is particularly useful for qualifying aborts: to distinguish aborts which happen due to asynchronous events (like conflicts caused by another CPU) versus instructions that lead to an abort. The tuning strategies are very different for those cases, so it's important to distinguish them easily and early. Since it's inconvenient and inflexible to filter for this in the kernel, we report all the events out and allow some post processing in user space. The flags are based on the Intel TSX events, but should be fairly generic and mostly applicable to other HTM architectures too. In addition to various flag words there's also reserved space to report a program-supplied abort code. For TSX this is used to distinguish specific classes of aborts, like a lock busy abort when doing lock elision. Flags: Elision and generic transactions (ELISION vs TRANSACTION) (HLE vs RTM on TSX; IBM etc. would likely only use TRANSACTION) Aborts caused by current thread vs aborts caused by others (SYNC vs ASYNC) Retryable transaction (RETRY) Conflicts with other threads (CONFLICT) Transaction write capacity overflow (CAPACITY WRITE) Transaction read capacity overflow (CAPACITY READ) Transactions implicitly aborted can also return an abort code. This can be used to signal specific events to the profiler. A common case is abort on lock busy in an RTM eliding library (code 0xff). To handle this case we include the TSX abort code. Common example aborts in TSX would be: - Data conflict with another thread on memory read. Flags: TRANSACTION|ASYNC|CONFLICT - Executing a WRMSR in a transaction. Flags: TRANSACTION|SYNC - HLE transaction in user space is too large. Flags: ELISION|SYNC|CAPACITY-WRITE The only flag that is somewhat TSX specific is ELISION. This adds the perf core glue needed for reporting the new flag word out. v2: Add MEM/MISC v3: Move transaction to the end v4: Separate capacity-read/write and remove misc v5: Remove _SAMPLE. Move abort flags to 32bit. 
Rename transaction to txn Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1379688044-14173-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index b25d65c..c716385 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1201,6 +1201,9 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_DATA_SRC) size += sizeof(data->data_src.val); + if (sample_type & PERF_SAMPLE_TRANSACTION) + size += sizeof(data->txn); + event->header_size = size; } @@ -4572,6 +4575,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); + if (sample_type & PERF_SAMPLE_TRANSACTION) + perf_output_put(handle, data->txn); + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; -- cgit v1.1 From d9494cb4299da66541a3f3ab82c552889bee0606 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Oct 2013 15:36:19 +0200 Subject: perf: Remove useless atomic_t There's nothing atomic about atomic_set vs atomic_read; so remove the atomic_t usage. Also, make running_sample_length static as it really is (and should be) local to this translation unit. Signed-off-by: Peter Zijlstra Cc: eranian@google.com Cc: Don Zickus Cc: jmario@redhat.com Cc: acme@infradead.org Cc: dave.hansen@linux.intel.com Link: http://lkml.kernel.org/n/tip-vw9lg588x1ic248whybjon0c@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5bd7fe4..028dad9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; -static atomic_t perf_sample_allowed_ns __read_mostly = - ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); +static int perf_sample_allowed_ns __read_mostly = + DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; void update_perf_cpu_limits(void) { @@ -184,7 +184,7 @@ void update_perf_cpu_limits(void) tmp *= sysctl_perf_cpu_time_max_percent; do_div(tmp, 100); - atomic_set(&perf_sample_allowed_ns, tmp); + ACCESS_ONCE(perf_sample_allowed_ns) = tmp; } static int perf_rotate_context(struct perf_cpu_context *cpuctx); @@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, * we detect that events are taking too long. 
*/ #define NR_ACCUMULATED_SAMPLES 128 -DEFINE_PER_CPU(u64, running_sample_length); +static DEFINE_PER_CPU(u64, running_sample_length); void perf_sample_event_took(u64 sample_len_ns) { u64 avg_local_sample_len; u64 local_samples_len; + u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - if (atomic_read(&perf_sample_allowed_ns) == 0) + if (allowed_ns == 0) return; /* decay the counter by 1 average sample */ @@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns) */ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) + if (avg_local_sample_len <= allowed_ns) return; if (max_samples_per_tick <= 1) @@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns) perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; printk_ratelimited(KERN_WARNING - "perf samples too long (%lld > %d), lowering " + "perf samples too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, - atomic_read(&perf_sample_allowed_ns), + avg_local_sample_len, allowed_ns, sysctl_perf_event_sample_rate); update_perf_cpu_limits(); -- cgit v1.1 From 32c5fb7e7d18b4fd37c5e29dea731151e9d66866 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 16 Oct 2013 22:09:45 +0200 Subject: perf: Kill the dead !vma->vm_mm code in perf_event_mmap_event() 1. perf_event_mmap(vma) is never called with a gate_vma-like arg, remove the "if (!vma->vm_mm)" code. 2. arch_vma_name() can use the cached value of mmap_event->vma. 3. Change the code to not call arch_vma_name() twice. 4. Purely cosmetic, but since we use "goto got_name" all the time remove "else" from "[stack]" branch just for symmetry. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131016200945.GB23214@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 028dad9..3ea5605 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5136,21 +5136,19 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) min = MINOR(dev); } else { - if (arch_vma_name(mmap_event->vma)) { - name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp) - 1); + name = arch_vma_name(vma); + if (name) { + name = strncpy(tmp, name, sizeof(tmp) - 1); tmp[sizeof(tmp) - 1] = '\0'; goto got_name; } - if (!vma->vm_mm) { - name = strncpy(tmp, "[vdso]", sizeof(tmp)); - goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_brk && + if (vma->vm_start <= vma->vm_mm->start_brk && vma->vm_end >= vma->vm_mm->brk) { name = strncpy(tmp, "[heap]", sizeof(tmp)); goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_stack && + } + if (vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) { name = strncpy(tmp, "[stack]", sizeof(tmp)); goto got_name; -- cgit v1.1 From 3ea2f2b96f9e636f49eb10962e96db3e19cab157 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 16 Oct 2013 22:10:04 +0200 Subject: perf: Do not waste PAGE_SIZE bytes for ALIGN(8) in perf_event_mmap_event() perf_event_mmap_event() does kzalloc(PATH_MAX + sizeof(u64)) to ensure we can align the size later. However, this means that we actually allocate a PAGE_SIZE * 2 buffer, which seems too much. Change this code to allocate PATH_MAX==PAGE_SIZE bytes, but tell d_path() to not use the last sizeof(u64) bytes. 
Note: it is not clear why we need __GFP_ZERO, see the next patch. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131016201004.GC23214@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3ea5605..b409e75 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5113,17 +5113,18 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) if (file) { struct inode *inode; dev_t dev; - /* - * d_path works from the end of the rb backwards, so we - * need to add enough zero bytes after the string to handle - * the 64bit alignment we do later. - */ - buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); + + buf = kzalloc(PATH_MAX, GFP_KERNEL); if (!buf) { name = strncpy(tmp, "//enomem", sizeof(tmp)); goto got_name; } - name = d_path(&file->f_path, buf, PATH_MAX); + /* + * d_path() works from the end of the rb backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. + */ + name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = strncpy(tmp, "//toolong", sizeof(tmp)); goto got_name; -- cgit v1.1 From 2c42cfbfe10872929c2ba1f8130e31063ff59b94 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Oct 2013 00:06:46 +0200 Subject: perf: Change zero-padding of strings in perf_event_mmap_event() Oleg complained about the excessive 0-ing in perf_event_mmap_event(), so try and be smarter about it while keeping it fairly foolproof and avoid leaking random bits out to userspace. Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-8jirlm99m6if2z13wd6rbyu6@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index b409e75..85a8bbd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5106,15 +5106,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) unsigned int size; char tmp[16]; char *buf = NULL; - const char *name; - - memset(tmp, 0, sizeof(tmp)); + char *name; if (file) { struct inode *inode; dev_t dev; - buf = kzalloc(PATH_MAX, GFP_KERNEL); + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) { name = strncpy(tmp, "//enomem", sizeof(tmp)); goto got_name; @@ -5137,7 +5135,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) min = MINOR(dev); } else { - name = arch_vma_name(vma); + name = (char *)arch_vma_name(vma); if (name) { name = strncpy(tmp, name, sizeof(tmp) - 1); tmp[sizeof(tmp) - 1] = '\0'; @@ -5160,7 +5158,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) } got_name: - size = ALIGN(strlen(name)+1, sizeof(u64)); + /* + * Since our buffer works in 8 byte units we need to align our string + * size to a multiple of 8. However, we must guarantee the tail end is + * zero'd out to avoid leaking random bits to userspace. 
+ */ + size = strlen(name)+1; + while (!IS_ALIGNED(size, sizeof(u64))) + name[size++] = '\0'; mmap_event->file_name = name; mmap_event->file_size = size; -- cgit v1.1 From 5a3126d4fe7c311fe12f98fef0470f6cb582d1ef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 17:12:48 +0200 Subject: perf: Fix the perf context switch optimization Currently we only optimize the context switch between two contexts that have the same parent; this forgoes the optimization between parent and child context, even though these contexts could be equivalent too. Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Adrian Hunter Cc: Shishkin, Alexander Link: http://lkml.kernel.org/r/20131007164257.GH3081@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 64 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 18 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 85a8bbd..17b3c6c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx) put_ctx(ctx->parent_ctx); ctx->parent_ctx = NULL; } + ctx->generation++; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) @@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; + + ctx->generation++; } /* @@ -1313,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) */ if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; + + ctx->generation++; } static void perf_group_detach(struct perf_event *event) @@ -2149,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx, } /* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled events. - * If the number of enabled events is the same, then the set - * of enabled events should be the same, because these are both - * inherited contexts, therefore we can't access individual events - * in them directly with an fd; we can only enable/disable all - * events via prctl, or enable/disable all events in a family - * via ioctl, which will have the same effect on both contexts. + * Test whether two contexts are equivalent, i.e. whether they have both been + * cloned from the same version of the same context. + * + * Equivalence is measured using a generation number in the context that is + * incremented on each modification to it; see unclone_ctx(), list_add_event() + * and list_del_event(). */ static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { - return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && !ctx1->pin_count && !ctx2->pin_count; + /* Pinning disables the swap optimization */ + if (ctx1->pin_count || ctx2->pin_count) + return 0; + + /* If ctx1 is the parent of ctx2 */ + if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) + return 1; + + /* If ctx2 is the parent of ctx1 */ + if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) + return 1; + + /* + * If ctx1 and ctx2 have the same parent; we flatten the parent + * hierarchy, see perf_event_init_context(). 
+ */ + if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && + ctx1->parent_gen == ctx2->parent_gen) + return 1; + + /* Unmatched */ + return 0; } static void __perf_event_sync_stat(struct perf_event *event, @@ -2247,7 +2268,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, { struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; - struct perf_event_context *parent; + struct perf_event_context *parent, *next_parent; struct perf_cpu_context *cpuctx; int do_switch = 1; @@ -2259,10 +2280,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, return; rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_event_ctxp[ctxn]; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { + if (!next_ctx) + goto unlock; + + parent = rcu_dereference(ctx->parent_ctx); + next_parent = rcu_dereference(next_ctx->parent_ctx); + + /* If neither context have a parent context; they cannot be clones. */ + if (!parent && !next_parent) + goto unlock; + + if (next_parent == ctx || next_ctx == parent || next_parent == parent) { /* * Looks like the two contexts are clones, so we might be * able to optimize the context switch. We lock both @@ -2290,6 +2319,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_unlock(&next_ctx->lock); raw_spin_unlock(&ctx->lock); } +unlock: rcu_read_unlock(); if (do_switch) { @@ -7136,7 +7166,6 @@ SYSCALL_DEFINE5(perf_event_open, } perf_install_in_context(ctx, event, event->cpu); - ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); @@ -7219,7 +7248,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); - ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); -- cgit v1.1 From b68e0749100e1b901bf11330f149b321c082178e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Oct 2013 21:18:31 +0200 Subject: uprobes: Change the callsite of uprobe_copy_process() Preparation for the next patches. Move the callsite of uprobe_copy_process() in copy_process() down to the succesfull return. We do not care if copy_process() fails, uprobe_free_utask() won't be called in this case so the wrong ->utask != NULL doesn't matter. OTOH, with this change we know that copy_process() can't fail when uprobe_copy_process() is called, the new task should either return to user-mode or call do_exit(). This way uprobe_copy_process() can: 1. setup p->utask != NULL if necessary 2. setup uprobes_state.xol_area 3. use task_work_add(p) Also, move the definition of uprobe_copy_process() down so that it can see get_utask(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ad8e1bd..db7a1dc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1345,14 +1345,6 @@ void uprobe_free_utask(struct task_struct *t) } /* - * Called in context of a new clone/fork from copy_process. - */ -void uprobe_copy_process(struct task_struct *t) -{ - t->utask = NULL; -} - -/* * Allocate a uprobe_task object for the task if if necessary. * Called when the thread hits a breakpoint. 
* @@ -1368,6 +1360,14 @@ static struct uprobe_task *get_utask(void) } /* + * Called in context of a new clone/fork from copy_process. + */ +void uprobe_copy_process(struct task_struct *t) +{ + t->utask = NULL; +} + +/* * Current area->vaddr notion assume the trampoline address is always * equal area->vaddr. * -- cgit v1.1 From 6441ec8b7c108b72789d120562b9f1d976e4aaaf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Oct 2013 21:18:35 +0200 Subject: uprobes: Introduce __create_xol_area() No functional changes, preparation. Extract the code which actually allocates/installs the new area into the new helper, __create_xol_area(). While at it remove the unnecessary "ret = ENOMEM" and "ret = 0" in xol_add_vma(), they both have no effect. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index db7a1dc..ad17d81 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1096,16 +1096,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon } /* Slot allocation for XOL */ -static int xol_add_vma(struct xol_area *area) +static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { - struct mm_struct *mm = current->mm; int ret = -EALREADY; down_write(&mm->mmap_sem); if (mm->uprobes_state.xol_area) goto fail; - ret = -ENOMEM; /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); if (area->vaddr & ~PAGE_MASK) { @@ -1120,28 +1118,17 @@ static int xol_add_vma(struct xol_area *area) smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; - ret = 0; fail: up_write(&mm->mmap_sem); return ret; } -/* - * get_xol_area - Allocate process's xol_area if necessary. - * This area will be used for storing instructions for execution out of line. - * - * Returns the allocated area or NULL. - */ -static struct xol_area *get_xol_area(void) +static struct xol_area *__create_xol_area(void) { struct mm_struct *mm = current->mm; - struct xol_area *area; uprobe_opcode_t insn = UPROBE_SWBP_INSN; - - area = mm->uprobes_state.xol_area; - if (area) - goto ret; + struct xol_area *area; area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) @@ -1155,13 +1142,13 @@ static struct xol_area *get_xol_area(void) if (!area->page) goto free_bitmap; - /* allocate first slot of task's xol_area for the return probes */ + init_waitqueue_head(&area->wq); + /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); atomic_set(&area->slot_count, 1); - init_waitqueue_head(&area->wq); + copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); - if (!xol_add_vma(area)) + if (!xol_add_vma(mm, area)) return area; __free_page(area->page); @@ -1170,9 +1157,25 @@ static struct xol_area *get_xol_area(void) free_area: kfree(area); out: + return NULL; +} + +/* + * get_xol_area - Allocate process's xol_area if necessary. + * This area will be used for storing instructions for execution out of line. + * + * Returns the allocated area or NULL. 
+ */ +static struct xol_area *get_xol_area(void) +{ + struct mm_struct *mm = current->mm; + struct xol_area *area; + + if (!mm->uprobes_state.xol_area) + __create_xol_area(); + area = mm->uprobes_state.xol_area; - ret: - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ return area; } -- cgit v1.1 From af0d95af79773f7637107cd3871aaabcb425f15a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Oct 2013 21:18:38 +0200 Subject: uprobes: Teach __create_xol_area() to accept the predefined vaddr Currently xol_add_vma() uses get_unmapped_area() for area->vaddr, but the next patches need to use the fixed address. So this patch adds the new "vaddr" argument to __create_xol_area() which should be used as area->vaddr if it is nonzero. xol_add_vma() doesn't bother to verify that the predefined addr is not used, insert_vm_struct() should fail if find_vma_links() detects the overlap with the existing vma. Also, __create_xol_area() doesn't need __GFP_ZERO to allocate area. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ad17d81..7d12a45 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1104,11 +1104,14 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) if (mm->uprobes_state.xol_area) goto fail; - /* Try to map as high as possible, this is only a hint. */ - area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); - if (area->vaddr & ~PAGE_MASK) { - ret = area->vaddr; - goto fail; + if (!area->vaddr) { + /* Try to map as high as possible, this is only a hint. */ + area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, + PAGE_SIZE, 0, 0); + if (area->vaddr & ~PAGE_MASK) { + ret = area->vaddr; + goto fail; + } } ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, @@ -1124,13 +1127,13 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) return ret; } -static struct xol_area *__create_xol_area(void) +static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct xol_area *area; - area = kzalloc(sizeof(*area), GFP_KERNEL); + area = kmalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; @@ -1142,6 +1145,7 @@ static struct xol_area *__create_xol_area(void) if (!area->page) goto free_bitmap; + area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); @@ -1172,7 +1176,7 @@ static struct xol_area *get_xol_area(void) struct xol_area *area; if (!mm->uprobes_state.xol_area) - __create_xol_area(); + __create_xol_area(0); area = mm->uprobes_state.xol_area; smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ -- cgit v1.1 From 248d3a7b2f100078c5f6878351177859380582e9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Oct 2013 21:18:41 +0200 Subject: uprobes: Change uprobe_copy_process() to dup return_instances uprobe_copy_process() assumes that the new child doesn't need ->utask, it should be allocated by demand. But this is not true if the forking task has the pending ret- probes, the child should report them as well and thus it needs the copy of parent's ->return_instances chain. 
Otherwise the child crashes when it returns from the probed function. Alternatively we could cleanup the child's stack, but this needs per-arch changes and this is not what we want. At least systemtap expects a .return in the child too. Note: this change alone doesn't fix the problem, see the next change. Reported-by: Martin Cermak Reported-by: David Smith Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7d12a45..1c6cda6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1366,12 +1366,55 @@ static struct uprobe_task *get_utask(void) return current->utask; } +static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) +{ + struct uprobe_task *n_utask; + struct return_instance **p, *o, *n; + + n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + if (!n_utask) + return -ENOMEM; + t->utask = n_utask; + + p = &n_utask->return_instances; + for (o = o_utask->return_instances; o; o = o->next) { + n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); + if (!n) + return -ENOMEM; + + *n = *o; + atomic_inc(&n->uprobe->ref); + n->next = NULL; + + *p = n; + p = &n->next; + n_utask->depth++; + } + + return 0; +} + +static void uprobe_warn(struct task_struct *t, const char *msg) +{ + pr_warn("uprobe: %s:%d failed to %s\n", + current->comm, current->pid, msg); +} + /* * Called in context of a new clone/fork from copy_process. */ void uprobe_copy_process(struct task_struct *t) { + struct uprobe_task *utask = current->utask; + struct mm_struct *mm = current->mm; + t->utask = NULL; + + if (mm == t->mm || !utask || !utask->return_instances) + return; + + if (dup_utask(t, utask)) + return uprobe_warn(t, "dup ret instances"); } /* -- cgit v1.1 From aa59c53fd4599c91ccf9629af0c2777b89929076 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Oct 2013 21:18:44 +0200 Subject: uprobes: Change uprobe_copy_process() to dup xol_area This finally fixes the serious bug in uretprobes: a forked child crashes if the parent called fork() with the pending ret probe. Trivial test-case: # perf probe -x /lib/libc.so.6 __fork%return # perf record -e probe_libc:__fork perl -le 'fork || print "OK"' (the child doesn't print "OK", it is killed by SIGSEGV) If the child returns from the probed function it actually returns to trampoline_vaddr, because it got the copy of parent's stack mangled by prepare_uretprobe() when the parent entered this func. It crashes because a) this address is not mapped and b) until the previous change it doesn't have the proper->return_instances info. This means that uprobe_copy_process() has to create xol_area which has the trampoline slot, and its vaddr should be equal to parent's xol_area->vaddr. Unfortunately, uprobe_copy_process() can not simply do __create_xol_area(child, xol_area->vaddr). This could actually work but perf_event_mmap() doesn't expect the usage of foreign ->mm. So we offload this to task_work_run(), and pass the argument via not yet used utask->vaddr. We know that this vaddr is fine for install_special_mapping(), the necessary hole was recently "created" by dup_mmap() which skips the parent's VM_DONTCOPY area, and nobody else could use the new mm. Unfortunately, this also means that we can not handle the errors properly, we obviously can not abort the already completed fork(). 
So we simply print the warning if GFP_KERNEL allocation (the only possible reason) fails. Reported-by: Martin Cermak Reported-by: David Smith Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1c6cda6..9f282e1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -35,6 +35,7 @@ #include /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ #include +#include #include @@ -1400,6 +1401,17 @@ static void uprobe_warn(struct task_struct *t, const char *msg) current->comm, current->pid, msg); } +static void dup_xol_work(struct callback_head *work) +{ + kfree(work); + + if (current->flags & PF_EXITING) + return; + + if (!__create_xol_area(current->utask->vaddr)) + uprobe_warn(current, "dup xol area"); +} + /* * Called in context of a new clone/fork from copy_process. */ @@ -1407,6 +1419,8 @@ void uprobe_copy_process(struct task_struct *t) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; + struct callback_head *work; + struct xol_area *area; t->utask = NULL; @@ -1415,6 +1429,20 @@ void uprobe_copy_process(struct task_struct *t) if (dup_utask(t, utask)) return uprobe_warn(t, "dup ret instances"); + + /* The task can fork() after dup_xol_work() fails */ + area = mm->uprobes_state.xol_area; + if (!area) + return uprobe_warn(t, "dup xol area"); + + /* TODO: move it into the union in uprobe_task */ + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) + return uprobe_warn(t, "dup xol area"); + + utask->vaddr = area->vaddr; + init_task_work(work, dup_xol_work); + task_work_add(t, work, true); } /* -- cgit v1.1 From 3ab679661721b1ec2aaad99a801870ed59ab1110 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 16 Oct 2013 19:39:37 +0200 Subject: uprobes: Teach uprobe_copy_process() to handle CLONE_VFORK uprobe_copy_process() does nothing if the child shares ->mm with the forking process, but there is a special case: CLONE_VFORK. In this case it would be more correct to do dup_utask() but avoid dup_xol(). This is not that important, the child should not unwind its stack too much, this can corrupt the parent's stack, but at least we need this to allow to ret-probe __vfork() itself. Note: in theory, it would be better to check task_pt_regs(p)->sp instead of CLONE_VFORK, we need to dup_utask() if and only if the child can return from the function called by the parent. But this needs the arch-dependant helper, and I think that nobody actually does clone(same_stack, CLONE_VM). Reported-by: Martin Cermak Reported-by: David Smith Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 9f282e1..ae9e1d2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1415,7 +1415,7 @@ static void dup_xol_work(struct callback_head *work) /* * Called in context of a new clone/fork from copy_process. 
*/ -void uprobe_copy_process(struct task_struct *t) +void uprobe_copy_process(struct task_struct *t, unsigned long flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; @@ -1424,7 +1424,10 @@ void uprobe_copy_process(struct task_struct *t) t->utask = NULL; - if (mm == t->mm || !utask || !utask->return_instances) + if (!utask || !utask->return_instances) + return; + + if (mm == t->mm && !(flags & CLONE_VFORK)) return; if (dup_utask(t, utask)) @@ -1435,6 +1438,9 @@ void uprobe_copy_process(struct task_struct *t) if (!area) return uprobe_warn(t, "dup xol area"); + if (mm == t->mm) + return; + /* TODO: move it into the union in uprobe_task */ work = kmalloc(sizeof(*work), GFP_KERNEL); if (!work) -- cgit v1.1 From 26c86da8821f7b64fced498674990318bc34c8de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 10:19:59 +0100 Subject: perf: Simplify the ring-buffer code By using CIRC_SPACE() we can obviate the need for perf_output_space(). Shrinks the size of perf_output_begin() by 17 bytes on x86_64-defconfig. Signed-off-by: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Cc: Michael Ellerman Cc: Michael Neuling Cc: "Paul E. McKenney" Cc: james.hogan@imgtec.com Cc: Vince Weaver Cc: Victor Kaplansky Cc: Oleg Nesterov Cc: Anton Blanchard Link: http://lkml.kernel.org/n/tip-vtb0xb0llebmsdlfn1v5vtfj@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 37 ++++--------------------------------- 1 file changed, 4 insertions(+), 33 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 9c2ddfb..6929c58 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -12,40 +12,10 @@ #include #include #include +#include #include "internal.h" -static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, - unsigned long offset, unsigned long head) -{ - unsigned long sz = perf_data_size(rb); - unsigned long mask = sz - 1; - - /* - * check if user-writable - * overwrite : over-write its own tail - * !overwrite: buffer possibly drops events. - */ - if (rb->overwrite) - return true; - - /* - * verify that payload is not bigger than buffer - * otherwise masking logic may fail to detect - * the "not enough space" condition - */ - if ((head - offset) > sz) - return false; - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->rb->poll, POLL_IN); @@ -181,9 +151,10 @@ int perf_output_begin(struct perf_output_handle *handle, tail = ACCESS_ONCE(rb->user_page->data_tail); smp_mb(); offset = head = local_read(&rb->head); - head += size; - if (unlikely(!perf_output_space(rb, tail, offset, head))) + if (!rb->overwrite && + unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) goto fail; + head += size; } while (local_cmpxchg(&rb->head, offset, head) != offset); if (head - local_read(&rb->wakeup) > rb->watermark) -- cgit v1.1 From c72b42a3dde487132da80202756c101b371b2add Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 17:20:25 +0100 Subject: perf: Add unlikely() to the ring-buffer code Add unlikely() annotations to 'slow' paths: When having a sampling event but no output buffer; you have bigger issues -- also the bail is still faster than actually doing the work. 
When having a sampling event but a control page only buffer, you have bigger issues -- again the bail is still faster than actually doing work. Optimize for the case where you're not losing events -- again, not doing the work is still faster but make sure that when you have to actually do work it's as fast as possible. The typical watermark is 1/2 the buffer size, so most events will not take this path. Shrinks perf_output_begin() by 16 bytes on x86_64-defconfig. Signed-off-by: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Cc: Michael Ellerman Cc: Michael Neuling Cc: "Paul E. McKenney" Cc: james.hogan@imgtec.com Cc: Vince Weaver Cc: Victor Kaplansky Cc: Oleg Nesterov Cc: Anton Blanchard Link: http://lkml.kernel.org/n/tip-wlg3jew3qnutm8opd0hyeuwn@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 6929c58..383cde4 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -121,17 +121,17 @@ int perf_output_begin(struct perf_output_handle *handle, event = event->parent; rb = rcu_dereference(event->rb); - if (!rb) + if (unlikely(!rb)) goto out; - handle->rb = rb; - handle->event = event; - - if (!rb->nr_pages) + if (unlikely(!rb->nr_pages)) goto out; + handle->rb = rb; + handle->event = event; + have_lost = local_read(&rb->lost); - if (have_lost) { + if (unlikely(have_lost)) { lost_event.header.size = sizeof(lost_event); perf_event_header__init_id(&lost_event.header, &sample_data, event); @@ -157,7 +157,7 @@ int perf_output_begin(struct perf_output_handle *handle, head += size; } while (local_cmpxchg(&rb->head, offset, head) != offset); - if (head - local_read(&rb->wakeup) > rb->watermark) + if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) local_add(rb->watermark, &rb->wakeup); handle->page = offset >> (PAGE_SHIFT + page_order(rb)); @@ -167,7 +167,7 @@ int perf_output_begin(struct perf_output_handle *handle, handle->addr += handle->size; handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; - if (have_lost) { + if (unlikely(have_lost)) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; -- cgit v1.1 From 85f59edf9684603026c64c902791748116d29478 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 17:25:38 +0100 Subject: perf: Optimize perf_output_begin() There's no point in re-doing the memory-barrier when we fail the cmpxchg(). Also placing it after the space reservation loop makes it clearer it only separates the userpage->tail read from the data stores. Signed-off-by: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Cc: Michael Ellerman Cc: Michael Neuling Cc: "Paul E. 
McKenney" Cc: james.hogan@imgtec.com Cc: Vince Weaver Cc: Victor Kaplansky Cc: Oleg Nesterov Cc: Anton Blanchard Link: http://lkml.kernel.org/n/tip-c19u6egfldyx86tpyc3zgkw9@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 383cde4..6ed16ec 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -141,15 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle, perf_output_get_handle(handle); do { - /* - * Userspace could choose to issue a mb() before updating the - * tail pointer. So that all reads will be completed before the - * write is issued. - * - * See perf_output_put_handle(). - */ tail = ACCESS_ONCE(rb->user_page->data_tail); - smp_mb(); offset = head = local_read(&rb->head); if (!rb->overwrite && unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) @@ -157,6 +149,15 @@ int perf_output_begin(struct perf_output_handle *handle, head += size; } while (local_cmpxchg(&rb->head, offset, head) != offset); + /* + * Separate the userpage->tail read from the data stores below. + * Matches the MB userspace SHOULD issue after reading the data + * and before storing the new tail position. + * + * See perf_output_put_handle(). + */ + smp_mb(); + if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) local_add(rb->watermark, &rb->wakeup); -- cgit v1.1 From d20a973f46ed83e0d7d24f6c512064133038e193 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 17:29:29 +0100 Subject: perf: Optimize perf_output_begin() -- lost_event case Avoid touching the lost_event and sample_data cachelines twice. It's not like we end up doing less work, but it might help to keep all accesses to these cachelines in one place. Due to code shuffle, this loses 4 bytes on x86_64-defconfig. Signed-off-by: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Cc: Michael Ellerman Cc: Michael Neuling Cc: "Paul E. 
McKenney" Cc: james.hogan@imgtec.com Cc: Vince Weaver Cc: Victor Kaplansky Cc: Oleg Nesterov Cc: Anton Blanchard Link: http://lkml.kernel.org/n/tip-zfxnc58qxj0eawdoj31hhupv@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 6ed16ec..e4d70f3 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -106,7 +106,6 @@ int perf_output_begin(struct perf_output_handle *handle, struct ring_buffer *rb; unsigned long tail, offset, head; int have_lost; - struct perf_sample_data sample_data; struct { struct perf_event_header header; u64 id; @@ -132,10 +131,9 @@ int perf_output_begin(struct perf_output_handle *handle, have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { - lost_event.header.size = sizeof(lost_event); - perf_event_header__init_id(&lost_event.header, &sample_data, - event); - size += lost_event.header.size; + size += sizeof(lost_event); + if (event->attr.sample_id_all) + size += event->id_header_size; } perf_output_get_handle(handle); @@ -169,11 +167,16 @@ int perf_output_begin(struct perf_output_handle *handle, handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; if (unlikely(have_lost)) { + struct perf_sample_data sample_data; + + lost_event.header.size = sizeof(lost_event); lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; lost_event.lost = local_xchg(&rb->lost, 0); + perf_event_header__init_id(&lost_event.header, + &sample_data, event); perf_output_put(handle, lost_event); perf_event__output_id_sample(event, handle, &sample_data); } -- cgit v1.1 From 524feca5e9da9e5f9e5aa5d5613b1d762db9509e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 17:36:25 +0100 Subject: perf: Optimize perf_output_begin() -- address calculation Rewrite the handle address calculation code to be clearer. Saves 8 bytes on x86_64-defconfig. Signed-off-by: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Cc: Michael Ellerman Cc: Michael Neuling Cc: "Paul E. 
McKenney" Cc: james.hogan@imgtec.com Cc: Vince Weaver Cc: Victor Kaplansky Cc: Oleg Nesterov Cc: Anton Blanchard Link: http://lkml.kernel.org/n/tip-3trb2n2henb9m27tncef3ag7@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index e4d70f3..c52a32f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -105,7 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle, { struct ring_buffer *rb; unsigned long tail, offset, head; - int have_lost; + int have_lost, page_shift; struct { struct perf_event_header header; u64 id; @@ -159,12 +159,12 @@ int perf_output_begin(struct perf_output_handle *handle, if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) local_add(rb->watermark, &rb->wakeup); - handle->page = offset >> (PAGE_SHIFT + page_order(rb)); - handle->page &= rb->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); - handle->addr = rb->data_pages[handle->page]; - handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; + page_shift = PAGE_SHIFT + page_order(rb); + + handle->page = (offset >> page_shift) & (rb->nr_pages - 1); + offset &= (1UL << page_shift) - 1; + handle->addr = rb->data_pages[handle->page] + offset; + handle->size = (1UL << page_shift) - offset; if (unlikely(have_lost)) { struct perf_sample_data sample_data; -- cgit v1.1 From 394570b7939e1262f39373866166d8ee0a506e88 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 17:41:23 +0100 Subject: perf: Update a stale comment Signed-off-by: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Cc: Michael Ellerman Cc: Michael Neuling Cc: "Paul E. McKenney" Cc: james.hogan@imgtec.com Cc: Vince Weaver Cc: Victor Kaplansky Cc: Oleg Nesterov Cc: Anton Blanchard Link: http://lkml.kernel.org/n/tip-9s5mze78gmlz19agt39i8rii@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index c52a32f..e8b168a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -85,8 +85,8 @@ again: rb->user_page->data_head = head; /* - * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read rb->head. + * Now check if we missed an update -- rely on previous implied + * compiler barriers to force a re-read. */ if (unlikely(head != local_read(&rb->head))) { local_inc(&rb->nest); -- cgit v1.1 From 0a196848ca365ec582c6d86659be456be6d4ed96 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Oct 2013 21:16:22 +0100 Subject: perf: Fix arch_perf_out_copy_user default The arch_perf_output_copy_user() default of __copy_from_user_inatomic() returns bytes not copied, while all other functions given to DEFINE_OUTPUT_COPY() return bytes copied. Since copy_from_user_nmi() is the odd duck out by returning bytes copied where all other *copy_{to,from}* functions return bytes not copied, change it over and amend DEFINE_OUTPUT_COPY() to expect bytes not copied. Oddly enough DEFINE_OUTPUT_COPY() already returned bytes not copied while expecting its worker functions to return bytes copied. 
Signed-off-by: Peter Zijlstra Acked-by: will.deacon@arm.com Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20131030201622.GR16117@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/internal.h | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index ca65997..569b2187 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) } #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ -static inline unsigned int \ +static inline unsigned long \ func_name(struct perf_output_handle *handle, \ - const void *buf, unsigned int len) \ + const void *buf, unsigned long len) \ { \ unsigned long size, written; \ \ do { \ - size = min_t(unsigned long, handle->size, len); \ - \ + size = min(handle->size, len); \ written = memcpy_func(handle->addr, buf, size); \ + written = size - written; \ \ len -= written; \ handle->addr += written; \ @@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \ return len; \ } -static inline int memcpy_common(void *dst, const void *src, size_t n) +static inline unsigned long +memcpy_common(void *dst, const void *src, unsigned long n) { memcpy(dst, src, n); - return n; + return 0; } DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) -#define MEMCPY_SKIP(dst, src, n) (n) +static inline unsigned long +memcpy_skip(void *dst, const void *src, unsigned long n) +{ + return 0; +} -DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) +DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip) #ifndef arch_perf_out_copy_user -#define arch_perf_out_copy_user __copy_from_user_inatomic +#define arch_perf_out_copy_user arch_perf_out_copy_user + +static inline unsigned long +arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) +{ + unsigned long ret; + + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, src, n); + pagefault_enable(); + + return ret; +} #endif DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) -- cgit v1.1 From c7e548b45ce85f765f6262149dd60d9956a31d60 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 17 Oct 2013 20:24:17 +0200 Subject: perf: Factor out strncpy() in perf_event_mmap_event() While this is really minor, but strncpy() does the unnecessary zero-padding till the end of tmp[16] and it is called every time we are going to use the string literal. Turn these strncpy()'s into the single strlcpy() under the new label, saves 72 bytes. 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131017182417.GA17753@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 17b3c6c..4dc078d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5144,8 +5144,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) { - name = strncpy(tmp, "//enomem", sizeof(tmp)); - goto got_name; + name = "//enomem"; + goto cpy_name; } /* * d_path() works from the end of the rb backwards, so we @@ -5154,8 +5154,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) */ name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { - name = strncpy(tmp, "//toolong", sizeof(tmp)); - goto got_name; + name = "//toolong"; + goto cpy_name; } inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; @@ -5163,30 +5163,30 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) gen = inode->i_generation; maj = MAJOR(dev); min = MINOR(dev); - + goto got_name; } else { name = (char *)arch_vma_name(vma); - if (name) { - name = strncpy(tmp, name, sizeof(tmp) - 1); - tmp[sizeof(tmp) - 1] = '\0'; - goto got_name; - } + if (name) + goto cpy_name; if (vma->vm_start <= vma->vm_mm->start_brk && vma->vm_end >= vma->vm_mm->brk) { - name = strncpy(tmp, "[heap]", sizeof(tmp)); - goto got_name; + name = "[heap]"; + goto cpy_name; } if (vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) { - name = strncpy(tmp, "[stack]", sizeof(tmp)); - goto got_name; + name = "[stack]"; + goto cpy_name; } - name = strncpy(tmp, "//anon", sizeof(tmp)); - goto got_name; + name = "//anon"; + goto cpy_name; } +cpy_name: + strlcpy(tmp, name, sizeof(tmp)); + name = tmp; got_name: /* * Since our buffer works in 8 byte units we need to align our string -- cgit v1.1 From 736e89d9f782a7dd9a38ecda13b2db916fa72f33 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Oct 2013 19:28:22 +0100 Subject: uprobes: Kill module_init() and module_exit() Turn module_init() into __initcall() and kill module_exit(). This code can't be compiled as a module so these module_*() calls only add the confusion, especially if arch-dependant code needs its own initialization hooks. Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ae9e1d2..0012c8e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1941,9 +1941,4 @@ static int __init init_uprobes(void) return register_die_notifier(&uprobe_exception_nb); } -module_init(init_uprobes); - -static void __exit exit_uprobes(void) -{ -} -module_exit(exit_uprobes); +__initcall(init_uprobes); -- cgit v1.1 From 8a8de66c4f6ebd0f6d3da026ec24339aa5d1db12 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Nov 2013 20:27:13 +0100 Subject: uprobes: Introduce arch_uprobe->ixol Currently xol_get_insn_slot() assumes that we should simply copy arch_uprobe->insn[] which is (ignoring arch_uprobe_analyze_insn) just the copy of the original insn. This is not true for arm which needs to create another insn to execute it out-of-line. So this patch simply adds the new member, ->ixol into the union. 
This doesn't make any difference for x86 and powerpc, but arm can divorce insn/ixol and initialize the correct xol insn in arch_uprobe_analyze_insn(). Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0012c8e..fbcff61 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1264,7 +1264,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) return 0; /* Initialize the slot */ - copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); + copy_to_page(area->page, xol_vaddr, + uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); /* * We probably need flush_icache_user_range() but it needs vma. * This should work on supported architectures too. -- cgit v1.1 From f72d41fa902fb19a9b63028202a400b0ce497491 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 5 Nov 2013 19:50:39 +0100 Subject: uprobes: Export write_opcode() as uprobe_write_opcode() set_swbp() and set_orig_insn() are __weak, but this is pointless because write_opcode() is static. Export write_opcode() as uprobe_write_opcode() for the upcoming arm port; this way it can actually override set_swbp() and use __opcode_to_mem_arm(bpinsn) instead of UPROBE_SWBP_INSN. Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index fbcff61..0ac346a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -245,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction * supported by that architecture then we need to modify is_trap_at_addr and - * write_opcode accordingly. This would never be a problem for archs that - * have fixed length instructions. + * uprobe_write_opcode accordingly. This would never be a problem for archs + * that have fixed length instructions. */ /* - * write_opcode - write the opcode at a given virtual address. + * uprobe_write_opcode - write the opcode at a given virtual address. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. @@ -261,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * For mm @mm, write the opcode at @vaddr. * Return 0 (success) or a negative errno. 
*/ -static int write_opcode(struct mm_struct *mm, unsigned long vaddr, +int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; @@ -315,7 +315,7 @@ put_old: */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN); } /** @@ -330,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); + return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -577,7 +577,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, if (ret) goto out; - /* write_opcode() assumes we don't cross page boundary */ + /* uprobe_write_opcode() assumes we don't cross page boundary */ BUG_ON((uprobe->offset & ~PAGE_MASK) + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); -- cgit v1.1 From 70d7f98722a7a1df1a55d6a92d0ce959c7aba9fd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 8 Nov 2013 16:35:55 +0100 Subject: uprobes: Fix the wrong usage of current->utask in uprobe_copy_process() Commit aa59c53fd459 "uprobes: Change uprobe_copy_process() to dup xol_area" has a stupid typo: we need to set up t->utask->vaddr but the code wrongly uses current->utask. Even with this bug dup_xol_work() works "in practice", but only because get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE) likely returns the same address every time. Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0ac346a..5e56950 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1447,7 +1447,7 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) if (!work) return uprobe_warn(t, "dup xol area"); - utask->vaddr = area->vaddr; + t->utask->vaddr = area->vaddr; init_task_work(work, dup_xol_work); task_work_add(t, work, true); } -- cgit v1.1 From 2ded0980a6e4ae96bdd84bda66c7240967d86f3c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 7 Nov 2013 19:41:57 +0100 Subject: uprobes: Fix the memory out of bound overwrite in copy_insn() 1. copy_insn() doesn't look very nice: all calculations are confusing and it is not immediately clear why we read the 2nd page first. 2. The usage of inode->i_size is wrong on 32-bit machines. 3. "Instruction at end of binary" logic is simply wrong, it doesn't handle the case when uprobe->offset > inode->i_size. In this case "bytes" overflows, and __copy_insn() writes to the memory outside of uprobe->arch.insn. Yes, uprobe_register() checks i_size_read(), but this file can be truncated after that. All i_size checks are racy; we do this only to catch the obvious mistakes. Change copy_insn() to call __copy_insn() in a loop, simplify and fix the bytes/nbytes calculations. Note: we do not care if we read extra bytes after inode->i_size if we got the valid page. This is fine because the task gets the same page after page-fault, and arch_uprobe_analyze_insn() can't know how many bytes were actually read anyway. 
Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5e56950..24b7d6c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -504,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) return ret; } -static int -__copy_insn(struct address_space *mapping, struct file *filp, char *insn, - unsigned long nbytes, loff_t offset) +static int __copy_insn(struct address_space *mapping, struct file *filp, + void *insn, int nbytes, loff_t offset) { struct page *page; @@ -528,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, static int copy_insn(struct uprobe *uprobe, struct file *filp) { - struct address_space *mapping; - unsigned long nbytes; - int bytes; - - nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); - mapping = uprobe->inode->i_mapping; + struct address_space *mapping = uprobe->inode->i_mapping; + loff_t offs = uprobe->offset; + void *insn = uprobe->arch.insn; + int size = MAX_UINSN_BYTES; + int len, err = -EIO; - /* Instruction at end of binary; copy only available bytes */ - if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) - bytes = uprobe->inode->i_size - uprobe->offset; - else - bytes = MAX_UINSN_BYTES; + /* Copy only available bytes, -EIO if nothing was read */ + do { + if (offs >= i_size_read(uprobe->inode)) + break; - /* Instruction at the page-boundary; copy bytes in second page */ - if (nbytes < bytes) { - int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, - bytes - nbytes, uprobe->offset + nbytes); + len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); + err = __copy_insn(mapping, filp, insn, len, offs); if (err) - return err; - bytes = nbytes; - } - return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); + break; + + insn += len; + offs += len; + size -= len; + } while (size); + + return err; } static int prepare_uprobe(struct uprobe *uprobe, struct file *file, -- cgit v1.1
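As an aside on that last patch: below is a minimal, stand-alone user-space sketch of the chunked-copy loop the new copy_insn() uses -- read at most MAX_UINSN_BYTES, never cross a page boundary in a single step, and stop early at end-of-file. This is not kernel code; copy_chunk(), copy_insn_like() and the in-memory "file" are made-up stand-ins for __copy_insn() and the page cache, shown only to illustrate the loop structure.

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE       4096UL
#define MAX_UINSN_BYTES 16

/* Stand-in for __copy_insn(): copy one chunk that never crosses a page. */
static int copy_chunk(const char *file, long file_size,
                      char *dst, int len, long offs)
{
        if (offs + len > file_size)
                len = file_size - offs;
        if (len <= 0)
                return -1;              /* nothing readable, like -EIO */
        memcpy(dst, file + offs, len);
        return 0;
}

/* Same loop shape as the patched copy_insn(): copy only available bytes. */
static int copy_insn_like(const char *file, long file_size,
                          char *insn, long offs)
{
        int size = MAX_UINSN_BYTES;
        int len, err = -1;

        do {
                if (offs >= file_size)
                        break;
                /* bytes left in the current page */
                len = (int)(PAGE_SIZE - (offs & (PAGE_SIZE - 1)));
                if (len > size)
                        len = size;
                err = copy_chunk(file, file_size, insn, len, offs);
                if (err)
                        break;
                insn += len;
                offs += len;
                size -= len;
        } while (size);

        return err;
}

int main(void)
{
        static char file[2 * PAGE_SIZE];
        char insn[MAX_UINSN_BYTES];

        memset(file, 0x90, sizeof(file));
        /* an "instruction" straddling the first page boundary */
        if (copy_insn_like(file, sizeof(file), insn, PAGE_SIZE - 3) == 0)
                printf("copied %d bytes across the page boundary\n",
                       MAX_UINSN_BYTES);
        return 0;
}

With an offset just before a page boundary, the loop performs two copies (3 bytes, then 13), which is exactly the case the old "Instruction at the page-boundary" special case tried, and failed, to handle when the offset lay beyond i_size.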