diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-11 13:22:43 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-11 13:22:43 -0700 |
commit | 4f0ac854167846bd55cd81dbc9a36e03708aa01c (patch) | |
tree | 0eb34d18a667f8e68ad9255f791560b028fed2a6 /arch | |
parent | b9356c53ba2f593081e5aa45eb67adcce243d1c0 (diff) | |
parent | 6b58e7f146f8d79c08f62087f928e1f01747de71 (diff) | |
download | op-kernel-dev-4f0ac854167846bd55cd81dbc9a36e03708aa01c.zip op-kernel-dev-4f0ac854167846bd55cd81dbc9a36e03708aa01c.tar.gz |
Merge branch 'perfcounters-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perfcounters-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (60 commits)
perf tools: Avoid unnecessary work in directory lookups
perf stat: Clean up statistics calculations a bit more
perf stat: More advanced variance computation
perf stat: Use stddev_mean in stead of stddev
perf stat: Remove the limit on repeat
perf stat: Change noise calculation to use stddev
x86, perf_counter, bts: Do not allow kernel BTS tracing for now
x86, perf_counter, bts: Correct pointer-to-u64 casts
x86, perf_counter, bts: Fail if BTS is not available
perf_counter: Fix output-sharing error path
perf trace: Fix read_string()
perf trace: Print out in nanoseconds
perf tools: Seek to the end of the header area
perf trace: Fix parsing of perf.data
perf trace: Sample timestamps as well
perf_counter: Introduce new (non-)paranoia level to allow raw tracepoint access
perf trace: Sample the CPU too
perf tools: Work around strict aliasing related warnings
perf tools: Clean up warnings list in the Makefile
perf tools: Complete support for dynamic strings
...
Diffstat (limited to 'arch')
-rw-r--r-- | arch/powerpc/include/asm/pgtable.h | 6 | ||||
-rw-r--r-- | arch/powerpc/kernel/Makefile | 2 | ||||
-rw-r--r-- | arch/powerpc/kernel/asm-offsets.c | 2 | ||||
-rw-r--r-- | arch/powerpc/kernel/exceptions-64s.S | 19 | ||||
-rw-r--r-- | arch/powerpc/kernel/perf_callchain.c | 527 | ||||
-rw-r--r-- | arch/powerpc/mm/slb.c | 37 | ||||
-rw-r--r-- | arch/powerpc/mm/stab.c | 11 | ||||
-rw-r--r-- | arch/x86/include/asm/perf_counter.h | 10 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_counter.c | 329 |
9 files changed, 921 insertions, 22 deletions
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index eb17da7..2a5da06 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -104,8 +104,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, else pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte)); -#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP) - /* Second case is 32-bit with 64-bit PTE in SMP mode. In this case, we +#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) + /* Second case is 32-bit with 64-bit PTE. In this case, we * can just store as long as we do the two halves in the right order * with a barrier in between. This is possible because we take care, * in the hash code, to pre-invalidate if the PTE was already hashed, @@ -140,7 +140,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, #else /* Anything else just stores the PTE normally. That covers all 64-bit - * cases, and 32-bit non-hash with 64-bit PTEs in UP mode + * cases, and 32-bit non-hash with 32-bit PTEs. */ *ptep = pte; #endif diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index b73396b..9619285 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -97,7 +97,7 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o -obj-$(CONFIG_PPC_PERF_CTRS) += perf_counter.o +obj-$(CONFIG_PPC_PERF_CTRS) += perf_counter.o perf_callchain.o obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ power5+-pmu.o power6-pmu.o power7-pmu.o obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 561b646..197b156 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -67,6 +67,8 @@ int main(void) DEFINE(MMCONTEXTID, offsetof(struct mm_struct, context.id)); #ifdef CONFIG_PPC64 DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context)); + DEFINE(SIGSEGV, SIGSEGV); + DEFINE(NMI_MASK, NMI_MASK); #else DEFINE(THREAD_INFO, offsetof(struct task_struct, stack)); #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index eb89811..8ac85e0 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -729,6 +729,11 @@ BEGIN_FTR_SECTION bne- do_ste_alloc /* If so handle it */ END_FTR_SECTION_IFCLR(CPU_FTR_SLB) + clrrdi r11,r1,THREAD_SHIFT + lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ + andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */ + bne 77f /* then don't call hash_page now */ + /* * On iSeries, we soft-disable interrupts here, then * hard-enable interrupts so that the hash_page code can spin on @@ -833,6 +838,20 @@ handle_page_fault: bl .low_hash_fault b .ret_from_except +/* + * We come here as a result of a DSI at a point where we don't want + * to call hash_page, such as when we are accessing memory (possibly + * user memory) inside a PMU interrupt that occurred while interrupts + * were soft-disabled. We want to invoke the exception handler for + * the access, or panic if there isn't a handler. + */ +77: bl .save_nvgprs + mr r4,r3 + addi r3,r1,STACK_FRAME_OVERHEAD + li r5,SIGSEGV + bl .bad_page_fault + b .ret_from_except + /* here we have a segment miss */ do_ste_alloc: bl .ste_allocate /* try to insert stab entry */ diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c new file mode 100644 index 0000000..f74b62c --- /dev/null +++ b/arch/powerpc/kernel/perf_callchain.c @@ -0,0 +1,527 @@ +/* + * Performance counter callchain support - powerpc architecture code + * + * Copyright © 2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/perf_counter.h> +#include <linux/percpu.h> +#include <linux/uaccess.h> +#include <linux/mm.h> +#include <asm/ptrace.h> +#include <asm/pgtable.h> +#include <asm/sigcontext.h> +#include <asm/ucontext.h> +#include <asm/vdso.h> +#ifdef CONFIG_PPC64 +#include "ppc32.h" +#endif + +/* + * Store another value in a callchain_entry. + */ +static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip) +{ + unsigned int nr = entry->nr; + + if (nr < PERF_MAX_STACK_DEPTH) { + entry->ip[nr] = ip; + entry->nr = nr + 1; + } +} + +/* + * Is sp valid as the address of the next kernel stack frame after prev_sp? + * The next frame may be in a different stack area but should not go + * back down in the same stack area. + */ +static int valid_next_sp(unsigned long sp, unsigned long prev_sp) +{ + if (sp & 0xf) + return 0; /* must be 16-byte aligned */ + if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD)) + return 0; + if (sp >= prev_sp + STACK_FRAME_OVERHEAD) + return 1; + /* + * sp could decrease when we jump off an interrupt stack + * back to the regular process stack. + */ + if ((sp & ~(THREAD_SIZE - 1)) != (prev_sp & ~(THREAD_SIZE - 1))) + return 1; + return 0; +} + +static void perf_callchain_kernel(struct pt_regs *regs, + struct perf_callchain_entry *entry) +{ + unsigned long sp, next_sp; + unsigned long next_ip; + unsigned long lr; + long level = 0; + unsigned long *fp; + + lr = regs->link; + sp = regs->gpr[1]; + callchain_store(entry, PERF_CONTEXT_KERNEL); + callchain_store(entry, regs->nip); + + if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD)) + return; + + for (;;) { + fp = (unsigned long *) sp; + next_sp = fp[0]; + + if (next_sp == sp + STACK_INT_FRAME_SIZE && + fp[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { + /* + * This looks like an interrupt frame for an + * interrupt that occurred in the kernel + */ + regs = (struct pt_regs *)(sp + STACK_FRAME_OVERHEAD); + next_ip = regs->nip; + lr = regs->link; + level = 0; + callchain_store(entry, PERF_CONTEXT_KERNEL); + + } else { + if (level == 0) + next_ip = lr; + else + next_ip = fp[STACK_FRAME_LR_SAVE]; + + /* + * We can't tell which of the first two addresses + * we get are valid, but we can filter out the + * obviously bogus ones here. We replace them + * with 0 rather than removing them entirely so + * that userspace can tell which is which. + */ + if ((level == 1 && next_ip == lr) || + (level <= 1 && !kernel_text_address(next_ip))) + next_ip = 0; + + ++level; + } + + callchain_store(entry, next_ip); + if (!valid_next_sp(next_sp, sp)) + return; + sp = next_sp; + } +} + +#ifdef CONFIG_PPC64 + +#ifdef CONFIG_HUGETLB_PAGE +#define is_huge_psize(pagesize) (HPAGE_SHIFT && mmu_huge_psizes[pagesize]) +#else +#define is_huge_psize(pagesize) 0 +#endif + +/* + * On 64-bit we don't want to invoke hash_page on user addresses from + * interrupt context, so if the access faults, we read the page tables + * to find which page (if any) is mapped and access it directly. + */ +static int read_user_stack_slow(void __user *ptr, void *ret, int nb) +{ + pgd_t *pgdir; + pte_t *ptep, pte; + int pagesize; + unsigned long addr = (unsigned long) ptr; + unsigned long offset; + unsigned long pfn; + void *kaddr; + + pgdir = current->mm->pgd; + if (!pgdir) + return -EFAULT; + + pagesize = get_slice_psize(current->mm, addr); + + /* align address to page boundary */ + offset = addr & ((1ul << mmu_psize_defs[pagesize].shift) - 1); + addr -= offset; + + if (is_huge_psize(pagesize)) + ptep = huge_pte_offset(current->mm, addr); + else + ptep = find_linux_pte(pgdir, addr); + + if (ptep == NULL) + return -EFAULT; + pte = *ptep; + if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER)) + return -EFAULT; + pfn = pte_pfn(pte); + if (!page_is_ram(pfn)) + return -EFAULT; + + /* no highmem to worry about here */ + kaddr = pfn_to_kaddr(pfn); + memcpy(ret, kaddr + offset, nb); + return 0; +} + +static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) +{ + if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) || + ((unsigned long)ptr & 7)) + return -EFAULT; + + if (!__get_user_inatomic(*ret, ptr)) + return 0; + + return read_user_stack_slow(ptr, ret, 8); +} + +static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) +{ + if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || + ((unsigned long)ptr & 3)) + return -EFAULT; + + if (!__get_user_inatomic(*ret, ptr)) + return 0; + + return read_user_stack_slow(ptr, ret, 4); +} + +static inline int valid_user_sp(unsigned long sp, int is_64) +{ + if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32) + return 0; + return 1; +} + +/* + * 64-bit user processes use the same stack frame for RT and non-RT signals. + */ +struct signal_frame_64 { + char dummy[__SIGNAL_FRAMESIZE]; + struct ucontext uc; + unsigned long unused[2]; + unsigned int tramp[6]; + struct siginfo *pinfo; + void *puc; + struct siginfo info; + char abigap[288]; +}; + +static int is_sigreturn_64_address(unsigned long nip, unsigned long fp) +{ + if (nip == fp + offsetof(struct signal_frame_64, tramp)) + return 1; + if (vdso64_rt_sigtramp && current->mm->context.vdso_base && + nip == current->mm->context.vdso_base + vdso64_rt_sigtramp) + return 1; + return 0; +} + +/* + * Do some sanity checking on the signal frame pointed to by sp. + * We check the pinfo and puc pointers in the frame. + */ +static int sane_signal_64_frame(unsigned long sp) +{ + struct signal_frame_64 __user *sf; + unsigned long pinfo, puc; + + sf = (struct signal_frame_64 __user *) sp; + if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) || + read_user_stack_64((unsigned long __user *) &sf->puc, &puc)) + return 0; + return pinfo == (unsigned long) &sf->info && + puc == (unsigned long) &sf->uc; +} + +static void perf_callchain_user_64(struct pt_regs *regs, + struct perf_callchain_entry *entry) +{ + unsigned long sp, next_sp; + unsigned long next_ip; + unsigned long lr; + long level = 0; + struct signal_frame_64 __user *sigframe; + unsigned long __user *fp, *uregs; + + next_ip = regs->nip; + lr = regs->link; + sp = regs->gpr[1]; + callchain_store(entry, PERF_CONTEXT_USER); + callchain_store(entry, next_ip); + + for (;;) { + fp = (unsigned long __user *) sp; + if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) + return; + if (level > 0 && read_user_stack_64(&fp[2], &next_ip)) + return; + + /* + * Note: the next_sp - sp >= signal frame size check + * is true when next_sp < sp, which can happen when + * transitioning from an alternate signal stack to the + * normal stack. + */ + if (next_sp - sp >= sizeof(struct signal_frame_64) && + (is_sigreturn_64_address(next_ip, sp) || + (level <= 1 && is_sigreturn_64_address(lr, sp))) && + sane_signal_64_frame(sp)) { + /* + * This looks like an signal frame + */ + sigframe = (struct signal_frame_64 __user *) sp; + uregs = sigframe->uc.uc_mcontext.gp_regs; + if (read_user_stack_64(&uregs[PT_NIP], &next_ip) || + read_user_stack_64(&uregs[PT_LNK], &lr) || + read_user_stack_64(&uregs[PT_R1], &sp)) + return; + level = 0; + callchain_store(entry, PERF_CONTEXT_USER); + callchain_store(entry, next_ip); + continue; + } + + if (level == 0) + next_ip = lr; + callchain_store(entry, next_ip); + ++level; + sp = next_sp; + } +} + +static inline int current_is_64bit(void) +{ + /* + * We can't use test_thread_flag() here because we may be on an + * interrupt stack, and the thread flags don't get copied over + * from the thread_info on the main stack to the interrupt stack. + */ + return !test_ti_thread_flag(task_thread_info(current), TIF_32BIT); +} + +#else /* CONFIG_PPC64 */ +/* + * On 32-bit we just access the address and let hash_page create a + * HPTE if necessary, so there is no need to fall back to reading + * the page tables. Since this is called at interrupt level, + * do_page_fault() won't treat a DSI as a page fault. + */ +static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) +{ + if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || + ((unsigned long)ptr & 3)) + return -EFAULT; + + return __get_user_inatomic(*ret, ptr); +} + +static inline void perf_callchain_user_64(struct pt_regs *regs, + struct perf_callchain_entry *entry) +{ +} + +static inline int current_is_64bit(void) +{ + return 0; +} + +static inline int valid_user_sp(unsigned long sp, int is_64) +{ + if (!sp || (sp & 7) || sp > TASK_SIZE - 32) + return 0; + return 1; +} + +#define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE +#define sigcontext32 sigcontext +#define mcontext32 mcontext +#define ucontext32 ucontext +#define compat_siginfo_t struct siginfo + +#endif /* CONFIG_PPC64 */ + +/* + * Layout for non-RT signal frames + */ +struct signal_frame_32 { + char dummy[__SIGNAL_FRAMESIZE32]; + struct sigcontext32 sctx; + struct mcontext32 mctx; + int abigap[56]; +}; + +/* + * Layout for RT signal frames + */ +struct rt_signal_frame_32 { + char dummy[__SIGNAL_FRAMESIZE32 + 16]; + compat_siginfo_t info; + struct ucontext32 uc; + int abigap[56]; +}; + +static int is_sigreturn_32_address(unsigned int nip, unsigned int fp) +{ + if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad)) + return 1; + if (vdso32_sigtramp && current->mm->context.vdso_base && + nip == current->mm->context.vdso_base + vdso32_sigtramp) + return 1; + return 0; +} + +static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp) +{ + if (nip == fp + offsetof(struct rt_signal_frame_32, + uc.uc_mcontext.mc_pad)) + return 1; + if (vdso32_rt_sigtramp && current->mm->context.vdso_base && + nip == current->mm->context.vdso_base + vdso32_rt_sigtramp) + return 1; + return 0; +} + +static int sane_signal_32_frame(unsigned int sp) +{ + struct signal_frame_32 __user *sf; + unsigned int regs; + + sf = (struct signal_frame_32 __user *) (unsigned long) sp; + if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, ®s)) + return 0; + return regs == (unsigned long) &sf->mctx; +} + +static int sane_rt_signal_32_frame(unsigned int sp) +{ + struct rt_signal_frame_32 __user *sf; + unsigned int regs; + + sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp; + if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, ®s)) + return 0; + return regs == (unsigned long) &sf->uc.uc_mcontext; +} + +static unsigned int __user *signal_frame_32_regs(unsigned int sp, + unsigned int next_sp, unsigned int next_ip) +{ + struct mcontext32 __user *mctx = NULL; + struct signal_frame_32 __user *sf; + struct rt_signal_frame_32 __user *rt_sf; + + /* + * Note: the next_sp - sp >= signal frame size check + * is true when next_sp < sp, for example, when + * transitioning from an alternate signal stack to the + * normal stack. + */ + if (next_sp - sp >= sizeof(struct signal_frame_32) && + is_sigreturn_32_address(next_ip, sp) && + sane_signal_32_frame(sp)) { + sf = (struct signal_frame_32 __user *) (unsigned long) sp; + mctx = &sf->mctx; + } + + if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) && + is_rt_sigreturn_32_address(next_ip, sp) && + sane_rt_signal_32_frame(sp)) { + rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp; + mctx = &rt_sf->uc.uc_mcontext; + } + + if (!mctx) + return NULL; + return mctx->mc_gregs; +} + +static void perf_callchain_user_32(struct pt_regs *regs, + struct perf_callchain_entry *entry) +{ + unsigned int sp, next_sp; + unsigned int next_ip; + unsigned int lr; + long level = 0; + unsigned int __user *fp, *uregs; + + next_ip = regs->nip; + lr = regs->link; + sp = regs->gpr[1]; + callchain_store(entry, PERF_CONTEXT_USER); + callchain_store(entry, next_ip); + + while (entry->nr < PERF_MAX_STACK_DEPTH) { + fp = (unsigned int __user *) (unsigned long) sp; + if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) + return; + if (level > 0 && read_user_stack_32(&fp[1], &next_ip)) + return; + + uregs = signal_frame_32_regs(sp, next_sp, next_ip); + if (!uregs && level <= 1) + uregs = signal_frame_32_regs(sp, next_sp, lr); + if (uregs) { + /* + * This looks like an signal frame, so restart + * the stack trace with the values in it. + */ + if (read_user_stack_32(&uregs[PT_NIP], &next_ip) || + read_user_stack_32(&uregs[PT_LNK], &lr) || + read_user_stack_32(&uregs[PT_R1], &sp)) + return; + level = 0; + callchain_store(entry, PERF_CONTEXT_USER); + callchain_store(entry, next_ip); + continue; + } + + if (level == 0) + next_ip = lr; + callchain_store(entry, next_ip); + ++level; + sp = next_sp; + } +} + +/* + * Since we can't get PMU interrupts inside a PMU interrupt handler, + * we don't need separate irq and nmi entries here. + */ +static DEFINE_PER_CPU(struct perf_callchain_entry, callchain); + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry = &__get_cpu_var(callchain); + + entry->nr = 0; + + if (current->pid == 0) /* idle task? */ + return entry; + + if (!user_mode(regs)) { + perf_callchain_kernel(regs, entry); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + if (current_is_64bit()) + perf_callchain_user_64(regs, entry); + else + perf_callchain_user_32(regs, entry); + } + + return entry; +} diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 5b7038f..a685652 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -92,15 +92,13 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize, : "memory" ); } -void slb_flush_and_rebolt(void) +static void __slb_flush_and_rebolt(void) { /* If you change this make sure you change SLB_NUM_BOLTED * appropriately too. */ unsigned long linear_llp, vmalloc_llp, lflags, vflags; unsigned long ksp_esid_data, ksp_vsid_data; - WARN_ON(!irqs_disabled()); - linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; lflags = SLB_VSID_KERNEL | linear_llp; @@ -117,12 +115,6 @@ void slb_flush_and_rebolt(void) ksp_vsid_data = get_slb_shadow()->save_area[2].vsid; } - /* - * We can't take a PMU exception in the following code, so hard - * disable interrupts. - */ - hard_irq_disable(); - /* We need to do this all in asm, so we're sure we don't touch * the stack between the slbia and rebolting it. */ asm volatile("isync\n" @@ -139,6 +131,21 @@ void slb_flush_and_rebolt(void) : "memory"); } +void slb_flush_and_rebolt(void) +{ + + WARN_ON(!irqs_disabled()); + + /* + * We can't take a PMU exception in the following code, so hard + * disable interrupts. + */ + hard_irq_disable(); + + __slb_flush_and_rebolt(); + get_paca()->slb_cache_ptr = 0; +} + void slb_vmalloc_update(void) { unsigned long vflags; @@ -180,12 +187,20 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2) /* Flush all user entries from the segment table of the current processor. */ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) { - unsigned long offset = get_paca()->slb_cache_ptr; + unsigned long offset; unsigned long slbie_data = 0; unsigned long pc = KSTK_EIP(tsk); unsigned long stack = KSTK_ESP(tsk); unsigned long unmapped_base; + /* + * We need interrupts hard-disabled here, not just soft-disabled, + * so that a PMU interrupt can't occur, which might try to access + * user memory (to get a stack trace) and possible cause an SLB miss + * which would update the slb_cache/slb_cache_ptr fields in the PACA. + */ + hard_irq_disable(); + offset = get_paca()->slb_cache_ptr; if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) && offset <= SLB_CACHE_ENTRIES) { int i; @@ -200,7 +215,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) } asm volatile("isync" : : : "memory"); } else { - slb_flush_and_rebolt(); + __slb_flush_and_rebolt(); } /* Workaround POWER5 < DD2.1 issue */ diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c index 98cd1dc..ab5fb48 100644 --- a/arch/powerpc/mm/stab.c +++ b/arch/powerpc/mm/stab.c @@ -164,7 +164,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm) { struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr; struct stab_entry *ste; - unsigned long offset = __get_cpu_var(stab_cache_ptr); + unsigned long offset; unsigned long pc = KSTK_EIP(tsk); unsigned long stack = KSTK_ESP(tsk); unsigned long unmapped_base; @@ -172,6 +172,15 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm) /* Force previous translations to complete. DRENG */ asm volatile("isync" : : : "memory"); + /* + * We need interrupts hard-disabled here, not just soft-disabled, + * so that a PMU interrupt can't occur, which might try to access + * user memory (to get a stack trace) and possible cause an STAB miss + * which would update the stab_cache/stab_cache_ptr per-cpu variables. + */ + hard_irq_disable(); + + offset = __get_cpu_var(stab_cache_ptr); if (offset <= NR_STAB_CACHE_ENTRIES) { int i; diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index fa64e40..e7b7c93 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,6 +84,16 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) +/* + * We model BTS tracing as another fixed-mode PMC. + * + * We choose a value in the middle of the fixed counter range, since lower + * values are used by actual fixed counters and higher values are used + * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. + */ +#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) + + #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(void); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 900332b..f9cd084 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -6,6 +6,7 @@ * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * * For licencing details see kernel-base/COPYING */ @@ -20,6 +21,7 @@ #include <linux/sched.h> #include <linux/uaccess.h> #include <linux/highmem.h> +#include <linux/cpu.h> #include <asm/apic.h> #include <asm/stacktrace.h> @@ -27,12 +29,52 @@ static u64 perf_counter_mask __read_mostly; +/* The maximal number of PEBS counters: */ +#define MAX_PEBS_COUNTERS 4 + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +/* The size of a per-cpu BTS buffer in bytes: */ +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024) + +/* The BTS overflow threshold in bytes from the end of the buffer: */ +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64) + + +/* + * Bits in the debugctlmsr controlling branch tracing. + */ +#define X86_DEBUGCTL_TR (1 << 6) +#define X86_DEBUGCTL_BTS (1 << 7) +#define X86_DEBUGCTL_BTINT (1 << 8) +#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) +#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; +}; + struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; int enabled; + struct debug_store *ds; }; /* @@ -58,6 +100,8 @@ struct x86_pmu { int apic; u64 max_period; u64 intel_ctrl; + void (*enable_bts)(u64 config); + void (*disable_bts)(void); }; static struct x86_pmu x86_pmu __read_mostly; @@ -577,6 +621,9 @@ x86_perf_counter_update(struct perf_counter *counter, u64 prev_raw_count, new_raw_count; s64 delta; + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* * Careful: an NMI might modify the previous counter value. * @@ -666,10 +713,110 @@ static void release_pmc_hardware(void) #endif } +static inline bool bts_available(void) +{ + return x86_pmu.enable_bts != NULL; +} + +static inline void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(unsigned long)ds), + (u32)((u64)(unsigned long)ds >> 32)); +} + +static inline void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_counters, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static void release_bts_hardware(void) +{ + int cpu; + + if (!bts_available()) + return; + + get_online_cpus(); + + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + + if (!ds) + continue; + + per_cpu(cpu_hw_counters, cpu).ds = NULL; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + kfree(ds); + } + + put_online_cpus(); +} + +static int reserve_bts_hardware(void) +{ + int cpu, err = 0; + + if (!bts_available()) + return 0; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + struct debug_store *ds; + void *buffer; + + err = -ENOMEM; + buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + break; + + ds = kzalloc(sizeof(*ds), GFP_KERNEL); + if (unlikely(!ds)) { + kfree(buffer); + break; + } + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = + ds->bts_buffer_base + BTS_BUFFER_SIZE; + ds->bts_interrupt_threshold = + ds->bts_absolute_maximum - BTS_OVFL_TH; + + per_cpu(cpu_hw_counters, cpu).ds = ds; + err = 0; + } + + if (err) + release_bts_hardware(); + else { + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); + + return err; +} + static void hw_perf_counter_destroy(struct perf_counter *counter) { if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { release_pmc_hardware(); + release_bts_hardware(); mutex_unlock(&pmc_reserve_mutex); } } @@ -712,6 +859,42 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) return 0; } +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + /* * Setup the hardware configuration for a given attr_type */ @@ -728,9 +911,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter) err = 0; if (!atomic_inc_not_zero(&active_counters)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) - err = -EBUSY; - else + if (atomic_read(&active_counters) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + err = reserve_bts_hardware(); + } + if (!err) atomic_inc(&active_counters); mutex_unlock(&pmc_reserve_mutex); } @@ -793,6 +980,20 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (config == -1LL) return -EINVAL; + /* + * Branch tracing: + */ + if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && + (hwc->sample_period == 1)) { + /* BTS is not supported by this architecture. */ + if (!bts_available()) + return -EOPNOTSUPP; + + /* BTS is currently only allowed for user-mode. */ + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + return -EOPNOTSUPP; + } + hwc->config |= config; return 0; @@ -817,7 +1018,18 @@ static void p6_pmu_disable_all(void) static void intel_pmu_disable_all(void) { + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); } static void amd_pmu_disable_all(void) @@ -875,7 +1087,25 @@ static void p6_pmu_enable_all(void) static void intel_pmu_enable_all(void) { + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_counter *counter = + cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!counter)) + return; + + intel_pmu_enable_bts(counter->hw.config); + } } static void amd_pmu_enable_all(void) @@ -962,6 +1192,11 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) static inline void intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc, idx); return; @@ -990,6 +1225,9 @@ x86_perf_counter_set_period(struct perf_counter *counter, s64 period = hwc->sample_period; int err, ret = 0; + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* * If we are way outside a reasoable range then just skip forward: */ @@ -1072,6 +1310,14 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_counters).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_enable_fixed(hwc, idx); return; @@ -1093,11 +1339,16 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; + event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely((event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (hwc->sample_period == 1))) + return X86_PMC_IDX_FIXED_BTS; + if (!x86_pmu.num_counters_fixed) return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) @@ -1118,7 +1369,15 @@ static int x86_pmu_enable(struct perf_counter *counter) int idx; idx = fixed_mode_idx(counter, hwc); - if (idx >= 0) { + if (idx == X86_PMC_IDX_FIXED_BTS) { + /* BTS is already occupied. */ + if (test_and_set_bit(idx, cpuc->used_mask)) + return -EAGAIN; + + hwc->config_base = 0; + hwc->counter_base = 0; + hwc->idx = idx; + } else if (idx >= 0) { /* * Try to get the fixed counter, if that is already taken * then try to get a generic counter: @@ -1229,6 +1488,44 @@ void perf_counter_print_debug(void) local_irq_restore(flags); } +static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, + struct perf_sample_data *data) +{ + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + unsigned long orig_ip = data->regs->ip; + struct bts_record *at, *top; + + if (!counter) + return; + + if (!ds) + return; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + ds->bts_index = ds->bts_buffer_base; + + for (; at < top; at++) { + data->regs->ip = at->from; + data->addr = at->to; + + perf_counter_output(counter, 1, data); + } + + data->regs->ip = orig_ip; + data->addr = 0; + + /* There's new data available. */ + counter->pending_kill = POLL_IN; +} + static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -1253,6 +1550,15 @@ static void x86_pmu_disable(struct perf_counter *counter) * that we are disabling: */ x86_perf_counter_update(counter, hwc, idx); + + /* Drain the remaining BTS records. */ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + struct perf_sample_data data; + struct pt_regs regs; + + data.regs = ®s; + intel_pmu_drain_bts_buffer(cpuc, &data); + } cpuc->counters[idx] = NULL; clear_bit(idx, cpuc->used_mask); @@ -1280,6 +1586,7 @@ static int intel_pmu_save_and_restart(struct perf_counter *counter) static void intel_pmu_reset(void) { + struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; unsigned long flags; int idx; @@ -1297,6 +1604,8 @@ static void intel_pmu_reset(void) for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); } + if (ds) + ds->bts_index = ds->bts_buffer_base; local_irq_restore(flags); } @@ -1362,6 +1671,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_counters); perf_disable(); + intel_pmu_drain_bts_buffer(cpuc, &data); status = intel_pmu_get_status(); if (!status) { perf_enable(); @@ -1571,6 +1881,8 @@ static struct x86_pmu intel_pmu = { * the generic counter period: */ .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, }; static struct x86_pmu amd_pmu = { @@ -1962,3 +2274,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } + +void hw_perf_counter_setup_online(int cpu) +{ + init_debug_store_on_cpu(cpu); +} |