From e40b17208b6805be50ffe891878662b6076206b9 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:03 -0500 Subject: x86: Move notify_die from nmi.c to traps.c In order to handle a new nmi_watchdog approach, I need to move the notify_die() routine out of nmi_watchdog_tick() and into default_do_nmi(). This lets me easily swap out the old nmi_watchdog with the new one with just a config change. The change probably makes sense from a high level perspective because the nmi_watchdog shouldn't be handling notify_die routines anyway. However, this move does change the semantics a little bit. Instead of checking on every nmi interrupt if the cpus are stuck, only check them on the nmi_watchdog interrupts. v2: Move notify_die call into #ifdef block Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-2-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/nmi.c | 7 ------- arch/x86/kernel/traps.c | 5 +++++ 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 0159a69..5d47682 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -400,13 +400,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) int cpu = smp_processor_id(); int rc = 0; - /* check for other users first */ - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - rc = 1; - touched = 1; - } - sum = get_timer_irqs(cpu); if (__get_cpu_var(nmi_touch)) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1168e44..51ef893f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -400,7 +400,12 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; + #ifdef CONFIG_X86_LOCAL_APIC + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; + /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. -- cgit v1.1 From 1fb9d6ad2766a1dd70d167552988375049a97f21 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:04 -0500 Subject: nmi_watchdog: Add new, generic implementation, using perf events This is a new generic nmi_watchdog implementation using the perf events infrastructure as suggested by Ingo. The implementation is simple: just create an in-kernel perf event and register an overflow handler to check for cpu lockups. I created a generic implementation that lives in kernel/ and the hardware specific part that for now lives in arch/x86. This approach has a number of advantages: - It simplifies the x86 PMU implementation in the long run, in that it removes the hardcoded low-level PMU implementation that was the NMI watchdog before. - It allows new NMI watchdog features to be added in a central place. - It allows other architectures to enable the NMI watchdog, as long as they have perf events (that provide NMIs) implemented. - It also allows for more graceful co-existence of existing perf events apps and the NMI watchdog - before these changes the relationship was exclusive. (The NMI watchdog will 'spend' a perf event when enabled. In later iterations we might be able to piggyback from an existing NMI event without having to allocate a hardware event for the NMI watchdog - turning this into a no-hardware-cost feature.)
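For a concrete picture of what "create an in-kernel perf event and register an overflow handler" means, here is a minimal sketch. It is modeled on the kernel/nmi_watchdog.c this patch adds but is not a quote of it: the wd_overflow() name, the attr values and the per-cpu bookkeeping are assumptions, hw_nmi_get_sample_period() is the arch hook a later patch in this series supplies, and the four-argument callback signature is the one this kernel generation uses.

	#include <linux/perf_event.h>
	#include <linux/percpu.h>
	#include <linux/err.h>

	static DEFINE_PER_CPU(struct perf_event *, nmi_watchdog_ev);

	u64 hw_nmi_get_sample_period(void);	/* arch hook, added later in this series */

	/* Runs in NMI context on counter overflow; the actual
	 * stuck-cpu check is sketched with the later patches. */
	static void wd_overflow(struct perf_event *event, int nmi,
				struct perf_sample_data *data,
				struct pt_regs *regs)
	{
		/* check whether this cpu is stuck, see below */
	}

	static struct perf_event_attr wd_hw_attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(struct perf_event_attr),
		.pinned		= 1,	/* keep it on the PMU at all times */
		.disabled	= 1,	/* enabled explicitly once created */
	};

	static int enable_nmi_watchdog(int cpu)
	{
		struct perf_event *event;

		/* overflow -> NMI every few seconds' worth of cycles */
		wd_hw_attr.sample_period = hw_nmi_get_sample_period();

		event = perf_event_create_kernel_counter(&wd_hw_attr, cpu,
							 -1, wd_overflow);
		if (IS_ERR(event))
			return PTR_ERR(event);

		per_cpu(nmi_watchdog_ev, cpu) = event;
		perf_event_enable(event);
		return 0;
	}

One pinned event per cpu is enough: once enabled, the counter overflows and raises an NMI no matter what that cpu is doing, which is the whole point.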
As for compatibility, we'll keep the old NMI watchdog code as well until the new one can 100% replace it on all CPUs, old and new alike. That might take some time as the NMI watchdog has been ported to many CPU models. I have done light testing to make sure the framework works correctly and it does. v2: Set the correct timeout values based on the old nmi watchdog Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-3-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 114 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 arch/x86/kernel/apic/hw_nmi.c (limited to 'arch') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c new file mode 100644 index 0000000..8c0e6a4 --- /dev/null +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -0,0 +1,114 @@ +/* + * HW NMI watchdog support + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Arch specific calls to support NMI watchdog + * + * Bits copied from original nmi.c file + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; + +static DEFINE_PER_CPU(unsigned, last_irq_sum); + +/* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ +static inline unsigned int get_timer_irqs(int cpu) +{ + return per_cpu(irq_stat, cpu).apic_timer_irqs + + per_cpu(irq_stat, cpu).irq0_irqs; +} + +static inline int mce_in_progress(void) +{ +#if defined(CONFIG_X86_MCE) + return atomic_read(&mce_entry) > 0; +#endif + return 0; +} + +int hw_nmi_is_cpu_stuck(struct pt_regs *regs) +{ + unsigned int sum; + int cpu = smp_processor_id(); + + /* FIXME: cheap hack for this check, probably should get its own + * die_notifier handler + */ + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); + dump_stack(); + spin_unlock(&lock); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + } + + /* if we are doing an mce, just assume the cpu is not stuck */ + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + return 0; + + /* We determine if the cpu is stuck by checking whether any + * interrupts have happened since we last checked. 
Of course + * an nmi storm could create false positives, but the higher + * level logic should account for that + */ + sum = get_timer_irqs(cpu); + if (__get_cpu_var(last_irq_sum) == sum) { + return 1; + } else { + __get_cpu_var(last_irq_sum) = sum; + return 0; + } +} + +void arch_trigger_all_cpu_backtrace(void) +{ + int i; + + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpumask_empty(to_cpumask(backtrace_mask))) + break; + mdelay(1); + } +} + +/* STUB calls to mimic old nmi_watchdog behaviour */ +unsigned int nmi_watchdog = NMI_NONE; +EXPORT_SYMBOL(nmi_watchdog); +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ +EXPORT_SYMBOL(nmi_active); +int nmi_watchdog_enabled; +int unknown_nmi_panic; +void cpu_nmi_set_wd_enabled(void) { return; } +void acpi_nmi_enable(void) { return; } +void acpi_nmi_disable(void) { return; } +void stop_apic_nmi_watchdog(void *unused) { return; } +void setup_apic_nmi_watchdog(void *unused) { return; } +int __init check_nmi_watchdog(void) { return 0; } -- cgit v1.1 From 84e478c6f1eb9c4bfa1fff2f8108e9a061b46428 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:05 -0500 Subject: nmi_watchdog: Config option to enable new nmi_watchdog These are the bits that enable the new nmi_watchdog and safely isolate the old nmi_watchdog. Only one or the other can run, not both at the same time. Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-4-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/Makefile | 7 ++++++- arch/x86/kernel/traps.c | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 565c1bf..1a4512e 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,7 +2,12 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o +ifneq ($(CONFIG_NMI_WATCHDOG),y) +obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o +endif +obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o + obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 51ef893f..973cbc4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -406,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) == NOTIFY_STOP) return; +#ifndef CONFIG_NMI_WATCHDOG /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -413,6 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, cpu)) +#endif /* !CONFIG_NMI_WATCHDOG */ unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); -- cgit v1.1 From c3128fb6ad39b0edda6675d20585a64846cf89ea Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Feb 2010 17:19:18 -0500 Subject: nmi_watchdog: Use a boolean config flag for compiling Determines if an arch has setup arch specific perf_events and nmi_watchdog code. This should restrict compiles to only those arches ready. 
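For orientation, the generic half that these config options build (kernel/nmi_watchdog.c, which this arch-limited log does not show) is the consumer of the hw_nmi_is_cpu_stuck() hook above. Its overflow handler, reduced to the essentials, with the touch flag name assumed and includes as in the earlier sketch, looks roughly like this:

	int hw_nmi_is_cpu_stuck(struct pt_regs *regs);	/* the arch hook above */

	static DEFINE_PER_CPU(int, nmi_watchdog_touch);	/* set by touch_nmi_watchdog() */

	static void wd_overflow(struct perf_event *event, int nmi,
				struct perf_sample_data *data,
				struct pt_regs *regs)
	{
		int cpu = smp_processor_id();

		/* a recent touch means "known busy, skip one period" */
		if (per_cpu(nmi_watchdog_touch, cpu)) {
			per_cpu(nmi_watchdog_touch, cpu) = 0;
			return;
		}

		/* arch heuristic: no timer irqs since the last overflow? */
		if (hw_nmi_is_cpu_stuck(regs))
			panic("NMI watchdog: cpu %d appears stuck\n", cpu);
	}

(The real handler warns and counts before taking such drastic action; the point here is only the division of labor between the generic and the arch code.)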
Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266013161-31197-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cbcbfde..4f9685f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -52,6 +52,7 @@ config X86 select HAVE_KERNEL_LZO select HAVE_HW_BREAKPOINT select PERF_EVENTS + select PERF_EVENTS_NMI select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER -- cgit v1.1 From 504d7cf10ee42bb76b9556859f23d4121dee0a77 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Feb 2010 17:19:19 -0500 Subject: nmi_watchdog: Compile and portability fixes The original patch was x86_64 centric. Changed the code to make it less so. Tested by building and running on a powerpc. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266013161-31197-2-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 2 ++ arch/x86/kernel/apic/hw_nmi.c | 21 ++++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 93da9c3..5b41b0f 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); +#if !defined(CONFIG_NMI_WATCHDOG) extern int nmi_watchdog_enabled; +#endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 8c0e6a4..312d772 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -32,8 +32,13 @@ static DEFINE_PER_CPU(unsigned, last_irq_sum); */ static inline unsigned int get_timer_irqs(int cpu) { - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; + unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs; + +#if defined(CONFIG_X86_LOCAL_APIC) + irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; +#endif + + return irqs; } static inline int mce_in_progress(void) @@ -82,6 +87,11 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } } +u64 hw_nmi_get_sample_period(void) +{ + return cpu_khz * 1000; +} + void arch_trigger_all_cpu_backtrace(void) { int i; @@ -100,15 +110,16 @@ void arch_trigger_all_cpu_backtrace(void) } /* STUB calls to mimic old nmi_watchdog behaviour */ +#if defined(CONFIG_X86_LOCAL_APIC) unsigned int nmi_watchdog = NMI_NONE; EXPORT_SYMBOL(nmi_watchdog); +void acpi_nmi_enable(void) { return; } +void acpi_nmi_disable(void) { return; } +#endif atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ EXPORT_SYMBOL(nmi_active); -int nmi_watchdog_enabled; int unknown_nmi_panic; void cpu_nmi_set_wd_enabled(void) { return; } -void acpi_nmi_enable(void) { return; } -void acpi_nmi_disable(void) { return; } void stop_apic_nmi_watchdog(void *unused) { return; } void setup_apic_nmi_watchdog(void *unused) { return; } int __init check_nmi_watchdog(void) { return 0; } -- cgit v1.1 From 2cc4452bc31fc1cde6f0b64a4eb13269f982787d Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 18 Feb 2010 21:56:52 -0500 Subject: nmi_watchdog: Fix undefined 'apic' build bug Ingo
provided me a config that fails to compile with: arch/x86/built-in.o: In function `arch_trigger_all_cpu_backtrace': (.text+0x17e78): undefined reference to `apic' make: *** [.tmp_vmlinux1] Error 1 I realized I changed the compile behaviour of the nmi code by not wrapping it with CONFIG_LOCAL_APIC. To fix this I add a compile check for ARCH_HAS_NMI_WATCHDOG around arch_trigger_all_cpu_backtrace. Signed-off-by: Don Zickus Cc: a.p.zijlstra@chello.nl Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266548212-24243-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 312d772..0b4d205 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -92,6 +92,7 @@ u64 hw_nmi_get_sample_period(void) return cpu_khz * 1000; } +#ifdef ARCH_HAS_NMI_WATCHDOG void arch_trigger_all_cpu_backtrace(void) { int i; @@ -108,6 +109,7 @@ void arch_trigger_all_cpu_backtrace(void) mdelay(1); } } +#endif /* STUB calls to mimic old nmi_watchdog behaviour */ #if defined(CONFIG_X86_LOCAL_APIC) -- cgit v1.1 From 47195d57636604ff6048b0d7aa3e4ed9643f6073 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 22 Feb 2010 18:09:03 -0500 Subject: nmi_watchdog: Clean up various small details Mostly copy/paste whitespace damage with a couple of nitpicks by the checkpatch script. Fix the struct definition as requested by Ingo too. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266880143-24943-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar -- arch/x86/kernel/apic/hw_nmi.c | 14 +++++------ arch/x86/kernel/traps.c | 6 ++-- include/linux/nmi.h | 2 - kernel/nmi_watchdog.c | 51 ++++++++++++++++++++---------------------- 4 files changed, 36 insertions(+), 37 deletions(-) --- arch/x86/kernel/apic/hw_nmi.c | 14 +++++++------- arch/x86/kernel/traps.c | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 0b4d205..e8b78a0 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -38,15 +38,15 @@ static inline unsigned int get_timer_irqs(int cpu) irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; #endif - return irqs; + return irqs; } static inline int mce_in_progress(void) { #if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; + return atomic_read(&mce_entry) > 0; #endif - return 0; + return 0; } int hw_nmi_is_cpu_stuck(struct pt_regs *regs) @@ -69,9 +69,9 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } /* if we are doing an mce, just assume the cpu is not stuck */ - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - return 0; + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + return 0; /* We determine if the cpu is stuck by checking whether any * interrupts have happened since we last checked. 
Of course @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return cpu_khz * 1000; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 973cbc4..bdc7fab 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -402,9 +402,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) return; #ifdef CONFIG_X86_LOCAL_APIC - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; #ifndef CONFIG_NMI_WATCHDOG /* -- cgit v1.1 From 45c34e05c4e3d36e7c44e790241ea11a1d90d54e Mon Sep 17 00:00:00 2001 From: John Villalovos Date: Fri, 7 May 2010 12:41:40 -0400 Subject: Oprofile: Change CPUIDS from decimal to hex, and add some comments Back when the patch was submitted for "Add Xeon 7500 series support to oprofile", Robert Richter had asked for a followon patch that converted all the CPU ID values to hex. I have done that here for the "i386/core_i7" and "i386/atom" class processors in the ppro_init() function and also added some comments on where to find documentation on the Intel processors. Signed-off-by: John L. Villalovos Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b28d2f1..1ba67dc 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -634,6 +634,18 @@ static int __init ppro_init(char **cpu_type) if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; + /* + * Documentation on identifying Intel processors by CPU family + * and model can be found in the Intel Software Developer's + * Manuals (SDM): + * + * http://www.intel.com/products/processor/manuals/ + * + * As of May 2010 the documentation for this was in the: + * "Intel 64 and IA-32 Architectures Software Developer's + * Manual Volume 3B: System Programming Guide", "Table B-1 + * CPUID Signature Values of DisplayFamily_DisplayModel". + */ switch (cpu_model) { case 0 ... 2: *cpu_type = "i386/ppro"; @@ -655,12 +667,12 @@ static int __init ppro_init(char **cpu_type) case 15: case 23: *cpu_type = "i386/core_2"; break; + case 0x1a: case 0x2e: - case 26: spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; break; - case 28: + case 0x1c: *cpu_type = "i386/atom"; break; default: -- cgit v1.1 From 58687acba59266735adb8ccd9b5b9aa2c7cd205b Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:44 -0400 Subject: lockup_detector: Combine nmi_watchdog and softlockup detector The new nmi_watchdog (which uses the perf event subsystem) is very similar in structure to the softlockup detector. Using Ingo's suggestion, I combined the two functionalities into one file: kernel/watchdog.c. Now both the nmi_watchdog (or hardlockup detector) and softlockup detector sit on top of the perf event subsystem, which is run every 60 seconds or so to see if there are any lockups. To detect hardlockups, cpus not responding to interrupts, I implemented an hrtimer that runs 5 times for every perf event overflow event. If that stops counting on a cpu, then the cpu is most likely in trouble. To detect softlockups, tasks not yielding to the scheduler, I used the previous kthread idea that now gets kicked every time the hrtimer fires. 
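Concretely, a single per-cpu hrtimer callback serves both detectors; a compressed sketch, with names modeled on (not quoted from) the kernel/watchdog.c this patch introduces:

	#include <linux/hrtimer.h>
	#include <linux/percpu.h>
	#include <linux/sched.h>

	static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
	static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
	static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);	/* kthread, created at init */
	static u64 kick_period_ns;	/* a fifth of the perf sample window, per the 5:1 ratio above */

	static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
	{
		/* proof of life for the hardlockup (NMI) side */
		__get_cpu_var(hrtimer_interrupts)++;

		/* kick the softlockup kthread */
		wake_up_process(__get_cpu_var(softlockup_watchdog));

		hrtimer_forward_now(hrtimer, ns_to_ktime(kick_period_ns));
		return HRTIMER_RESTART;
	}

	/* called from the perf NMI: did the hrtimer count move at all? */
	static int is_hardlockup(void)
	{
		unsigned long hrint = __get_cpu_var(hrtimer_interrupts);

		if (__get_cpu_var(hrtimer_interrupts_saved) == hrint)
			return 1;	/* interrupts are dead on this cpu */

		__get_cpu_var(hrtimer_interrupts_saved) = hrint;
		return 0;
	}

The softlockup half is the mirror image of this, taken in process context: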
If the kthread isn't being scheduled neither is anyone else and the warning is printed to the console. I tested this on x86_64 and both the softlockup and hardlockup paths work. V2: - cleaned up the Kconfig and softlockup combination - surrounded hardlockup cases with #ifdef CONFIG_PERF_EVENTS_NMI - separated out the softlockup case from perf event subsystem - re-arranged the enabling/disabling nmi watchdog from proc space - added cpumasks for hardlockup failure cases - removed fallback to soft events if no PMU exists for hard events V3: - comment cleanups - drop support for older softlockup code - per_cpu cleanups - completely remove software clock base hardlockup detector - use per_cpu masking on hard/soft lockup detection - #ifdef cleanups - rename config option NMI_WATCHDOG to LOCKUP_DETECTOR - documentation additions V4: - documentation fixes - convert per_cpu to __get_cpu_var - powerpc compile fixes V5: - split apart warn flags for hard and soft lockups TODO: - figure out how to make an arch-agnostic clock2cycles call (if possible) to feed into perf events as a sample period [fweisbec: merged conflict patch] Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-2-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/nmi.h | 2 +- arch/x86/kernel/apic/Makefile | 4 ++-- arch/x86/kernel/apic/hw_nmi.c | 2 +- arch/x86/kernel/traps.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 5b41b0f..932f0f8 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); -#if !defined(CONFIG_NMI_WATCHDOG) +#if !defined(CONFIG_LOCKUP_DETECTOR) extern int nmi_watchdog_enabled; #endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 1a4512e..52f32e0 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_NMI_WATCHDOG),y) +ifneq ($(CONFIG_LOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o +obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index e8b78a0..79425f9 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return (u64)(cpu_khz) * 1000 * 60; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bdc7fab..bd347c2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -406,7 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) == NOTIFY_STOP) return; -#ifndef CONFIG_NMI_WATCHDOG +#ifndef CONFIG_LOCKUP_DETECTOR /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog.
@@ -414,7 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, cpu)) -#endif /* !CONFIG_NMI_WATCHDOG */ +#endif /* !CONFIG_LOCKUP_DETECTOR */ unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); -- cgit v1.1 From 7cbb7e7fa46f6e5229438ac9e4a5c72ec0d53e0b Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:48 -0400 Subject: x86: Move trigger_all_cpu_backtrace to its own die_notifier As part of the transition of the nmi watchdog to something more generic, the trigger_all_cpu_backtrace code is getting left behind. Put it in its own die_notifier so it can still be used. V2: - use arch_spin_locks Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-6-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/hw_nmi.c | 65 +++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 79425f9..8c3edfb 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -17,6 +17,10 @@ #include #include #include +#include +#include +#include + #include #include @@ -54,20 +58,6 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) unsigned int sum; int cpu = smp_processor_id(); - /* FIXME: cheap hack for this check, probably should get its own - * die_notifier handler - */ - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ - - spin_lock(&lock); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - dump_stack(); - spin_unlock(&lock); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - } - /* if we are doing an mce, just assume the cpu is not stuck */ /* Could check oops_in_progress here too, but it's safer not to */ if (mce_in_progress()) @@ -109,6 +99,53 @@ void arch_trigger_all_cpu_backtrace(void) mdelay(1); } } + +static int __kprobes +arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + int cpu = smp_processor_id(); + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + + arch_spin_lock(&lock); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); + dump_stack(); + arch_spin_unlock(&lock); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + return NOTIFY_STOP; + } + + return NOTIFY_DONE; +} + +static __read_mostly struct notifier_block backtrace_notifier = { + .notifier_call = arch_trigger_all_cpu_backtrace_handler, + .next = NULL, + .priority = 1 +}; + +static int __init register_trigger_all_cpu_backtrace(void) +{ + register_die_notifier(&backtrace_notifier); + return 0; +} +early_initcall(register_trigger_all_cpu_backtrace); #endif /* STUB calls to mimic old nmi_watchdog behaviour */ -- cgit v1.1 From 10f9014912a2b1cb59c39cdea777e6d9afa8f17e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:49 -0400 Subject: x86: Cleanup hw_nmi.c cruft The design of the hardlockup watchdog has changed and cruft was left behind in the hw_nmi.c file. 
Just remove the code that isn't used anymore. Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-7-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/hw_nmi.c | 58 ------------------------------------------ 1 file changed, 58 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 8c3edfb..3b40082 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -9,74 +9,16 @@ * */ -#include -#include #include -#include -#include -#include -#include -#include #include #include #include - - #include #include /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static DEFINE_PER_CPU(unsigned, last_irq_sum); - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs; - -#if defined(CONFIG_X86_LOCAL_APIC) - irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; -#endif - - return irqs; -} - -static inline int mce_in_progress(void) -{ -#if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; -#endif - return 0; -} - -int hw_nmi_is_cpu_stuck(struct pt_regs *regs) -{ - unsigned int sum; - int cpu = smp_processor_id(); - - /* if we are doing an mce, just assume the cpu is not stuck */ - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - return 0; - - /* We determine if the cpu is stuck by checking whether any - * interrupts have happened since we last checked. Of course - * an nmi storm could create false positives, but the higher - * level logic should account for that - */ - sum = get_timer_irqs(cpu); - if (__get_cpu_var(last_irq_sum) == sum) { - return 1; - } else { - __get_cpu_var(last_irq_sum) = sum; - return 0; - } -} - u64 hw_nmi_get_sample_period(void) { return (u64)(cpu_khz) * 1000 * 60; -- cgit v1.1 From 5e85391b3badd3f0e50ebdd0cafe0202a979f73a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 May 2010 09:12:39 +0200 Subject: x86, watchdog: Fix build error in hw_nmi.c On some configs the following build error triggers: arch/x86/kernel/apic/hw_nmi.c:35: error: 'apic' undeclared (first use in this function) arch/x86/kernel/apic/hw_nmi.c:35: error: (Each undeclared identifier is reported only once arch/x86/kernel/apic/hw_nmi.c:35: error: for each function it appears in.) Because asm/apic.h was only included implicitly. Include it explicitly. Cc: Frederic Weisbecker Cc: Don Zickus Cc: Peter Zijlstra Cc: Cyrill Gorcunov LKML-Reference: <1273713674-8434-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 3b40082..cefd694 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -8,6 +8,7 @@ * Bits copied from original nmi.c file * */ +#include #include #include -- cgit v1.1 From c01d4323309a90a298fd81cf3a059ee1b12be2e9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 15 May 2010 22:57:48 +0200 Subject: lockup_detector: Adapt CONFIG_PERF_EVENT_NMI to other archs CONFIG_PERF_EVENT_NMI is something that needs to be enabled from the arch.
This is fine on x86 as PERF_EVENTS is builtin but if other archs select it, they will need to handle the PERF_EVENTS dependency. Instead, handle the dependency in the generic layer: - archs need to tell what they support through HAVE_PERF_EVENTS_NMI - Enable magically PERF_EVENTS_NMI if we have PERF_EVENTS and HAVE_PERF_EVENTS_NMI. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Don Zickus Cc: Cyrill Gorcunov --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/Kconfig b/arch/Kconfig index e5eb133..89b0efb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -145,4 +145,7 @@ config HAVE_HW_BREAKPOINT config HAVE_USER_RETURN_NOTIFIER bool +config HAVE_PERF_EVENTS_NMI + bool + source "kernel/gcov/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3cb28cd..3cb5bb0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -54,7 +54,7 @@ config X86 select HAVE_KERNEL_LZO select HAVE_HW_BREAKPOINT select PERF_EVENTS - select PERF_EVENTS_NMI + select HAVE_PERF_EVENTS_NMI select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER -- cgit v1.1 From 23637d477c1f53acbb176a02c241d60a25888fae Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 15 May 2010 23:15:20 +0200 Subject: lockup_detector: Introduce CONFIG_HARDLOCKUP_DETECTOR This new config is deemed to simplify even more the lockup detector dependencies and can make it easier to bring a smooth sorting between archs that support the new generic lockup detector and those that still have their own, especially for those that are in the middle of this migration. Instead of checking whether we have CONFIG_LOCKUP_DETECTOR + CONFIG_PERF_EVENTS_NMI each time an arch wants to know if it needs to build its own lockup detector, take a shortcut with this new config. It is enabled only if the hardlockup detection part of the whole lockup detector is on. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Don Zickus Cc: Cyrill Gorcunov --- arch/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/Kconfig b/arch/Kconfig index 89b0efb..35084f2 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -147,5 +147,9 @@ config HAVE_USER_RETURN_NOTIFIER config HAVE_PERF_EVENTS_NMI bool + help + System hardware can generate an NMI using the perf event + subsystem. Also has support for calculating CPU cycle events + to determine how many clock cycles in a given period. source "kernel/gcov/Kconfig" -- cgit v1.1 From cafcd80d216bc2136b8edbb794327e495792c666 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 14 May 2010 11:11:21 -0400 Subject: lockup_detector: Cross arch compile fixes Combining the softlockup and hardlockup code causes watchdog.c to build even without the hardlockup detection support. So if an arch, that has the previous and the new nmi watchdog implementations cohabiting, wants to know if the generic one is in use, CONFIG_LOCKUP_DETECTOR is not a reliable check. We need to use CONFIG_HARDLOCKUP_DETECTOR instead. 
Fixes: kernel/built-in.o: In function `touch_nmi_watchdog': (.text+0x449bc): multiple definition of `touch_nmi_watchdog' arch/sparc/kernel/built-in.o:(.text+0x11b28): first defined here Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Don Zickus Cc: Cyrill Gorcunov LKML-Reference: <20100514151121.GR15159@redhat.com> [ use CONFIG_HARDLOCKUP_DETECTOR instead of CONFIG_PERF_EVENTS_NMI] Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52f32e0..910f20b4 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_LOCKUP_DETECTOR),y) +ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o -- cgit v1.1 From b12eab1a796a306caef7046b21a76efa35f5f489 Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Tue, 1 Jun 2010 15:43:34 -0400 Subject: powerpc/oprofile: fix potential buffer overrun in op_model_cell.c Fix potential initial_lfsr buffer overrun. Writing past the end of the buffer could happen when index == ENTRIES Signed-off-by: Denis Kirjanov Cc: stable@kernel.org Signed-off-by: Robert Richter --- arch/powerpc/oprofile/op_model_cell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c index 2c9e522..7fd90d0 100644 --- a/arch/powerpc/oprofile/op_model_cell.c +++ b/arch/powerpc/oprofile/op_model_cell.c @@ -1077,7 +1077,7 @@ static int calculate_lfsr(int n) index = ENTRIES-1; /* make sure index is valid */ - if ((index > ENTRIES) || (index < 0)) + if ((index >= ENTRIES) || (index < 0)) index = ENTRIES-1; return initial_lfsr[index]; -- cgit v1.1 From e768aee89c687a50e6a2110e30c5cae1fbf0d2da Mon Sep 17 00:00:00 2001 From: Livio Soares Date: Thu, 3 Jun 2010 15:00:31 -0400 Subject: perf, x86: Small fix to cpuid10_edx Fixes to 'cpuid10_edx' to comply with Intel documentation. According to the Intel Manual, Volume 2A, Table 3-12, the cpuid for architecture performance monitoring returns, in EDX, two pieces of information: 1) Number of fixed-function counters (5 bits, not 4) 2) Width of fixed-function counters (8 bits) Signed-off-by: Livio Soares Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Arjan van de Ven Cc: "H. 
Peter Anvin" LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 254883d..6ed3ae4 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -68,8 +68,9 @@ union cpuid10_eax { union cpuid10_edx { struct { - unsigned int num_counters_fixed:4; - unsigned int reserved:28; + unsigned int num_counters_fixed:5; + unsigned int bit_width_fixed:8; + unsigned int reserved:19; } split; unsigned int full; }; -- cgit v1.1 From c9cf4dbb4d9ca715d8fedf13301a53296429abc6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 19 May 2010 21:35:17 +0200 Subject: x86: Unify dumpstack.h and stacktrace.h arch/x86/include/asm/stacktrace.h and arch/x86/kernel/dumpstack.h declare headers of objects that deal with the same topic. Actually most of the files that include stacktrace.h also include dumpstack.h Although dumpstack.h seems more reserved for internals of stack traces, those are quite often needed to define specialized stack trace operations. And perf event arch headers are going to need access to such low level operations anyway. So don't continue to bother with dumpstack.h as it's not anymore about isolated deep internals. v2: fix struct stack_frame definition conflict in sysprof Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Soeren Sandmann --- arch/x86/include/asm/stacktrace.h | 52 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event.c | 2 -- arch/x86/kernel/dumpstack.c | 1 - arch/x86/kernel/dumpstack.h | 56 --------------------------------------- arch/x86/kernel/dumpstack_32.c | 2 -- arch/x86/kernel/dumpstack_64.c | 1 - arch/x86/kernel/stacktrace.c | 7 ++--- 7 files changed, 56 insertions(+), 65 deletions(-) delete mode 100644 arch/x86/kernel/dumpstack.h (limited to 'arch') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 4dab78e..a957463 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -1,6 +1,13 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ + #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H +#include + extern int kstack_depth_to_print; struct thread_info; @@ -42,4 +49,49 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data); +#ifdef CONFIG_X86_32 +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) +#else +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) +#endif + +extern void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl); + +extern void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl); + +extern unsigned int code_bytes; + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +struct stack_frame_ia32 { + u32 next_frame; + u32 return_address; +}; + +static inline unsigned long rewind_frame_pointer(int n) +{ + struct stack_frame *frame; + + get_bp(frame); + +#ifdef CONFIG_FRAME_POINTER + while (n--) { + if (probe_kernel_address(&frame->next_frame, 
frame)) + break; + } +#endif + + return (unsigned long)frame; +} + #endif /* _ASM_X86_STACKTRACE_H */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c775860..9632fb6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1585,8 +1585,6 @@ static const struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack_bp, }; -#include "../dumpstack.h" - static void perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c89a386..6e8752c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -18,7 +18,6 @@ #include -#include "dumpstack.h" int panic_on_unrecovered_nmi; int panic_on_io_nmi; diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h deleted file mode 100644 index e1a93be..0000000 --- a/arch/x86/kernel/dumpstack.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - */ - -#ifndef DUMPSTACK_H -#define DUMPSTACK_H - -#ifdef CONFIG_X86_32 -#define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) -#else -#define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) -#endif - -#include - -extern void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl); - -extern void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl); - -extern unsigned int code_bytes; - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -struct stack_frame_ia32 { - u32 next_frame; - u32 return_address; -}; - -static inline unsigned long rewind_frame_pointer(int n) -{ - struct stack_frame *frame; - - get_bp(frame); - -#ifdef CONFIG_FRAME_POINTER - while (n--) { - if (probe_kernel_address(&frame->next_frame, frame)) - break; - } -#endif - - return (unsigned long)frame; -} - -#endif /* DUMPSTACK_H */ diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 11540a1..0f6376f 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,8 +16,6 @@ #include -#include "dumpstack.h" - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 272c9f1..57a21f1 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,7 +16,6 @@ #include -#include "dumpstack.h" #define N_EXCEPTION_STACKS_END \ (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 922eefb..ea54d02 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ -struct stack_frame { +struct stack_frame_user { const void __user *next_fp; unsigned long ret_addr; }; -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +static int +copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; @@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) 
trace->entries[trace->nr_entries++] = regs->ip; while (trace->nr_entries < trace->max_entries) { - struct stack_frame frame; + struct stack_frame_user frame; frame.next_fp = NULL; frame.ret_addr = 0; -- cgit v1.1 From b0f82b81fe6bbcf78d478071f33e44554726bc81 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 20 May 2010 07:47:21 +0200 Subject: perf: Drop the skip argument from perf_arch_fetch_regs_caller Drop this argument now that we always want to rewind only to the state of the first caller. It means frame pointers are not necessary anymore to reliably get the source of an event. But this also means we need this helper to be a macro now, as an inline function is not an option since we need to know when to provide a default implementation. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul Mackerras Cc: David Miller Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- arch/powerpc/include/asm/perf_event.h | 12 ++++++++++++ arch/powerpc/kernel/misc.S | 26 -------------------------- arch/sparc/include/asm/perf_event.h | 8 ++++++++ arch/sparc/kernel/helpers.S | 6 +++--- arch/x86/include/asm/perf_event.h | 13 +++++++++++++ arch/x86/include/asm/stacktrace.h | 7 ++----- arch/x86/kernel/cpu/perf_event.c | 16 ---------------- 7 files changed, 38 insertions(+), 50 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h index e6d4ce6..5c16b89 100644 --- a/arch/powerpc/include/asm/perf_event.h +++ b/arch/powerpc/include/asm/perf_event.h @@ -21,3 +21,15 @@ #ifdef CONFIG_FSL_EMB_PERF_EVENT #include #endif + +#ifdef CONFIG_PERF_EVENTS +#include +#include + +#define perf_arch_fetch_caller_regs(regs, __ip) \ + do { \ + (regs)->nip = __ip; \ + (regs)->gpr[1] = *(unsigned long *)__get_SP(); \ + asm volatile("mfmsr %0" : "=r" ((regs)->msr)); \ + } while (0) +#endif diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 22e507c..2d29752 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -127,29 +127,3 @@ _GLOBAL(__setup_cpu_power7) _GLOBAL(__restore_cpu_power7) /* place holder */ blr - -/* - * Get a minimal set of registers for our caller's nth caller. - * r3 = regs pointer, r5 = n. - * - * We only get R1 (stack pointer), NIP (next instruction pointer) - * and LR (link register). These are all we can get in the - * general case without doing complicated stack unwinding, but - * fortunately they are enough to do a stack backtrace, which - * is all we need them for.
- */ -_GLOBAL(perf_arch_fetch_caller_regs) - mr r6,r1 - cmpwi r5,0 - mflr r4 - ble 2f - mtctr r5 -1: PPC_LL r6,0(r6) - bdnz 1b - PPC_LL r4,PPC_LR_STKOFF(r6) -2: PPC_LL r7,0(r6) - PPC_LL r7,PPC_LR_STKOFF(r7) - PPC_STL r6,GPR1-STACK_FRAME_OVERHEAD(r3) - PPC_STL r4,_NIP-STACK_FRAME_OVERHEAD(r3) - PPC_STL r7,_LINK-STACK_FRAME_OVERHEAD(r3) - blr diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h index 7e26698..74c4e0c 100644 --- a/arch/sparc/include/asm/perf_event.h +++ b/arch/sparc/include/asm/perf_event.h @@ -6,7 +6,15 @@ extern void set_perf_event_pending(void); #define PERF_EVENT_INDEX_OFFSET 0 #ifdef CONFIG_PERF_EVENTS +#include + extern void init_hw_perf_events(void); + +extern void +__perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip); + +#define perf_arch_fetch_caller_regs(pt_regs, ip) \ + __perf_arch_fetch_caller_regs(pt_regs, ip, 1); #else static inline void init_hw_perf_events(void) { } #endif diff --git a/arch/sparc/kernel/helpers.S b/arch/sparc/kernel/helpers.S index 92090cc..682fee0 100644 --- a/arch/sparc/kernel/helpers.S +++ b/arch/sparc/kernel/helpers.S @@ -47,9 +47,9 @@ stack_trace_flush: .size stack_trace_flush,.-stack_trace_flush #ifdef CONFIG_PERF_EVENTS - .globl perf_arch_fetch_caller_regs - .type perf_arch_fetch_caller_regs,#function -perf_arch_fetch_caller_regs: + .globl __perf_arch_fetch_caller_regs + .type __perf_arch_fetch_caller_regs,#function +__perf_arch_fetch_caller_regs: /* We always read the %pstate into %o5 since we will use * that to construct a fake %tstate to store into the regs. */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 254883d..02de298 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -140,6 +140,19 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs); extern unsigned long perf_misc_flags(struct pt_regs *regs); #define perf_misc_flags(regs) perf_misc_flags(regs) +#include + +/* + * We abuse bit 3 from flags to pass exact information, see perf_misc_flags + * and the comment with PERF_EFLAGS_EXACT. 
+ */ +#define perf_arch_fetch_caller_regs(regs, __ip) { \ + (regs)->ip = (__ip); \ + (regs)->bp = caller_frame_pointer(); \ + (regs)->cs = __KERNEL_CS; \ + regs->flags = 0; \ +} + #else static inline void init_hw_perf_events(void) { } static inline void perf_events_lapic_init(void) { } diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index a957463..2b16a2a 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -78,17 +78,14 @@ struct stack_frame_ia32 { u32 return_address; }; -static inline unsigned long rewind_frame_pointer(int n) +static inline unsigned long caller_frame_pointer(void) { struct stack_frame *frame; get_bp(frame); #ifdef CONFIG_FRAME_POINTER - while (n--) { - if (probe_kernel_address(&frame->next_frame, frame)) - break; - } + frame = frame->next_frame; #endif return (unsigned long)frame; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9632fb6..2c075fe 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1706,22 +1706,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) -{ - regs->ip = ip; - /* - * perf_arch_fetch_caller_regs adds another call, we need to increment - * the skip level - */ - regs->bp = rewind_frame_pointer(skip + 1); - regs->cs = __KERNEL_CS; - /* - * We abuse bit 3 to pass exact information, see perf_misc_flags - * and the comment with PERF_EFLAGS_EXACT. - */ - regs->flags = 0; -} - unsigned long perf_instruction_pointer(struct pt_regs *regs) { unsigned long ip; -- cgit v1.1 From 8d2cacbbb8deadfae78aa16e4e1ee619bdd7019e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 May 2010 17:49:05 +0200 Subject: perf: Cleanup {start,commit,cancel}_txn details Clarify some of the transactional group scheduling API details and change it so that a successful ->commit_txn also closes the transaction.
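The clarified caller contract, as the perf-core group scheduling path would exercise it; a sketch, not a quote, with event_sched_in() standing in for the real per-event plumbing:

	#include <linux/perf_event.h>
	#include <linux/list.h>
	#include <linux/errno.h>

	static int event_sched_in(struct perf_event *event);	/* elided */

	static int group_sched_in_sketch(struct perf_event *group_event)
	{
		const struct pmu *pmu = group_event->pmu;
		struct perf_event *event;

		pmu->start_txn(pmu);

		if (event_sched_in(group_event))
			goto fail;

		list_for_each_entry(event, &group_event->sibling_list, group_entry) {
			if (event_sched_in(event))
				goto fail;
		}

		if (!pmu->commit_txn(pmu))
			return 0;	/* success also closed the transaction */
	fail:
		pmu->cancel_txn(pmu);	/* roll back the still-open transaction */
		return -EAGAIN;
	}

After this patch a successful ->commit_txn clears the transaction flag itself (see the group_flag updates in the diffs that follow), so the success path needs no matching cancel.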
Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: <1274803086.5882.1752.camel@twins> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_event.c | 7 ++++--- arch/sparc/kernel/perf_event.c | 7 ++++--- arch/x86/kernel/cpu/perf_event.c | 14 +++++--------- 3 files changed, 13 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index 43b83c3..ac2a8c2 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -754,7 +754,7 @@ static int power_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuhw->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuhw->group_flag & PERF_EVENT_TXN) goto nocheck; if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1)) @@ -858,7 +858,7 @@ void power_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag |= PERF_EVENT_TXN_STARTED; + cpuhw->group_flag |= PERF_EVENT_TXN; cpuhw->n_txn_start = cpuhw->n_events; } @@ -871,7 +871,7 @@ void power_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuhw->group_flag &= ~PERF_EVENT_TXN; } /* @@ -897,6 +897,7 @@ int power_pmu_commit_txn(const struct pmu *pmu) for (i = cpuhw->n_txn_start; i < n; ++i) cpuhw->event[i]->hw.config = cpuhw->events[i]; + cpuhw->group_flag &= ~PERF_EVENT_TXN; return 0; } diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 0ec92c8..beeb92f 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1005,7 +1005,7 @@ static int sparc_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) goto nocheck; if (check_excludes(cpuc->event, n0, 1)) @@ -1102,7 +1102,7 @@ static void sparc_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag |= PERF_EVENT_TXN_STARTED; + cpuhw->group_flag |= PERF_EVENT_TXN; } /* @@ -1114,7 +1114,7 @@ static void sparc_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuhw->group_flag &= ~PERF_EVENT_TXN; } /* @@ -1137,6 +1137,7 @@ static int sparc_pmu_commit_txn(const struct pmu *pmu) if (sparc_check_constraints(cpuc->event, cpuc->events, n)) return -EAGAIN; + cpuc->group_flag &= ~PERF_EVENT_TXN; return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5db5b7d..af04c6fa 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -969,7 +969,7 @@ static int x86_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) goto out; ret = x86_pmu.schedule_events(cpuc, n, assign); @@ -1096,7 +1096,7 @@ static void x86_pmu_disable(struct perf_event *event) * The events never got scheduled and ->cancel_txn will truncate * the event_list. 
*/ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) return; x86_pmu_stop(event); @@ -1388,7 +1388,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - cpuc->group_flag |= PERF_EVENT_TXN_STARTED; + cpuc->group_flag |= PERF_EVENT_TXN; cpuc->n_txn = 0; } @@ -1401,7 +1401,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuc->group_flag &= ~PERF_EVENT_TXN; /* * Truncate the collected events. */ @@ -1435,11 +1435,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) */ memcpy(cpuc->assign, assign, n*sizeof(int)); - /* - * Clear out the txn count so that ->cancel_txn() which gets - * run after ->commit_txn() doesn't undo things. - */ - cpuc->n_txn = 0; + cpuc->group_flag &= ~PERF_EVENT_TXN; return 0; } -- cgit v1.1 From 68aa00ac0a82e9a876c799bf6be7622b8f1c8517 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 3 Jun 2010 01:23:04 +0400 Subject: perf, x86: Make a second write to performance counter if needed On the Netburst PMU we need a second write to a performance counter due to a cpu erratum. A simple flag test instead of alternative instructions was chosen because wrmsrl is already a macro and, if virtualization is turned on, it would need an additional wrapper call, which is more expensive. nb: we should probably switch to jump-labels once that facility reaches the mainline. Signed-off-by: Cyrill Gorcunov Signed-off-by: Peter Zijlstra Cc: Robert Richter Cc: Lin Ming Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20100602212304.GC5264@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 12 +++++++++++- arch/x86/kernel/cpu/perf_event_p4.c | 9 +++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index af04c6fa..79e1998 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -220,6 +220,7 @@ struct x86_pmu { struct perf_event *event); struct event_constraint *event_constraints; void (*quirks)(void); + int perfctr_second_write; int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); @@ -925,8 +926,17 @@ x86_perf_event_set_period(struct perf_event *event) */ atomic64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base + idx, + wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); + + /* + * Due to erratum on certan cpu we need + * a second write to be sure the register + * is updated properly + */ + if (x86_pmu.perfctr_second_write) { + wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); + } perf_event_update_userpage(event); diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ae85d69..9286e73 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -829,6 +829,15 @@ static __initconst const struct x86_pmu p4_pmu = { .max_period = (1ULL << 39) - 1, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + .perfctr_second_write = 1, }; static __init int p4_pmu_init(void) -- cgit v1.1 From
1996bda2a42480c275656233e631ee0966574be4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 14:05:13 +0200 Subject: arch: Implement local64_t On 64bit, local_t is of size long, and thus we make local64_t an alias. On 32bit, we fall back to atomic64_t. (architectures can provide an optimized 32-bit version) (This new facility is to be used by perf events optimizations.) Signed-off-by: Peter Zijlstra Cc: linux-arch@vger.kernel.org Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/local64.h | 1 + arch/arm/include/asm/local64.h | 1 + arch/avr32/include/asm/local64.h | 1 + arch/blackfin/include/asm/local64.h | 1 + arch/cris/include/asm/local64.h | 1 + arch/frv/include/asm/local64.h | 1 + arch/frv/kernel/local64.h | 1 + arch/h8300/include/asm/local64.h | 1 + arch/ia64/include/asm/local64.h | 1 + arch/m32r/include/asm/local64.h | 1 + arch/m68k/include/asm/local64.h | 1 + arch/microblaze/include/asm/local64.h | 1 + arch/mips/include/asm/local64.h | 1 + arch/mn10300/include/asm/local64.h | 1 + arch/parisc/include/asm/local64.h | 1 + arch/powerpc/include/asm/local64.h | 1 + arch/s390/include/asm/local64.h | 1 + arch/score/include/asm/local64.h | 1 + arch/sh/include/asm/local64.h | 1 + arch/sparc/include/asm/local64.h | 1 + arch/x86/include/asm/local64.h | 1 + arch/xtensa/include/asm/local64.h | 1 + 22 files changed, 22 insertions(+) create mode 100644 arch/alpha/include/asm/local64.h create mode 100644 arch/arm/include/asm/local64.h create mode 100644 arch/avr32/include/asm/local64.h create mode 100644 arch/blackfin/include/asm/local64.h create mode 100644 arch/cris/include/asm/local64.h create mode 100644 arch/frv/include/asm/local64.h create mode 100644 arch/frv/kernel/local64.h create mode 100644 arch/h8300/include/asm/local64.h create mode 100644 arch/ia64/include/asm/local64.h create mode 100644 arch/m32r/include/asm/local64.h create mode 100644 arch/m68k/include/asm/local64.h create mode 100644 arch/microblaze/include/asm/local64.h create mode 100644 arch/mips/include/asm/local64.h create mode 100644 arch/mn10300/include/asm/local64.h create mode 100644 arch/parisc/include/asm/local64.h create mode 100644 arch/powerpc/include/asm/local64.h create mode 100644 arch/s390/include/asm/local64.h create mode 100644 arch/score/include/asm/local64.h create mode 100644 arch/sh/include/asm/local64.h create mode 100644 arch/sparc/include/asm/local64.h create mode 100644 arch/x86/include/asm/local64.h create mode 100644 arch/xtensa/include/asm/local64.h (limited to 'arch') diff --git a/arch/alpha/include/asm/local64.h b/arch/alpha/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/alpha/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/arm/include/asm/local64.h b/arch/arm/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/arm/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/avr32/include/asm/local64.h b/arch/avr32/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/avr32/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/blackfin/include/asm/local64.h b/arch/blackfin/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/blackfin/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/cris/include/asm/local64.h b/arch/cris/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++
b/arch/cris/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/frv/include/asm/local64.h b/arch/frv/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/frv/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/frv/kernel/local64.h b/arch/frv/kernel/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/frv/kernel/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/h8300/include/asm/local64.h b/arch/h8300/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/h8300/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/ia64/include/asm/local64.h b/arch/ia64/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/ia64/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/m32r/include/asm/local64.h b/arch/m32r/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/m32r/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/m68k/include/asm/local64.h b/arch/m68k/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/m68k/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/microblaze/include/asm/local64.h b/arch/microblaze/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/microblaze/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/mips/include/asm/local64.h b/arch/mips/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/mips/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/mn10300/include/asm/local64.h b/arch/mn10300/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/mn10300/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/parisc/include/asm/local64.h b/arch/parisc/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/parisc/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/powerpc/include/asm/local64.h b/arch/powerpc/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/powerpc/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/s390/include/asm/local64.h b/arch/s390/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/s390/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/score/include/asm/local64.h b/arch/score/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/score/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/sh/include/asm/local64.h b/arch/sh/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/sh/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/sparc/include/asm/local64.h b/arch/sparc/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/sparc/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/x86/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> diff --git a/arch/xtensa/include/asm/local64.h b/arch/xtensa/include/asm/local64.h new file mode 100644 index 0000000..36c93b5 --- /dev/null +++ b/arch/xtensa/include/asm/local64.h @@ -0,0 +1 @@ +#include <asm-generic/local64.h> -- cgit v1.1 From e78505958cf123048fb48cb56b79cebb8edd15fb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra Date: Fri, 21 May 2010 14:43:08 +0200 Subject: perf: Convert perf_event to local_t Since all modifications to event->count (and ->prev_count and ->period_left) are now local to a cpu, change them to local64_t so we avoid the LOCK'ed ops. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/arm/kernel/perf_event.c | 18 +++++++++--------- arch/powerpc/kernel/perf_event.c | 34 +++++++++++++++++----------------- arch/sh/kernel/perf_event.c | 6 +++--- arch/sparc/kernel/perf_event.c | 18 +++++++++--------- arch/x86/kernel/cpu/perf_event.c | 18 +++++++++--------- 5 files changed, 47 insertions(+), 47 deletions(-) (limited to 'arch') diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index c457686..5b7cfaf 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -164,20 +164,20 @@ armpmu_event_set_period(struct perf_event *event, struct hw_perf_event *hwc, int idx) { - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0; if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } @@ -185,7 +185,7 @@ armpmu_event_set_period(struct perf_event *event, if (left > (s64)armpmu->max_period) left = armpmu->max_period; - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); armpmu->write_counter(idx, (u64)(-left) & 0xffffffff); @@ -204,18 +204,18 @@ armpmu_event_update(struct perf_event *event, s64 delta; again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = armpmu->read_counter(idx); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -478,7 +478,7 @@ __hw_perf_event_init(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = armpmu->max_period; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } err = 0; diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index ac2a8c2..af1d9a7 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -410,15 +410,15 @@ static void power_pmu_read(struct perf_event *event) * Therefore we treat them like NMIs.
*/ do { - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); barrier(); val = read_pmc(event->hw.idx); - } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev); + } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev); /* The counters are only 32 bits wide */ delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &event->hw.period_left); + local64_add(delta, &event->count); + local64_sub(delta, &event->hw.period_left); } /* @@ -444,10 +444,10 @@ static void freeze_limited_counters(struct cpu_hw_events *cpuhw, if (!event->hw.idx) continue; val = (event->hw.idx == 5) ? pmc5 : pmc6; - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); event->hw.idx = 0; delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } } @@ -462,7 +462,7 @@ static void thaw_limited_counters(struct cpu_hw_events *cpuhw, event = cpuhw->limited_counter[i]; event->hw.idx = cpuhw->limited_hwidx[i]; val = (event->hw.idx == 5) ? pmc5 : pmc6; - atomic64_set(&event->hw.prev_count, val); + local64_set(&event->hw.prev_count, val); perf_event_update_userpage(event); } } @@ -666,11 +666,11 @@ void hw_perf_enable(void) } val = 0; if (event->hw.sample_period) { - left = atomic64_read(&event->hw.period_left); + left = local64_read(&event->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; } - atomic64_set(&event->hw.prev_count, val); + local64_set(&event->hw.prev_count, val); event->hw.idx = idx; write_pmc(idx, val); perf_event_update_userpage(event); @@ -842,8 +842,8 @@ static void power_pmu_unthrottle(struct perf_event *event) if (left < 0x80000000L) val = 0x80000000L - left; write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); perf_event_update_userpage(event); perf_enable(); local_irq_restore(flags); @@ -1109,7 +1109,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) event->hw.config = events[n]; event->hw.event_base = cflags[n]; event->hw.last_period = event->hw.sample_period; - atomic64_set(&event->hw.period_left, event->hw.last_period); + local64_set(&event->hw.period_left, event->hw.last_period); /* * See if we need to reserve the PMU. @@ -1147,16 +1147,16 @@ static void record_and_restart(struct perf_event *event, unsigned long val, int record = 0; /* we don't have to worry about interrupts here */ - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); /* * See if the total period for this event has expired, * and update for the next period. 
*/ val = 0; - left = atomic64_read(&event->hw.period_left) - delta; + left = local64_read(&event->hw.period_left) - delta; if (period) { if (left <= 0) { left += period; @@ -1194,8 +1194,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val, } write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); perf_event_update_userpage(event); } diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 81b6de4..7a3dc35 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -185,10 +185,10 @@ static void sh_perf_event_update(struct perf_event *event, * this is the simplest approach for maintaining consistency. */ again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = sh_pmu->read(idx); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; @@ -203,7 +203,7 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } static void sh_pmu_disable(struct perf_event *event) diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index beeb92f..8a6660d 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -572,18 +572,18 @@ static u64 sparc_perf_event_update(struct perf_event *event, s64 delta; again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = read_pmc(idx); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -591,27 +591,27 @@ again: static int sparc_perf_event_set_period(struct perf_event *event, struct hw_perf_event *hwc, int idx) { - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0; if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (left > MAX_PERIOD) left = MAX_PERIOD; - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); write_pmc(idx, (u64)(-left) & 0xffffffff); @@ -1087,7 +1087,7 @@ static int __hw_perf_event_init(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = MAX_PERIOD; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } return 0; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 79e1998..2d0d290 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -296,10 +296,10 @@ x86_perf_event_update(struct perf_event *event) * count to 
the generic event atomically: */ again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); rdmsrl(hwc->event_base + idx, new_raw_count); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; @@ -314,8 +314,8 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -439,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } else { /* * If we have a PMU initialized but no APIC @@ -886,7 +886,7 @@ static int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; @@ -898,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event) */ if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } @@ -924,7 +924,7 @@ x86_perf_event_set_period(struct perf_event *event) * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); -- cgit v1.1 From 147ec4d2361e355ab32499f739cc24845ceb89da Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Jun 2010 21:32:39 +0200 Subject: x86: Make save_stack_address() !CONFIG_FRAME_POINTER friendly If CONFIG_FRAME_POINTER=n, print_context_stack() shouldn't neglect the non-reliable addresses on the stack; they are all we have if dump_trace(bp) is called with a wrong or zero bp. For example, /proc/pid/stack doesn't work if CONFIG_FRAME_POINTER=n. This patch obviously has no effect if CONFIG_FRAME_POINTER=y, otherwise it reverts 1650743c "x86: don't save unreliable stack trace entries". Also, remove the unnecessary type-cast.
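For background: when dump_trace() is handed a zero or bogus bp, print_context_stack() has no frame chain to follow, so every text address it finds on the raw stack is reported to the callback as unreliable. A simplified sketch of that walk (an illustrative reduction, not the actual kernel function):

static void scan_stack_sketch(unsigned long *stack, unsigned long *end,
			      const struct stacktrace_ops *ops, void *data)
{
	while (stack < end) {
		unsigned long addr = *stack++;

		/* without a valid frame chain, nothing is ever reliable */
		if (__kernel_text_address(addr))
			ops->address(data, addr, 0);
	}
}

With the old filter in place, save_stack_address() then dropped every one of those entries, leaving the trace empty.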
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Arjan van de Ven Cc: Vegard Nossum Cc: Ingo Molnar Cc: Andrew Morton LKML-Reference: <20100603193239.GA31530@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/stacktrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index ea54d02..abc321d 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -26,8 +26,10 @@ static int save_stack_stack(void *data, char *name) static void save_stack_address(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = data; +#ifdef CONFIG_FRAME_POINTER if (!reliable) return; +#endif if (trace->skip > 0) { trace->skip--; return; @@ -39,9 +41,11 @@ static void save_stack_address(void *data, unsigned long addr, int reliable) static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { - struct stack_trace *trace = (struct stack_trace *)data; + struct stack_trace *trace = data; +#ifdef CONFIG_FRAME_POINTER if (!reliable) return; +#endif if (in_sched_functions(addr)) return; if (trace->skip > 0) { -- cgit v1.1 From 018378c55b03f88ff513aba4e0e93b8d4a9cf241 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Jun 2010 21:32:43 +0200 Subject: x86: Unify save_stack_address() and save_stack_address_nosched() Cleanup. Factor the common code in save_stack_address() and save_stack_address_nosched(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Arjan van de Ven Cc: Vegard Nossum Cc: Ingo Molnar LKML-Reference: <20100603193243.GA31534@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/stacktrace.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index abc321d..b53c525 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -23,13 +23,16 @@ static int save_stack_stack(void *data, char *name) return 0; } -static void save_stack_address(void *data, unsigned long addr, int reliable) +static void +__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) { struct stack_trace *trace = data; #ifdef CONFIG_FRAME_POINTER if (!reliable) return; #endif + if (nosched && in_sched_functions(addr)) + return; if (trace->skip > 0) { trace->skip--; return; @@ -38,22 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable) trace->entries[trace->nr_entries++] = addr; } +static void save_stack_address(void *data, unsigned long addr, int reliable) +{ + return __save_stack_address(data, addr, reliable, false); +} + static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { - struct stack_trace *trace = data; -#ifdef CONFIG_FRAME_POINTER - if (!reliable) - return; -#endif - if (in_sched_functions(addr)) - return; - if (trace->skip > 0) { - trace->skip--; - return; - } - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = addr; + return __save_stack_address(data, addr, reliable, true); } static const struct stacktrace_ops save_stack_ops = { -- cgit v1.1 From c882e0feb937af4e5b991cbd1c81536f37053e86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robert=20Sch=C3=B6ne?= Date: Mon, 14 Jun 2010 13:37:20 +0200 Subject: x86, perf: Add power_end event to process_*.c cpu_idle routine Systems using the idle thread from process_32.c and process_64.c do not generate power_end events which 
could be traced using perf. This patch adds the event generation for such systems. Signed-off-by: Robert Schoene Acked-by: Arjan van de Ven Cc: Peter Zijlstra LKML-Reference: <1276515440.5441.45.camel@localhost> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 4 ++++ arch/x86/kernel/process_64.c | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8d12878..96586c3 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -57,6 +57,8 @@ #include #include +#include <trace/events/power.h> + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); /* @@ -111,6 +113,8 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); + + trace_power_end(smp_processor_id()); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3c2422a..3d9ea53 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -51,6 +51,8 @@ #include #include +#include <trace/events/power.h> + asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); @@ -138,6 +140,9 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); + + trace_power_end(smp_processor_id()); + /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ -- cgit v1.1 From 0c4519e825c9e2b6a8310deff8582f8c35bfbba9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Jun 2010 21:21:27 +0200 Subject: x86: Set resume bit before returning from breakpoint exception Instruction breakpoints trigger before the instruction executes, and returning from the breakpoint handler brings us back to the instruction that triggered the breakpoint. This naturally leads to breakpoint recursion. To solve this, x86 has the Resume Flag trick. When the cpu flags have the RF flag set, the next instruction won't trigger any instruction breakpoint, and once this instruction is executed, RF is cleared again. This lets us jump back to the instruction that triggered the breakpoint without recursion. Use this when an instruction breakpoint triggers. Signed-off-by: Frederic Weisbecker Cc: Will Deacon Cc: Prasad Cc: Mahesh Salgaonkar Cc: Paul Mackerras Cc: Ingo Molnar Cc: Jason Wessel --- arch/x86/kernel/hw_breakpoint.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index a8f1b80..eaa6ae2 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -466,6 +466,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) perf_bp_event(bp, args->regs); + /* + * Set up resume flag to avoid breakpoint recursion when + * returning back to origin. + */ + if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE) + args->regs->flags |= X86_EFLAGS_RF; + rcu_read_unlock(); } /* -- cgit v1.1 From f7809daf64bf119fef70af172db6a0636fa51f92 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Jun 2010 10:00:24 +0200 Subject: x86: Support for instruction breakpoints Instruction breakpoints need a specific length field of 0 to work. Bring this support, but also take care that the user is not trying to set an unsupported length, such as a range breakpoint.
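For the in-kernel API this means an execute breakpoint is requested with bp_len set to sizeof(long). A minimal sketch, modeled on samples/hw_breakpoint/data_breakpoint.c (my_insn_handler stands in for the caller's overflow handler):

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>

static struct perf_event * __percpu *insn_hbp;

static int register_insn_bp_sketch(unsigned long addr,
				   perf_overflow_handler_t my_insn_handler)
{
	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = addr;
	attr.bp_len  = sizeof(long);	/* mandatory for HW_BREAKPOINT_X */
	attr.bp_type = HW_BREAKPOINT_X;

	insn_hbp = register_wide_hw_breakpoint(&attr, my_insn_handler);
	if (IS_ERR((void __force *)insn_hbp))
		return PTR_ERR((void __force *)insn_hbp);
	return 0;
}

Any other bp_len, say HW_BREAKPOINT_LEN_2, falls through to the default case of the type switch in arch_build_bp_info() below and is rejected with -EINVAL.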
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Prasad Cc: Mahesh Salgaonkar Cc: Will Deacon Cc: Jason Wessel --- arch/x86/include/asm/hw_breakpoint.h | 2 +- arch/x86/kernel/hw_breakpoint.c | 44 ++++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 9422553..528a11e 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -20,10 +20,10 @@ struct arch_hw_breakpoint { #include /* Available HW breakpoint length encodings */ +#define X86_BREAKPOINT_LEN_X 0x00 #define X86_BREAKPOINT_LEN_1 0x40 #define X86_BREAKPOINT_LEN_2 0x44 #define X86_BREAKPOINT_LEN_4 0x4c -#define X86_BREAKPOINT_LEN_EXECUTE 0x40 #ifdef CONFIG_X86_64 #define X86_BREAKPOINT_LEN_8 0x48 diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index eaa6ae2..a474ec3 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -208,6 +208,9 @@ int arch_bp_generic_fields(int x86_len, int x86_type, { /* Len */ switch (x86_len) { + case X86_BREAKPOINT_LEN_X: + *gen_len = sizeof(long); + break; case X86_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; @@ -251,6 +254,29 @@ static int arch_build_bp_info(struct perf_event *bp) info->address = bp->attr.bp_addr; + /* Type */ + switch (bp->attr.bp_type) { + case HW_BREAKPOINT_W: + info->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + info->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + info->type = X86_BREAKPOINT_EXECUTE; + /* + * x86 inst breakpoints need to have a specific undefined len. + * But we still need to check userspace is not trying to setup + * an unsupported length, to get a range breakpoint for example. + */ + if (bp->attr.bp_len == sizeof(long)) { + info->len = X86_BREAKPOINT_LEN_X; + return 0; + } + default: + return -EINVAL; + } + /* Len */ switch (bp->attr.bp_len) { case HW_BREAKPOINT_LEN_1: @@ -271,21 +297,6 @@ static int arch_build_bp_info(struct perf_event *bp) return -EINVAL; } - /* Type */ - switch (bp->attr.bp_type) { - case HW_BREAKPOINT_W: - info->type = X86_BREAKPOINT_WRITE; - break; - case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - info->type = X86_BREAKPOINT_RW; - break; - case HW_BREAKPOINT_X: - info->type = X86_BREAKPOINT_EXECUTE; - break; - default: - return -EINVAL; - } - return 0; } /* @@ -305,6 +316,9 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) ret = -EINVAL; switch (info->len) { + case X86_BREAKPOINT_LEN_X: + align = sizeof(long) -1; + break; case X86_BREAKPOINT_LEN_1: align = 0; break; -- cgit v1.1 From 567a9fd86735ccdc897768ed2dacdd5e83a13509 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 29 Jun 2010 14:53:50 +0900 Subject: kprobes/x86: Fix kprobes to skip prefixes correctly Fix resume_execution() and is_IF_modifier() to skip x86 instruction prefixes correctly by using the x86 instruction attributes. Without this fix, resume_execution() can't handle instructions which have non-REX prefixes (REX prefixes are skipped). This will cause an unexpected kernel panic by hitting a bad address when a kprobe hits a two-byte ret (e.g. "repz ret" generated for Athlon/K8 optimization), because it just checks "repz" and can't recognize the "ret" instruction. These prefixes can be found easily with the x86 instruction attributes.
This patch introduces skip_prefixes() and uses it in resume_execution() and is_IF_modifier() to skip prefixes. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli LKML-Reference: <4C298A6E.8070609@hitachi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 345a4b1..175f85c 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to) } /* - * Check for the REX prefix which can only exist on X86_64 - * X86_32 always returns 0 + * Skip the prefixes of the instruction. */ -static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) +static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + while (inat_is_legacy_prefix(attr)) { + insn++; + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + } #ifdef CONFIG_X86_64 - if ((*insn & 0xf0) == 0x40) - return 1; + if (inat_is_rex_prefix(attr)) + insn++; #endif - return 0; + return insn; } /* @@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr) */ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) { + /* Skip prefixes */ + insn = skip_prefixes(insn); + switch (*insn) { case 0xfa: /* cli */ case 0xfb: /* sti */ @@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) return 1; } - /* - * on X86_64, 0x40-0x4f are REX prefixes so we need to look - * at the next byte instead.. but of course not recurse infinitely - */ - if (is_REX_prefix(insn)) - return is_IF_modifier(++insn); - return 0; } @@ -803,9 +805,8 @@ static void __kprobes resume_execution(struct kprobe *p, unsigned long orig_ip = (unsigned long)p->addr; kprobe_opcode_t *insn = p->ainsn.insn; - /*skip the REX prefix*/ - if (is_REX_prefix(insn)) - insn++; + /* Skip prefixes */ + insn = skip_prefixes(insn); regs->flags &= ~X86_EFLAGS_TF; switch (*insn) { -- cgit v1.1 From 39ef13a4ac28aa64cfe1bc36e6e00f1096707a28 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 5 Jul 2010 10:09:29 +0800 Subject: perf, x86: P4 PMU -- redesign cache events To support cache events we have reserved the low 6 bits in hw_perf_event::config (which is a part of the CCCR register configuration, actually). These bits represent the Replay Event metric enumerated in enum P4_PEBS_METRIC. The caller should not care about which exact bits should be set and how -- the caller just chooses one P4_PEBS_METRIC entity and puts it into the config. The kernel will track it and set the appropriate additional MSR registers (metrics) when needed. The reason for this redesign was the PEBS enable bit, which should not be set until DS (and PEBS sampling) support is implemented properly. TODO ==== - PEBS sampling (note it's tricky and works with _one_ counter only, so for HT machines it will not be that easy to handle both threads) - tracking of PEBS registers state; a user might need to turn PEBS off completely (ie no PEBS enable, no UOP_tag) but some other event may need it, and such events clash and should not run simultaneously, so at the moment we just don't support such events - eventually export the user-space bits in a separate header, which will allow user apps to configure raw events more conveniently.
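Concretely, a raw Replay Event counting L1 load misses is now composed roughly like this (it is what the P4_GEN_CACHE_EVENT() macro in the diff below expands to for the L1D read-miss slot, spelled out here only to illustrate the packing):

u64 config =
	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_REPLAY_EVENT) |
			    P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS)) |
	p4_config_pack_cccr(P4_PEBS_METRIC__1stl_cache_load_miss_retired |
			    P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(P4_EVENT_REPLAY_EVENT))));

At enable time p4_config_unpack_metric() recovers the low bits, and p4_pmu_enable_pebs() writes the bound values into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT.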
Signed-off-by: Cyrill Gorcunov Signed-off-by: Lin Ming Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Frederic Weisbecker LKML-Reference: <1278295769.9540.15.camel@minggr.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event_p4.h | 99 ++++++++++++----------- arch/x86/kernel/cpu/perf_event_p4.c | 147 ++++++++++++++++++++++++++--------- 2 files changed, 163 insertions(+), 83 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 64a8ebf..def5007 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -19,7 +19,6 @@ #define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */ #define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) #define ARCH_P4_MAX_CCCR (18) -#define ARCH_P4_MAX_COUNTER (ARCH_P4_MAX_CCCR / 2) #define P4_ESCR_EVENT_MASK 0x7e000000U #define P4_ESCR_EVENT_SHIFT 25 @@ -71,10 +70,6 @@ #define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) #define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) -/* Custom bits in reerved CCCR area */ -#define P4_CCCR_CACHE_OPS_MASK 0x0000003fU - - /* Non HT mask */ #define P4_CCCR_MASK \ (P4_CCCR_OVF | \ @@ -106,8 +101,7 @@ * ESCR and CCCR but rather an only packed value should * be unpacked and written to a proper addresses * - * the base idea is to pack as much info as - * possible + * the base idea is to pack as much info as possible */ #define p4_config_pack_escr(v) (((u64)(v)) << 32) #define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL) @@ -130,8 +124,6 @@ t; \ }) -#define p4_config_unpack_cache_event(v) (((u64)(v)) & P4_CCCR_CACHE_OPS_MASK) - #define P4_CONFIG_HT_SHIFT 63 #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) @@ -214,6 +206,12 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr) return escr; } +/* + * This are the events which should be used in "Event Select" + * field of ESCR register, they are like unique keys which allow + * the kernel to determinate which CCCR and COUNTER should be + * used to track an event + */ enum P4_EVENTS { P4_EVENT_TC_DELIVER_MODE, P4_EVENT_BPU_FETCH_REQUEST, @@ -561,7 +559,7 @@ enum P4_EVENT_OPCODES { * a caller should use P4_ESCR_EMASK_NAME helper to * pick the EventMask needed, for example * - * P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD) + * P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) */ enum P4_ESCR_EMASKS { P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0), @@ -753,43 +751,50 @@ enum P4_ESCR_EMASKS { P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1), }; -/* P4 PEBS: stale for a while */ -#define P4_PEBS_METRIC_MASK 0x00001fffU -#define P4_PEBS_UOB_TAG 0x01000000U -#define P4_PEBS_ENABLE 0x02000000U - -/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */ -#define P4_PEBS__1stl_cache_load_miss_retired 0x3000001 -#define P4_PEBS__2ndl_cache_load_miss_retired 0x3000002 -#define P4_PEBS__dtlb_load_miss_retired 0x3000004 -#define P4_PEBS__dtlb_store_miss_retired 0x3000004 -#define P4_PEBS__dtlb_all_miss_retired 0x3000004 -#define P4_PEBS__tagged_mispred_branch 0x3018000 -#define P4_PEBS__mob_load_replay_retired 0x3000200 -#define P4_PEBS__split_load_retired 0x3000400 -#define P4_PEBS__split_store_retired 0x3000400 - -#define P4_VERT__1stl_cache_load_miss_retired 0x0000001 -#define P4_VERT__2ndl_cache_load_miss_retired 0x0000001 -#define P4_VERT__dtlb_load_miss_retired 0x0000001 -#define P4_VERT__dtlb_store_miss_retired 0x0000002 -#define 
P4_VERT__dtlb_all_miss_retired 0x0000003 -#define P4_VERT__tagged_mispred_branch 0x0000010 -#define P4_VERT__mob_load_replay_retired 0x0000001 -#define P4_VERT__split_load_retired 0x0000001 -#define P4_VERT__split_store_retired 0x0000002 - -enum P4_CACHE_EVENTS { - P4_CACHE__NONE, - - P4_CACHE__1stl_cache_load_miss_retired, - P4_CACHE__2ndl_cache_load_miss_retired, - P4_CACHE__dtlb_load_miss_retired, - P4_CACHE__dtlb_store_miss_retired, - P4_CACHE__itlb_reference_hit, - P4_CACHE__itlb_reference_miss, - - P4_CACHE__MAX +/* + * P4 PEBS specifics (Replay Event only) + * + * Format (bits): + * 0-6: metric from P4_PEBS_METRIC enum + * 7 : reserved + * 8 : reserved + * 9-11 : reserved + * + * Note we have UOP and PEBS bits reserved for now + * just in case if we will need them once + */ +#define P4_PEBS_CONFIG_ENABLE (1 << 7) +#define P4_PEBS_CONFIG_UOP_TAG (1 << 8) +#define P4_PEBS_CONFIG_METRIC_MASK 0x3f +#define P4_PEBS_CONFIG_MASK 0xff + +/* + * mem: Only counters MSR_IQ_COUNTER4 (16) and + * MSR_IQ_COUNTER5 (17) are allowed for PEBS sampling + */ +#define P4_PEBS_ENABLE 0x02000000U +#define P4_PEBS_ENABLE_UOP_TAG 0x01000000U + +#define p4_config_unpack_metric(v) (((u64)(v)) & P4_PEBS_CONFIG_METRIC_MASK) +#define p4_config_unpack_pebs(v) (((u64)(v)) & P4_PEBS_CONFIG_MASK) + +#define p4_config_pebs_has(v, mask) (p4_config_unpack_pebs(v) & (mask)) + +enum P4_PEBS_METRIC { + P4_PEBS_METRIC__none, + + P4_PEBS_METRIC__1stl_cache_load_miss_retired, + P4_PEBS_METRIC__2ndl_cache_load_miss_retired, + P4_PEBS_METRIC__dtlb_load_miss_retired, + P4_PEBS_METRIC__dtlb_store_miss_retired, + P4_PEBS_METRIC__dtlb_all_miss_retired, + P4_PEBS_METRIC__tagged_mispred_branch, + P4_PEBS_METRIC__mob_load_replay_retired, + P4_PEBS_METRIC__split_load_retired, + P4_PEBS_METRIC__split_store_retired, + + P4_PEBS_METRIC__max }; #endif /* PERF_EVENT_P4_H */ + diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 9286e73..107711b 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -21,22 +21,36 @@ struct p4_event_bind { char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ }; -struct p4_cache_event_bind { +struct p4_pebs_bind { unsigned int metric_pebs; unsigned int metric_vert; }; -#define P4_GEN_CACHE_EVENT_BIND(name) \ - [P4_CACHE__##name] = { \ - .metric_pebs = P4_PEBS__##name, \ - .metric_vert = P4_VERT__##name, \ +/* it sets P4_PEBS_ENABLE_UOP_TAG as well */ +#define P4_GEN_PEBS_BIND(name, pebs, vert) \ + [P4_PEBS_METRIC__##name] = { \ + .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ + .metric_vert = vert, \ } -static struct p4_cache_event_bind p4_cache_event_bind_map[] = { - P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired), - P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired), - P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired), - P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired), +/* + * note we have P4_PEBS_ENABLE_UOP_TAG always set here + * + * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of + * event configuration to find out which values are to be + * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT + * resgisters + */ +static struct p4_pebs_bind p4_pebs_bind_map[] = { + P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), + P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), + P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 
0x0000004, 0x0000003), + P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), + P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), + P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), + P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), }; /* @@ -281,10 +295,10 @@ static struct p4_event_bind p4_event_bind_map[] = { }, }; -#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \ +#define P4_GEN_CACHE_EVENT(event, bit, metric) \ p4_config_pack_escr(P4_ESCR_EVENT(event) | \ P4_ESCR_EMASK_BIT(event, bit)) | \ - p4_config_pack_cccr(cache_event | \ + p4_config_pack_cccr(metric | \ P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) static __initconst const u64 p4_hw_cache_event_ids @@ -296,34 +310,34 @@ static __initconst const u64 p4_hw_cache_event_ids [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__1stl_cache_load_miss_retired), + P4_PEBS_METRIC__1stl_cache_load_miss_retired), }, }, [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__2ndl_cache_load_miss_retired), + P4_PEBS_METRIC__2ndl_cache_load_miss_retired), }, }, [ C(DTLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__dtlb_load_miss_retired), + P4_PEBS_METRIC__dtlb_load_miss_retired), }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__dtlb_store_miss_retired), + P4_PEBS_METRIC__dtlb_store_miss_retired), }, }, [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, - P4_CACHE__itlb_reference_hit), + P4_PEBS_METRIC__none), [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, - P4_CACHE__itlb_reference_miss), + P4_PEBS_METRIC__none), }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, @@ -414,11 +428,37 @@ static u64 p4_pmu_event_map(int hw_event) return config; } +static int p4_validate_raw_event(struct perf_event *event) +{ + unsigned int v; + + /* user data may have out-of-bound event index */ + v = p4_config_unpack_event(event->attr.config); + if (v >= ARRAY_SIZE(p4_event_bind_map)) { + pr_warning("P4 PMU: Unknown event code: %d\n", v); + return -EINVAL; + } + + /* + * it may have some screwed PEBS bits + */ + if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { + pr_warning("P4 PMU: PEBS are not supported yet\n"); + return -EINVAL; + } + v = p4_config_unpack_metric(event->attr.config); + if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { + pr_warning("P4 PMU: Unknown metric code: %d\n", v); + return -EINVAL; + } + + return 0; +} + static int p4_hw_config(struct perf_event *event) { int cpu = get_cpu(); int rc = 0; - unsigned int evnt; u32 escr, cccr; /* @@ -438,12 +478,9 @@ static int p4_hw_config(struct perf_event *event) if (event->attr.type == PERF_TYPE_RAW) { - /* user data may have out-of-bound event index */ - evnt = p4_config_unpack_event(event->attr.config); - if (evnt >= ARRAY_SIZE(p4_event_bind_map)) { - rc = -EINVAL; + rc = p4_validate_raw_event(event); + if (rc) goto out; - } /* * We don't control raw events so it's up to the caller @@ -451,12 +488,15 @@ static int p4_hw_config(struct perf_event *event) * on HT machine but allow HT-compatible specifics to be * passed on) * + * Note that for RAW events we allow user to use P4_CCCR_RESERVED + * bits since we keep additional info 
here (for cache events and etc) + * * XXX: HT wide things should check perf_paranoid_cpu() && * CAP_SYS_ADMIN */ event->hw.config |= event->attr.config & (p4_config_pack_escr(P4_ESCR_MASK_HT) | - p4_config_pack_cccr(P4_CCCR_MASK_HT)); + p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); } rc = x86_setup_perfctr(event); @@ -482,6 +522,29 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) return overflow; } +static void p4_pmu_disable_pebs(void) +{ + /* + * FIXME + * + * It's still allowed that two threads setup same cache + * events so we can't simply clear metrics until we knew + * noone is depending on us, so we need kind of counter + * for "ReplayEvent" users. + * + * What is more complex -- RAW events, if user (for some + * reason) will pass some cache event metric with improper + * event opcode -- it's fine from hardware point of view + * but completely nonsence from "meaning" of such action. + * + * So at moment let leave metrics turned on forever -- it's + * ok for now but need to be revisited! + * + * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); + * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); + */ +} + static inline void p4_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -507,6 +570,26 @@ static void p4_pmu_disable_all(void) continue; p4_pmu_disable_event(event); } + + p4_pmu_disable_pebs(); +} + +/* configuration must be valid */ +static void p4_pmu_enable_pebs(u64 config) +{ + struct p4_pebs_bind *bind; + unsigned int idx; + + BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); + + idx = p4_config_unpack_metric(config); + if (idx == P4_PEBS_METRIC__none) + return; + + bind = &p4_pebs_bind_map[idx]; + + (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } static void p4_pmu_enable_event(struct perf_event *event) @@ -515,9 +598,7 @@ static void p4_pmu_enable_event(struct perf_event *event) int thread = p4_ht_config_thread(hwc->config); u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); unsigned int idx = p4_config_unpack_event(hwc->config); - unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config); struct p4_event_bind *bind; - struct p4_cache_event_bind *bind_cache; u64 escr_addr, cccr; bind = &p4_event_bind_map[idx]; @@ -537,16 +618,10 @@ static void p4_pmu_enable_event(struct perf_event *event) cccr = p4_config_unpack_cccr(hwc->config); /* - * it could be Cache event so that we need to - * set metrics into additional MSRs + * it could be Cache event so we need to write metrics + * into additional MSRs */ - BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK); - if (idx_cache > P4_CACHE__NONE && - idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) { - bind_cache = &p4_cache_event_bind_map[idx_cache]; - (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs); - (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert); - } + p4_pmu_enable_pebs(hwc->config); (void)checking_wrmsrl(escr_addr, escr_conf); (void)checking_wrmsrl(hwc->config_base + hwc->idx, -- cgit v1.1 From 4c21adf26f8fcf86a755b9b9f55c2e9fd241e1fb Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 20 Jul 2010 16:59:34 -0700 Subject: x86 cpufreq, perf: Make trace_power_frequency cpufreq driver independent and fix the broken case if a core's frequency depends on others. 
trace_power_frequency was implemented in a rather ungeneric way, in the acpi-cpufreq driver's target() function only. -> Move the call to trace_power_frequency to cpufreq.c:cpufreq_notify_transition(), where the CPUFREQ_POSTCHANGE notifier is triggered. This will support power frequency tracing by all cpufreq drivers. trace_power_frequency did not trace frequency changes correctly when the userspace governor was used or when CPU cores' frequencies depend on each other. -> Moving this into the CPUFREQ_POSTCHANGE notifier and passing the cpu which gets switched automatically fixes this. Robert Schoene provided some important fixes on top of my initial quick-shot version, which are integrated in this patch: - Forgot some changes in power_end trace (TP_printk/variable names) - Variable dummy in power_end must now be cpu_id - Use static 64 bit variable instead of unsigned int for cpu_id [akpm@linux-foundation.org: build fix] Signed-off-by: Thomas Renninger Cc: davej@codemonkey.org.uk Signed-off-by: Ingo Molnar Cc: Dave Jones Acked-by: Arjan van de Ven Cc: Robert Schoene Tested-by: Robert Schoene Signed-off-by: Andrew Morton --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 3 --- arch/x86/kernel/process.c | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1d3cdda..cee5263 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -34,7 +34,6 @@ #include #include #include -#include <trace/events/power.h> #include #include @@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); - switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e3521..787572d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -371,7 +371,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -441,7 +441,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - trace_power_start(POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); @@ -457,7 +457,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) static void mwait_idle(void) { if (!need_resched()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); @@ -478,7 +478,7 @@ static void mwait_idle(void) */ static void poll_idle(void) { - trace_power_start(POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); -- cgit v1.1 From cc05152ab72d7a65e6ea97d286af4f878c8f7371 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sat, 31 Jul 2010 22:51:01 +0200 Subject: x86,mmiotrace: Add support for tracing STOS instruction Add support for stos access tracing with mmiotrace.
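Some background on the instruction itself: STOS (opcode 0xAA for the byte form, 0xAB for the 16/32/64-bit forms) has no ModRM byte and always stores the accumulator to ES:[(E/R)DI], which is why the decoder below can hard-wire the source register to AX. As an illustration (not part of this patch), this is the kind of rep-stos store a memset()-style fill may emit:

/* sketch: fill 32-bit words at dst with val, count times */
static void stos_fill32(void *dst, unsigned int val, unsigned long count)
{
	asm volatile("rep stosl"		/* opcode 0xAB, repeated */
		     : "+D" (dst), "+c" (count)	/* implicit (E)DI and (E)CX */
		     : "a" (val)		/* source is fixed: EAX */
		     : "memory");
}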
Signed-off-by: Marcin Slusarz Acked-by: Pekka Paalanen Cc: Nouveau Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Steven Rostedt LKML-Reference: <20100731205101.GA5860@joi.lan> Signed-off-by: Frederic Weisbecker --- arch/x86/mm/pf_in.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index 308e325..38e6d17 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c @@ -40,16 +40,16 @@ static unsigned char prefix_codes[] = { static unsigned int reg_rop[] = { 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F }; -static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; static unsigned int imm_wop[] = { 0xC6, 0xC7 }; /* IA32 Manual 3, 3-432*/ -static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; +static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA }; static unsigned int rw32[] = { - 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F + 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB }; -static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; +static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA }; static unsigned int mw16[] = { 0xB70F, 0xBF0F }; -static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; +static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB }; static unsigned int mw64[] = {}; #else /* not __i386__ */ static unsigned char prefix_codes[] = { @@ -63,20 +63,20 @@ static unsigned char prefix_codes[] = { static unsigned int reg_rop[] = { 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F }; -static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; static unsigned int imm_wop[] = { 0xC6, 0xC7 }; -static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; +static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA }; static unsigned int rw32[] = { - 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F + 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB }; /* 8 bit only */ -static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; +static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA }; /* 16 bit only */ static unsigned int mw16[] = { 0xB70F, 0xBF0F }; /* 16 or 32 bit */ static unsigned int mw32[] = { 0xC7 }; /* 16, 32 or 64 bit */ -static unsigned int mw64[] = { 0x89, 0x8B }; +static unsigned int mw64[] = { 0x89, 0x8B, 0xAB }; #endif /* not __i386__ */ struct prefix_bits { @@ -410,7 +410,6 @@ static unsigned long *get_reg_w32(int no, struct pt_regs *regs) unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) { unsigned int opcode; - unsigned char mod_rm; int reg; unsigned char *p; struct prefix_bits prf; @@ -437,8 +436,13 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) goto err; do_work: - mod_rm = *p; - reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); + /* for STOS, source register is fixed */ + if (opcode == 0xAA || opcode == 0xAB) { + reg = arg_AX; + } else { + unsigned char mod_rm = *p; + reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); + } switch (get_ins_reg_width(ins_addr)) { case 1: return *get_reg_w8(reg, prf.rex, regs); -- cgit v1.1 From 09f86cd093b76b699656eaa82c37ca6d9a02b892 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jul 2010 10:21:22 +0200 Subject: perf, powerpc: Convert the FSL driver to use local64_t For some reason the FSL driver got left out when we converted perf to use local64_t instead of atomic64_t. 
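As a reminder of what the conversion buys: on 64-bit the local64_* operations map to plain local_t ops with no LOCK prefix, while 32-bit falls back to atomic64_t. An abridged sketch of the asm-generic/local64.h mapping (see include/asm-generic/local64.h for the full set of operations):

#if BITS_PER_LONG == 64
typedef struct { local_t a; } local64_t;
#define local64_read(l)			local_read(&(l)->a)
#define local64_add(i, l)		local_add((i), &(l)->a)
#define local64_cmpxchg(l, o, n)	local_cmpxchg(&(l)->a, (o), (n))
#else
typedef struct { atomic64_t a; } local64_t;
#define local64_read(l)			atomic64_read(&(l)->a)
#define local64_add(i, l)		atomic64_add((i), &(l)->a)
#define local64_cmpxchg(l, o, n)	atomic64_cmpxchg(&(l)->a, (o), (n))
#endif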
Signed-off-by: Peter Zijlstra Acked-by: Kumar Gala Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_event_fsl_emb.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c index babccee..fc6a89b 100644 --- a/arch/powerpc/kernel/perf_event_fsl_emb.c +++ b/arch/powerpc/kernel/perf_event_fsl_emb.c @@ -162,15 +162,15 @@ static void fsl_emb_pmu_read(struct perf_event *event) * Therefore we treat them like NMIs. */ do { - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); barrier(); val = read_pmc(event->hw.idx); - } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev); + } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev); /* The counters are only 32 bits wide */ delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &event->hw.period_left); + local64_add(delta, &event->count); + local64_sub(delta, &event->hw.period_left); } /* @@ -296,11 +296,11 @@ static int fsl_emb_pmu_enable(struct perf_event *event) val = 0; if (event->hw.sample_period) { - s64 left = atomic64_read(&event->hw.period_left); + s64 left = local64_read(&event->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; } - atomic64_set(&event->hw.prev_count, val); + local64_set(&event->hw.prev_count, val); write_pmc(i, val); perf_event_update_userpage(event); @@ -371,8 +371,8 @@ static void fsl_emb_pmu_unthrottle(struct perf_event *event) if (left < 0x80000000L) val = 0x80000000L - left; write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); perf_event_update_userpage(event); perf_enable(); local_irq_restore(flags); @@ -500,7 +500,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) return ERR_PTR(-ENOTSUPP); event->hw.last_period = event->hw.sample_period; - atomic64_set(&event->hw.period_left, event->hw.last_period); + local64_set(&event->hw.period_left, event->hw.last_period); /* * See if we need to reserve the PMU. @@ -541,16 +541,16 @@ static void record_and_restart(struct perf_event *event, unsigned long val, int record = 0; /* we don't have to worry about interrupts here */ - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); /* * See if the total period for this event has expired, * and update for the next period. 
*/ val = 0; - left = atomic64_read(&event->hw.period_left) - delta; + left = local64_read(&event->hw.period_left) - delta; if (period) { if (left <= 0) { left += period; @@ -584,8 +584,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val, } write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); perf_event_update_userpage(event); } -- cgit v1.1 From 69e77a8b0426ded5d924eea7dbe4eca51e09f530 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Mon, 2 Aug 2010 17:17:18 -0500 Subject: perf, powerpc: fsl_emb: Restore setting perf_sample_data.period Commit 6b95ed345b9faa4ab3598a82991968f2e9f851bb changed from a struct initializer to perf_sample_data_init(), but the setting of the .period member was left out. Signed-off-by: Scott Wood Cc: stable@kernel.org Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_event_fsl_emb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c index fc6a89b..1ba4547 100644 --- a/arch/powerpc/kernel/perf_event_fsl_emb.c +++ b/arch/powerpc/kernel/perf_event_fsl_emb.c @@ -569,6 +569,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, struct perf_sample_data data; perf_sample_data_init(&data, 0); + data.period = event->hw.last_period; if (perf_event_overflow(event, nmi, &data, regs)) { /* -- cgit v1.1
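For reference, perf_sample_data_init() as assumed by this series initializes only the addr and raw fields, so anything else the overflow path consumes, notably the period, is the caller's job; roughly:

static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr)
{
	/* everything else, including data->period, is up to the caller */
	data->addr = addr;
	data->raw = NULL;
}

which is why the one-line data.period assignment above is needed.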