diff options
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r-- | arch/x86/kernel/cpu/amd.c | 67 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/bugs.c | 7 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 63 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-inject.c | 8 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-internal.h | 12 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 94 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce_intel.c | 168 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event.h | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_amd_ibs.c | 12 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel.c | 25 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel_ds.c | 14 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel_lbr.c | 3 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel_uncore.c | 36 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel_uncore.h | 6 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/proc.c | 5 |
16 files changed, 438 insertions, 88 deletions
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9d92e19..f7e98a2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -737,6 +737,72 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, } #endif +static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) +{ + if (!cpu_has_invlpg) + return; + + tlb_flushall_shift = 5; + + if (c->x86 <= 0x11) + tlb_flushall_shift = 4; +} + +static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c) +{ + u32 ebx, eax, ecx, edx; + u16 mask = 0xfff; + + if (c->x86 < 0xf) + return; + + if (c->extended_cpuid_level < 0x80000006) + return; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; + tlb_lli_4k[ENTRIES] = ebx & mask; + + /* + * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB + * characteristics from the CPUID function 0x80000005 instead. + */ + if (c->x86 == 0xf) { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + mask = 0xff; + } + + /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!((eax >> 16) & mask)) { + u32 a, b, c, d; + + cpuid(0x80000005, &a, &b, &c, &d); + tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff; + } else { + tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + } + + /* a 4M entry uses two 2M entries */ + tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + + /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!(eax & mask)) { + /* Erratum 658 */ + if (c->x86 == 0x15 && c->x86_model <= 0x1f) { + tlb_lli_2m[ENTRIES] = 1024; + } else { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + tlb_lli_2m[ENTRIES] = eax & 0xff; + } + } else + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + + cpu_set_tlb_flushall_shift(c); +} + static const struct cpu_dev __cpuinitconst amd_cpu_dev = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, @@ -756,6 +822,7 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { .c_size_cache = amd_size_cache, #endif .c_early_init = early_init_amd, + .c_detect_tlb = cpu_detect_tlb_amd, .c_bsp_init = bsp_init_amd, .c_init = init_amd, .c_x86_vendor = X86_VENDOR_AMD, diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c97bb7b..d0e910d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -165,10 +165,15 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); #endif check_config(); - check_fpu(); check_hlt(); check_popad(); init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); + + /* + * kernel_fpu_begin/end() in check_fpu() relies on the patched + * alternative instructions. + */ + check_fpu(); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a5fbc3c..7505f7b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -259,23 +259,36 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif -static int disable_smep __cpuinitdata; static __init int setup_disable_smep(char *arg) { - disable_smep = 1; + setup_clear_cpu_cap(X86_FEATURE_SMEP); return 1; } __setup("nosmep", setup_disable_smep); -static __cpuinit void setup_smep(struct cpuinfo_x86 *c) +static __always_inline void setup_smep(struct cpuinfo_x86 *c) { - if (cpu_has(c, X86_FEATURE_SMEP)) { - if (unlikely(disable_smep)) { - setup_clear_cpu_cap(X86_FEATURE_SMEP); - clear_in_cr4(X86_CR4_SMEP); - } else - set_in_cr4(X86_CR4_SMEP); - } + if (cpu_has(c, X86_FEATURE_SMEP)) + set_in_cr4(X86_CR4_SMEP); +} + +static __init int setup_disable_smap(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_SMAP); + return 1; +} +__setup("nosmap", setup_disable_smap); + +static __always_inline void setup_smap(struct cpuinfo_x86 *c) +{ + unsigned long eflags; + + /* This should have been cleared long ago */ + raw_local_save_flags(eflags); + BUG_ON(eflags & X86_EFLAGS_AC); + + if (cpu_has(c, X86_FEATURE_SMAP)) + set_in_cr4(X86_CR4_SMAP); } /* @@ -476,7 +489,7 @@ void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c) printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ - "tlb_flushall_shift is 0x%x\n", + "tlb_flushall_shift: %d\n", tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], @@ -712,8 +725,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) c->cpu_index = 0; filter_cpuid_features(c, false); - setup_smep(c); - if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); } @@ -798,8 +809,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) c->phys_proc_id = c->initial_apicid; } - setup_smep(c); - get_model_name(c); /* Default name */ detect_nopl(c); @@ -864,6 +873,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); + /* Set up SMEP/SMAP */ + setup_smep(c); + setup_smap(c); + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." @@ -942,8 +955,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - if (boot_cpu_data.cpuid_level >= 2) - cpu_detect_tlb(&boot_cpu_data); + cpu_detect_tlb(&boot_cpu_data); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -1023,14 +1035,16 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) printk(KERN_CONT "%s ", vendor); if (c->x86_model_id[0]) - printk(KERN_CONT "%s", c->x86_model_id); + printk(KERN_CONT "%s", strim(c->x86_model_id)); else printk(KERN_CONT "%d86", c->x86); + printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model); + if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT " stepping %02x\n", c->x86_mask); + printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask); else - printk(KERN_CONT "\n"); + printk(KERN_CONT ")\n"); print_cpu_msr(c); } @@ -1113,11 +1127,10 @@ void syscall_init(void) /* Flags to clear on syscall */ wrmsrl(MSR_SYSCALL_MASK, - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| + X86_EFLAGS_IOPL|X86_EFLAGS_AC); } -unsigned long kernel_eflags; - /* * Copies of the original ist values from the tss are only accessed during * debugging, no special alignment required. @@ -1297,9 +1310,6 @@ void __cpuinit cpu_init(void) dbg_restore_debug_regs(); fpu_init(); - xsave_init(); - - raw_local_save_flags(kernel_eflags); if (is_uv_system()) uv_cpu_init(); @@ -1352,6 +1362,5 @@ void __cpuinit cpu_init(void) dbg_restore_debug_regs(); fpu_init(); - xsave_init(); } #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 0a4ce29..198e019 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -648,6 +648,10 @@ static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c) int i, j, n; unsigned int regs[4]; unsigned char *desc = (unsigned char *)regs; + + if (c->cpuid_level < 2) + return; + /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index fc4beb3..ddc72f8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -78,6 +78,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) } static cpumask_var_t mce_inject_cpumask; +static DEFINE_MUTEX(mce_inject_mutex); static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) { @@ -194,7 +195,11 @@ static void raise_mce(struct mce *m) put_online_cpus(); } else #endif + { + preempt_disable(); raise_local(); + preempt_enable(); + } } /* Error injection interface */ @@ -225,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, * so do it a jiffie or two later everywhere. */ schedule_timeout(2); + + mutex_lock(&mce_inject_mutex); raise_mce(&m); + mutex_unlock(&mce_inject_mutex); return usize; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index ed44c8a..6a05c1d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -28,6 +28,18 @@ extern int mce_ser; extern struct mce_bank *mce_banks; +#ifdef CONFIG_X86_MCE_INTEL +unsigned long mce_intel_adjust_timer(unsigned long interval); +void mce_intel_cmci_poll(void); +void mce_intel_hcpu_update(unsigned long cpu); +#else +# define mce_intel_adjust_timer mce_adjust_timer_default +static inline void mce_intel_cmci_poll(void) { } +static inline void mce_intel_hcpu_update(unsigned long cpu) { } +#endif + +void mce_timer_kick(unsigned long interval); + #ifdef CONFIG_ACPI_APEI int apei_write_mce(struct mce *m); ssize_t apei_read_mce(struct mce *m, u64 *record_id); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 292d025..29e87d3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -83,6 +83,7 @@ static int mce_dont_log_ce __read_mostly; int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; +int mce_bios_cmci_threshold __read_mostly; struct mce_bank *mce_banks __read_mostly; @@ -1266,6 +1267,14 @@ static unsigned long check_interval = 5 * 60; /* 5 minutes */ static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); +static unsigned long mce_adjust_timer_default(unsigned long interval) +{ + return interval; +} + +static unsigned long (*mce_adjust_timer)(unsigned long interval) = + mce_adjust_timer_default; + static void mce_timer_fn(unsigned long data) { struct timer_list *t = &__get_cpu_var(mce_timer); @@ -1276,6 +1285,7 @@ static void mce_timer_fn(unsigned long data) if (mce_available(__this_cpu_ptr(&cpu_info))) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); + mce_intel_cmci_poll(); } /* @@ -1283,14 +1293,38 @@ static void mce_timer_fn(unsigned long data) * polling interval, otherwise increase the polling interval. */ iv = __this_cpu_read(mce_next_interval); - if (mce_notify_irq()) + if (mce_notify_irq()) { iv = max(iv / 2, (unsigned long) HZ/100); - else + } else { iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); + iv = mce_adjust_timer(iv); + } __this_cpu_write(mce_next_interval, iv); + /* Might have become 0 after CMCI storm subsided */ + if (iv) { + t->expires = jiffies + iv; + add_timer_on(t, smp_processor_id()); + } +} - t->expires = jiffies + iv; - add_timer_on(t, smp_processor_id()); +/* + * Ensure that the timer is firing in @interval from now. + */ +void mce_timer_kick(unsigned long interval) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long when = jiffies + interval; + unsigned long iv = __this_cpu_read(mce_next_interval); + + if (timer_pending(t)) { + if (time_before(when, t->expires)) + mod_timer_pinned(t, when); + } else { + t->expires = round_jiffies(when); + add_timer_on(t, smp_processor_id()); + } + if (interval < iv) + __this_cpu_write(mce_next_interval, interval); } /* Must not be called in IRQ context where del_timer_sync() can deadlock */ @@ -1585,6 +1619,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); + mce_adjust_timer = mce_intel_adjust_timer; break; case X86_VENDOR_AMD: mce_amd_feature_init(c); @@ -1594,23 +1629,28 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) } } -static void __mcheck_cpu_init_timer(void) +static void mce_start_timer(unsigned int cpu, struct timer_list *t) { - struct timer_list *t = &__get_cpu_var(mce_timer); - unsigned long iv = check_interval * HZ; + unsigned long iv = mce_adjust_timer(check_interval * HZ); - setup_timer(t, mce_timer_fn, smp_processor_id()); + __this_cpu_write(mce_next_interval, iv); - if (mce_ignore_ce) + if (mce_ignore_ce || !iv) return; - __this_cpu_write(mce_next_interval, iv); - if (!iv) - return; t->expires = round_jiffies(jiffies + iv); add_timer_on(t, smp_processor_id()); } +static void __mcheck_cpu_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned int cpu = smp_processor_id(); + + setup_timer(t, mce_timer_fn, cpu); + mce_start_timer(cpu, t); +} + /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { @@ -1907,6 +1947,7 @@ static struct miscdevice mce_chrdev_device = { * check, or 0 to not wait * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. + * mce=bios_cmci_threshold Don't program the CMCI threshold */ static int __init mcheck_enable(char *str) { @@ -1926,6 +1967,8 @@ static int __init mcheck_enable(char *str) mce_ignore_ce = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) mce_bootlog = (str[0] == 'b'); + else if (!strcmp(str, "bios_cmci_threshold")) + mce_bios_cmci_threshold = 1; else if (isdigit(str[0])) { get_option(&str, &tolerant); if (*str == ',') { @@ -2166,6 +2209,11 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { &mce_cmci_disabled }; +static struct dev_ext_attribute dev_attr_bios_cmci_threshold = { + __ATTR(bios_cmci_threshold, 0444, device_show_int, NULL), + &mce_bios_cmci_threshold +}; + static struct device_attribute *mce_device_attrs[] = { &dev_attr_tolerant.attr, &dev_attr_check_interval.attr, @@ -2174,6 +2222,7 @@ static struct device_attribute *mce_device_attrs[] = { &dev_attr_dont_log_ce.attr, &dev_attr_ignore_ce.attr, &dev_attr_cmci_disabled.attr, + &dev_attr_bios_cmci_threshold.attr, NULL }; @@ -2294,38 +2343,33 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) unsigned int cpu = (unsigned long)hcpu; struct timer_list *t = &per_cpu(mce_timer, cpu); - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: mce_device_create(cpu); if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); break; case CPU_DEAD: - case CPU_DEAD_FROZEN: if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); mce_device_remove(cpu); + mce_intel_hcpu_update(cpu); break; case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - del_timer_sync(t); smp_call_function_single(cpu, mce_disable_cpu, &action, 1); + del_timer_sync(t); break; case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - if (!mce_ignore_ce && check_interval) { - t->expires = round_jiffies(jiffies + - per_cpu(mce_next_interval, cpu)); - add_timer_on(t, cpu); - } smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + mce_start_timer(cpu, t); break; - case CPU_POST_DEAD: + } + + if (action == CPU_POST_DEAD) { /* intentionally ignoring frozen here */ cmci_rediscover(cpu); - break; } + return NOTIFY_OK; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 38e49bc9..5f88abf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -15,6 +15,8 @@ #include <asm/msr.h> #include <asm/mce.h> +#include "mce-internal.h" + /* * Support for Intel Correct Machine Check Interrupts. This allows * the CPU to raise an interrupt when a corrected machine check happened. @@ -30,7 +32,22 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); */ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); -#define CMCI_THRESHOLD 1 +#define CMCI_THRESHOLD 1 +#define CMCI_POLL_INTERVAL (30 * HZ) +#define CMCI_STORM_INTERVAL (1 * HZ) +#define CMCI_STORM_THRESHOLD 15 + +static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); +static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); +static DEFINE_PER_CPU(unsigned int, cmci_storm_state); + +enum { + CMCI_STORM_NONE, + CMCI_STORM_ACTIVE, + CMCI_STORM_SUBSIDED, +}; + +static atomic_t cmci_storm_on_cpus; static int cmci_supported(int *banks) { @@ -53,6 +70,93 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } +void mce_intel_cmci_poll(void) +{ + if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) + return; + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); +} + +void mce_intel_hcpu_update(unsigned long cpu) +{ + if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) + atomic_dec(&cmci_storm_on_cpus); + + per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; +} + +unsigned long mce_intel_adjust_timer(unsigned long interval) +{ + int r; + + if (interval < CMCI_POLL_INTERVAL) + return interval; + + switch (__this_cpu_read(cmci_storm_state)) { + case CMCI_STORM_ACTIVE: + /* + * We switch back to interrupt mode once the poll timer has + * silenced itself. That means no events recorded and the + * timer interval is back to our poll interval. + */ + __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); + r = atomic_sub_return(1, &cmci_storm_on_cpus); + if (r == 0) + pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + /* FALLTHROUGH */ + + case CMCI_STORM_SUBSIDED: + /* + * We wait for all cpus to go back to SUBSIDED + * state. When that happens we switch back to + * interrupt mode. + */ + if (!atomic_read(&cmci_storm_on_cpus)) { + __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); + cmci_reenable(); + cmci_recheck(); + } + return CMCI_POLL_INTERVAL; + default: + /* + * We have shiny weather. Let the poll do whatever it + * thinks. + */ + return interval; + } +} + +static bool cmci_storm_detect(void) +{ + unsigned int cnt = __this_cpu_read(cmci_storm_cnt); + unsigned long ts = __this_cpu_read(cmci_time_stamp); + unsigned long now = jiffies; + int r; + + if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) + return true; + + if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { + cnt++; + } else { + cnt = 1; + __this_cpu_write(cmci_time_stamp, now); + } + __this_cpu_write(cmci_storm_cnt, cnt); + + if (cnt <= CMCI_STORM_THRESHOLD) + return false; + + cmci_clear(); + __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); + r = atomic_add_return(1, &cmci_storm_on_cpus); + mce_timer_kick(CMCI_POLL_INTERVAL); + + if (r == 1) + pr_notice("CMCI storm detected: switching to poll mode\n"); + return true; +} + /* * The interrupt handler. This is called on every event. * Just call the poller directly to log any events. @@ -61,33 +165,28 @@ static int cmci_supported(int *banks) */ static void intel_threshold_interrupt(void) { + if (cmci_storm_detect()) + return; machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); mce_notify_irq(); } -static void print_update(char *type, int *hdr, int num) -{ - if (*hdr == 0) - printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); - *hdr = 1; - printk(KERN_CONT " %s:%d", type, num); -} - /* * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks * on this CPU. Use the algorithm recommended in the SDM to discover shared * banks. */ -static void cmci_discover(int banks, int boot) +static void cmci_discover(int banks) { unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); unsigned long flags; - int hdr = 0; int i; + int bios_wrong_thresh = 0; raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; + int bios_zero_thresh = 0; if (test_bit(i, owned)) continue; @@ -96,29 +195,52 @@ static void cmci_discover(int banks, int boot) /* Already owned by someone else? */ if (val & MCI_CTL2_CMCI_EN) { - if (test_and_clear_bit(i, owned) && !boot) - print_update("SHD", &hdr, i); + clear_bit(i, owned); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; } - val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; + if (!mce_bios_cmci_threshold) { + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= CMCI_THRESHOLD; + } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { + /* + * If bios_cmci_threshold boot option was specified + * but the threshold is zero, we'll try to initialize + * it to 1. + */ + bios_zero_thresh = 1; + val |= CMCI_THRESHOLD; + } + + val |= MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ if (val & MCI_CTL2_CMCI_EN) { - if (!test_and_set_bit(i, owned) && !boot) - print_update("CMCI", &hdr, i); + set_bit(i, owned); __clear_bit(i, __get_cpu_var(mce_poll_banks)); + /* + * We are able to set thresholds for some banks that + * had a threshold of 0. This means the BIOS has not + * set the thresholds properly or does not work with + * this boot option. Note down now and report later. + */ + if (mce_bios_cmci_threshold && bios_zero_thresh && + (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) + bios_wrong_thresh = 1; } else { WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); - if (hdr) - printk(KERN_CONT "\n"); + if (mce_bios_cmci_threshold && bios_wrong_thresh) { + pr_info_once( + "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); + pr_info_once( + "bios_cmci_threshold: Make sure your BIOS supports this boot option\n"); + } } /* @@ -156,7 +278,7 @@ void cmci_clear(void) continue; /* Disable CMCI */ rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); + val &= ~MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } @@ -186,7 +308,7 @@ void cmci_rediscover(int dying) continue; /* Recheck banks in case CPUs don't all have the same */ if (cmci_supported(&banks)) - cmci_discover(banks, 0); + cmci_discover(banks); } set_cpus_allowed_ptr(current, old); @@ -200,7 +322,7 @@ void cmci_reenable(void) { int banks; if (cmci_supported(&banks)) - cmci_discover(banks, 0); + cmci_discover(banks); } static void intel_init_cmci(void) @@ -211,7 +333,7 @@ static void intel_init_cmci(void) return; mce_threshold_vector = intel_threshold_interrupt; - cmci_discover(banks, 1); + cmci_discover(banks); /* * For CPU #0 this runs with still disabled APIC, but that's * ok because only the vector is set up. We still do another diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6605a81..8b6defe 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -586,6 +586,8 @@ extern struct event_constraint intel_westmere_pebs_event_constraints[]; extern struct event_constraint intel_snb_pebs_event_constraints[]; +extern struct event_constraint intel_ivb_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_enable(struct perf_event *event); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 7bfb5be..eebd5ff 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -209,6 +209,15 @@ static int perf_ibs_precise_event(struct perf_event *event, u64 *config) return -EOPNOTSUPP; } +static const struct perf_event_attr ibs_notsupp = { + .exclude_user = 1, + .exclude_kernel = 1, + .exclude_hv = 1, + .exclude_idle = 1, + .exclude_host = 1, + .exclude_guest = 1, +}; + static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -229,6 +238,9 @@ static int perf_ibs_init(struct perf_event *event) if (event->pmu != &perf_ibs->pmu) return -ENOENT; + if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp)) + return -EINVAL; + if (config & ~perf_ibs->config_mask) return -EINVAL; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7f2739e..6bca492 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2008,6 +2008,7 @@ __init int intel_pmu_init(void) break; case 28: /* Atom */ + case 54: /* Cedariew */ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -2047,7 +2048,6 @@ __init int intel_pmu_init(void) case 42: /* SandyBridge */ case 45: /* SandyBridge, "Romely-EP" */ x86_add_quirk(intel_sandybridge_quirk); - case 58: /* IvyBridge */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, @@ -2072,6 +2072,29 @@ __init int intel_pmu_init(void) pr_cont("SandyBridge events, "); break; + case 58: /* IvyBridge */ + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_snb_event_constraints; + x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + + pr_cont("IvyBridge events, "); + break; + default: switch (x86_pmu.version) { diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index e38d97b..826054a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -407,6 +407,20 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_ivb_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 520b426..da02e9c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -686,7 +686,8 @@ void intel_pmu_lbr_init_atom(void) * to have an operational LBR which can freeze * on PMU interrupt */ - if (boot_cpu_data.x86_mask < 10) { + if (boot_cpu_data.x86_model == 28 + && boot_cpu_data.x86_mask < 10) { pr_cont("LBR disabled due to erratum"); return; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 0a55710..99d96a4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -661,6 +661,11 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box) } } +static struct uncore_event_desc snb_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + { /* end: all zeroes */ }, +}; + static struct attribute *snb_uncore_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -704,6 +709,7 @@ static struct intel_uncore_type snb_uncore_cbox = { .constraints = snb_uncore_cbox_constraints, .ops = &snb_uncore_msr_ops, .format_group = &snb_uncore_format_group, + .event_descs = snb_uncore_events, }; static struct intel_uncore_type *snb_msr_uncores[] = { @@ -1944,7 +1950,7 @@ struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cp static struct intel_uncore_box * uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { - static struct intel_uncore_box *box; + struct intel_uncore_box *box; box = *per_cpu_ptr(pmu->box, cpu); if (box) @@ -2341,6 +2347,27 @@ int uncore_pmu_event_init(struct perf_event *event) return ret; } +static ssize_t uncore_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask); + + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); + +static struct attribute *uncore_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group uncore_pmu_attr_group = { + .attrs = uncore_pmu_attrs, +}; + static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) { int ret; @@ -2378,8 +2405,8 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) free_percpu(type->pmus[i].box); kfree(type->pmus); type->pmus = NULL; - kfree(type->attr_groups[1]); - type->attr_groups[1] = NULL; + kfree(type->events_group); + type->events_group = NULL; } static void __init uncore_types_exit(struct intel_uncore_type **types) @@ -2431,9 +2458,10 @@ static int __init uncore_type_init(struct intel_uncore_type *type) for (j = 0; j < i; j++) attrs[j] = &type->event_descs[j].attr.attr; - type->attr_groups[1] = events_group; + type->events_group = events_group; } + type->pmu_group = &uncore_pmu_attr_group; type->pmus = pmus; return 0; fail: diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 5b81c18..e68a455 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -369,10 +369,12 @@ struct intel_uncore_type { struct intel_uncore_pmu *pmus; struct intel_uncore_ops *ops; struct uncore_event_desc *event_descs; - const struct attribute_group *attr_groups[3]; + const struct attribute_group *attr_groups[4]; }; -#define format_group attr_groups[0] +#define pmu_group attr_groups[0] +#define format_group attr_groups[1] +#define events_group attr_groups[2] struct intel_uncore_ops { void (*init_box)(struct intel_uncore_box *); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 8022c66..fbd8955 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -140,10 +140,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) static void *c_start(struct seq_file *m, loff_t *pos) { - if (*pos == 0) /* just in case, cpu 0 is not the first */ - *pos = cpumask_first(cpu_online_mask); - else - *pos = cpumask_next(*pos - 1, cpu_online_mask); + *pos = cpumask_next(*pos - 1, cpu_online_mask); if ((*pos) < nr_cpu_ids) return &cpu_data(*pos); return NULL; |