diff options
author | Ingo Molnar <mingo@kernel.org> | 2012-03-26 17:18:44 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2012-03-26 17:19:03 +0200 |
commit | 7fd52392c56361a40f0c630a82b36b95ca31eac6 (patch) | |
tree | 14091de24c6b28ea4cae9826f98aeedb7be091f5 /arch/x86/kernel | |
parent | b01c3a0010aabadf745f3e7fdb9cab682e0a28a2 (diff) | |
parent | e22057c8599373e5caef0bc42bdb95d2a361ab0d (diff) | |
download | op-kernel-dev-7fd52392c56361a40f0c630a82b36b95ca31eac6.zip op-kernel-dev-7fd52392c56361a40f0c630a82b36b95ca31eac6.tar.gz |
Merge branch 'linus' into perf/urgent
Merge reason: we need to fix a non-trivial merge conflict.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/kernel')
46 files changed, 578 insertions, 258 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index ce664f3..406ed77 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include <acpi/processor.h> -static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 8c3cdde..359b689 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -180,6 +180,7 @@ static struct apic apic_flat = { .name = "flat", .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, @@ -337,6 +338,7 @@ static struct apic apic_physflat = { .name = "physical flat", .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 775b82b..634ae6c 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -124,6 +124,7 @@ struct apic apic_noop = { .probe = noop_probe, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 09d3d8c..d9ea5f3 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -56,6 +56,12 @@ static unsigned int read_xapic_id(void) return get_apic_id(apic_read(APIC_ID)); } +static int numachip_apic_id_valid(int apicid) +{ + /* Trust what bootloader passes in MADT */ + return 1; +} + static int numachip_apic_id_registered(void) { return physid_isset(read_xapic_id(), phys_cpu_present_map); @@ -223,10 +229,11 @@ static int __init numachip_system_init(void) } early_initcall(numachip_system_init); -static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int __cpuinit numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strncmp(oem_id, "NUMASC", 6)) { numachip_system = 1; + setup_force_cpu_cap(X86_FEATURE_X2APIC); return 1; } @@ -238,6 +245,7 @@ static struct apic apic_numachip __refconst = { .name = "NumaConnect system", .probe = numachip_probe, .acpi_madt_oem_check = numachip_acpi_madt_oem_check, + .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 521bead..0cdec70 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -198,6 +198,7 @@ static struct apic apic_bigsmp = { .name = "bigsmp", .probe = probe_bigsmp, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = bigsmp_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 5d513bc..e42d1d3b9 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -625,6 +625,7 @@ static struct apic __refdata apic_es7000_cluster = { .name = "es7000", .probe = probe_es7000, .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = es7000_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, @@ -690,6 +691,7 @@ static struct apic __refdata apic_es7000 = { .name = "es7000", .probe = probe_es7000, .acpi_madt_oem_check = es7000_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = es7000_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fb07275..6d10a66 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3967,18 +3967,36 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi) static __init int bad_ioapic(unsigned long address) { if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded " - "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); + pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n", + MAX_IO_APICS, nr_ioapics); return 1; } if (!address) { - printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address" - " found in table, skipping!\n"); + pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n"); return 1; } return 0; } +static __init int bad_ioapic_register(int idx) +{ + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + + reg_00.raw = io_apic_read(idx, 0); + reg_01.raw = io_apic_read(idx, 1); + reg_02.raw = io_apic_read(idx, 2); + + if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) { + pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n", + mpc_ioapic_addr(idx)); + return 1; + } + + return 0; +} + void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) { int idx = 0; @@ -3995,6 +4013,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) ioapics[idx].mp_config.apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + + if (bad_ioapic_register(idx)) { + clear_fixmap(FIX_IO_APIC_BASE_0 + idx); + return; + } + ioapics[idx].mp_config.apicid = io_apic_unique_id(id); ioapics[idx].mp_config.apicver = io_apic_get_version(idx); @@ -4015,10 +4039,10 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) if (gsi_cfg->gsi_end >= gsi_top) gsi_top = gsi_cfg->gsi_end + 1; - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mpc_ioapic_id(idx), - mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), - gsi_cfg->gsi_base, gsi_cfg->gsi_end); + pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n", + idx, mpc_ioapic_id(idx), + mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + gsi_cfg->gsi_base, gsi_cfg->gsi_end); nr_ioapics++; } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index c4a61ca..00d2422 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -478,6 +478,7 @@ static struct apic __refdata apic_numaq = { .name = "NUMAQ", .probe = probe_numaq, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = numaq_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 0787bb3..ff2c1b9 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -92,6 +92,7 @@ static struct apic apic_default = { .name = "default", .probe = probe_default, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 1911442..fea000b 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -496,6 +496,7 @@ static struct apic apic_summit = { .name = "summit", .probe = probe_summit, .acpi_madt_oem_check = summit_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = summit_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 5007958..9193713 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -213,6 +213,7 @@ static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index f5373df..bcd1db6 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -119,6 +119,7 @@ static struct apic apic_x2apic_phys = { .name = "physical x2apic", .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 79b05b8..fc47714 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -351,6 +351,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index f76623c..5d56931 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1234,8 +1234,7 @@ static int suspend(int vetoable) struct apm_user *as; dpm_suspend_start(PMSG_SUSPEND); - - dpm_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_end(PMSG_SUSPEND); local_irq_disable(); syscore_suspend(); @@ -1259,9 +1258,9 @@ static int suspend(int vetoable) syscore_resume(); local_irq_enable(); - dpm_resume_noirq(PMSG_RESUME); - + dpm_resume_start(PMSG_RESUME); dpm_resume_end(PMSG_RESUME); + queue_event(APM_NORMAL_RESUME, NULL); spin_lock(&user_list_lock); for (as = user_list; as != NULL; as = as->next) { @@ -1277,7 +1276,7 @@ static void standby(void) { int err; - dpm_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_end(PMSG_SUSPEND); local_irq_disable(); syscore_suspend(); @@ -1291,7 +1290,7 @@ static void standby(void) syscore_resume(); local_irq_enable(); - dpm_resume_noirq(PMSG_RESUME); + dpm_resume_start(PMSG_RESUME); } static apm_event_t get_event(void) diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 25f24dc..6ab6aa2 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -16,6 +16,7 @@ obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o obj-y += rdrand.o +obj-y += match.o obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c0f7d68..e494774 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -18,6 +18,7 @@ #include <asm/archrandom.h> #include <asm/hypervisor.h> #include <asm/processor.h> +#include <asm/debugreg.h> #include <asm/sections.h> #include <linux/topology.h> #include <linux/cpumask.h> @@ -28,6 +29,7 @@ #include <asm/apic.h> #include <asm/desc.h> #include <asm/i387.h> +#include <asm/fpu-internal.h> #include <asm/mtrr.h> #include <linux/numa.h> #include <asm/asm.h> @@ -933,7 +935,7 @@ static const struct msr_range msr_range_array[] __cpuinitconst = { { 0xc0011000, 0xc001103b}, }; -static void __cpuinit print_cpu_msr(void) +static void __cpuinit __print_cpu_msr(void) { unsigned index_min, index_max; unsigned index; @@ -997,13 +999,13 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) else printk(KERN_CONT "\n"); -#ifdef CONFIG_SMP + __print_cpu_msr(); +} + +void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c) +{ if (c->cpu_index < show_msr) - print_cpu_msr(); -#else - if (show_msr) - print_cpu_msr(); -#endif + __print_cpu_msr(); } static __init int setup_disablecpuid(char *arg) @@ -1045,7 +1047,6 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(unsigned int, irq_count) = -1; DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); -EXPORT_PER_CPU_SYMBOL(fpu_owner_task); /* * Special IST stacks which the CPU switches to when it calls @@ -1115,7 +1116,6 @@ void debug_stack_reset(void) DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); -EXPORT_PER_CPU_SYMBOL(fpu_owner_task); #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c new file mode 100644 index 0000000..5502b28 --- /dev/null +++ b/arch/x86/kernel/cpu/match.c @@ -0,0 +1,91 @@ +#include <asm/cpu_device_id.h> +#include <asm/processor.h> +#include <linux/cpu.h> +#include <linux/module.h> +#include <linux/slab.h> + +/** + * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * @match: Pointer to array of x86_cpu_ids. Last entry terminated with + * {}. + * + * Return the entry if the current CPU matches the entries in the + * passed x86_cpu_id match table. Otherwise NULL. The match table + * contains vendor (X86_VENDOR_*), family, model and feature bits or + * respective wildcard entries. + * + * A typical table entry would be to match a specific CPU + * { X86_VENDOR_INTEL, 6, 0x12 } + * or to match a specific CPU feature + * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } + * + * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, + * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) + * + * Arrays used to match for this should also be declared using + * MODULE_DEVICE_TABLE(x86_cpu, ...) + * + * This always matches against the boot cpu, assuming models and features are + * consistent over all CPUs. + */ +const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) +{ + const struct x86_cpu_id *m; + struct cpuinfo_x86 *c = &boot_cpu_data; + + for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) + continue; + if (m->family != X86_FAMILY_ANY && c->x86 != m->family) + continue; + if (m->model != X86_MODEL_ANY && c->x86_model != m->model) + continue; + if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) + continue; + return m; + } + return NULL; +} +EXPORT_SYMBOL(x86_match_cpu); + +ssize_t arch_print_cpu_modalias(struct device *dev, + struct device_attribute *attr, + char *bufptr) +{ + int size = PAGE_SIZE; + int i, n; + char *buf = bufptr; + + n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:" + "model:%04X:feature:", + boot_cpu_data.x86_vendor, + boot_cpu_data.x86, + boot_cpu_data.x86_model); + size -= n; + buf += n; + size -= 1; + for (i = 0; i < NCAPINTS*32; i++) { + if (boot_cpu_has(i)) { + n = snprintf(buf, size, ",%04X", i); + if (n >= size) { + WARN(1, "x86 features overflow page\n"); + break; + } + size -= n; + buf += n; + } + } + *buf++ = '\n'; + return buf - bufptr; +} + +int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (buf) { + arch_print_cpu_modalias(NULL, NULL, buf); + add_uevent_var(env, "MODALIAS=%s", buf); + kfree(buf); + } + return 0; +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 7395d5f..0c82091 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -54,7 +54,14 @@ static struct severity { #define MASK(x, y) .mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) #define MCACOD 0xffff +/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */ +#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ +#define MCACOD_SCRUBMSK 0xfff0 +#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ +#define MCACOD_DATA 0x0134 /* Data Load */ +#define MCACOD_INSTR 0x0150 /* Instruction Fetch */ MCESEV( NO, "Invalid", @@ -102,11 +109,24 @@ static struct severity { SER, BITCLR(MCI_STATUS_S) ), - /* AR add known MCACODs here */ MCESEV( PANIC, "Action required with lost events", SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) ), + + /* known AR MCACODs: */ +#ifdef CONFIG_MEMORY_FAILURE + MCESEV( + KEEP, "HT thread notices Action required: data load error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + MCGMASK(MCG_STATUS_EIPV, 0) + ), + MCESEV( + AR, "Action required: data load error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + USER + ), +#endif MCESEV( PANIC, "Action required: unknown MCACOD", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) @@ -115,11 +135,11 @@ static struct severity { /* known AO MCACODs: */ MCESEV( AO, "Action optional: memory scrubbing error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0) + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) ), MCESEV( AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a) + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) ), MCESEV( SOME, "Action optional: unknown MCACOD", diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5a11ae2..d086a09 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -191,7 +191,7 @@ static void drain_mcelog_buffer(void) { unsigned int next, i, prev = 0; - next = rcu_dereference_check_mce(mcelog.next); + next = ACCESS_ONCE(mcelog.next); do { struct mce *m; @@ -540,6 +540,27 @@ static void mce_report_event(struct pt_regs *regs) irq_work_queue(&__get_cpu_var(mce_irq_work)); } +/* + * Read ADDR and MISC registers. + */ +static void mce_read_aux(struct mce *m, int i) +{ + if (m->status & MCI_STATUS_MISCV) + m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); + if (m->status & MCI_STATUS_ADDRV) { + m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + + /* + * Mask the reported address by the reported granularity. + */ + if (mce_ser && (m->status & MCI_STATUS_MISCV)) { + u8 shift = MCI_MISC_ADDR_LSB(m->misc); + m->addr >>= shift; + m->addr <<= shift; + } + } +} + DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -590,10 +611,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) continue; - if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); - if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + mce_read_aux(&m, i); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -917,6 +935,49 @@ static void mce_clear_state(unsigned long *toclear) } /* + * Need to save faulting physical address associated with a process + * in the machine check handler some place where we can grab it back + * later in mce_notify_process() + */ +#define MCE_INFO_MAX 16 + +struct mce_info { + atomic_t inuse; + struct task_struct *t; + __u64 paddr; +} mce_info[MCE_INFO_MAX]; + +static void mce_save_info(__u64 addr) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { + if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { + mi->t = current; + mi->paddr = addr; + return; + } + } + + mce_panic("Too many concurrent recoverable errors", NULL, NULL); +} + +static struct mce_info *mce_find_info(void) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) + if (atomic_read(&mi->inuse) && mi->t == current) + return mi; + return NULL; +} + +static void mce_clear_info(struct mce_info *mi) +{ + atomic_set(&mi->inuse, 0); +} + +/* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. * @@ -969,7 +1030,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) barrier(); /* - * When no restart IP must always kill or panic. + * When no restart IP might need to kill or panic. + * Assume the worst for now, but if we find the + * severity is MCE_AR_SEVERITY we have other options. */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) kill_it = 1; @@ -1023,16 +1086,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) continue; } - /* - * Kill on action required. - */ - if (severity == MCE_AR_SEVERITY) - kill_it = 1; - - if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); - if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + mce_read_aux(&m, i); /* * Action optional error. Queue address for later processing. @@ -1052,6 +1106,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) } } + /* mce_clear_state will clear *final, save locally for use later */ + m = *final; + if (!no_way_out) mce_clear_state(toclear); @@ -1063,27 +1120,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) no_way_out = worst >= MCE_PANIC_SEVERITY; /* - * If we have decided that we just CAN'T continue, and the user - * has not set tolerant to an insane level, give up and die. - * - * This is mainly used in the case when the system doesn't - * support MCE broadcasting or it has been disabled. - */ - if (no_way_out && tolerant < 3) - mce_panic("Fatal machine check on current CPU", final, msg); - - /* - * If the error seems to be unrecoverable, something should be - * done. Try to kill as little as possible. If we can kill just - * one task, do that. If the user has set the tolerance very - * high, don't try to do anything at all. + * At insane "tolerant" levels we take no action. Otherwise + * we only die if we have no other choice. For less serious + * issues we try to recover, or limit damage to the current + * process. */ - - if (kill_it && tolerant < 3) - force_sig(SIGBUS, current); - - /* notify userspace ASAP */ - set_thread_flag(TIF_MCE_NOTIFY); + if (tolerant < 3) { + if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); + if (worst == MCE_AR_SEVERITY) { + /* schedule action before return to userland */ + mce_save_info(m.addr); + set_thread_flag(TIF_MCE_NOTIFY); + } else if (kill_it) { + force_sig(SIGBUS, current); + } + } if (worst > 0) mce_report_event(regs); @@ -1094,34 +1146,57 @@ out: } EXPORT_SYMBOL_GPL(do_machine_check); -/* dummy to break dependency. actual code is in mm/memory-failure.c */ -void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) +#ifndef CONFIG_MEMORY_FAILURE +int memory_failure(unsigned long pfn, int vector, int flags) { - printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); + /* mce_severity() should not hand us an ACTION_REQUIRED error */ + BUG_ON(flags & MF_ACTION_REQUIRED); + printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); + + return 0; } +#endif /* - * Called after mce notification in process context. This code - * is allowed to sleep. Call the high level VM handler to process - * any corrupted pages. - * Assume that the work queue code only calls this one at a time - * per CPU. - * Note we don't disable preemption, so this code might run on the wrong - * CPU. In this case the event is picked up by the scheduled work queue. - * This is merely a fast path to expedite processing in some common - * cases. + * Called in process context that interrupted by MCE and marked with + * TIF_MCE_NOTIFY, just before returning to erroneous userland. + * This code is allowed to sleep. + * Attempt possible recovery such as calling the high level VM handler to + * process any corrupted pages, and kill/signal current process if required. + * Action required errors are handled here. */ void mce_notify_process(void) { unsigned long pfn; - mce_notify_irq(); - while (mce_ring_get(&pfn)) - memory_failure(pfn, MCE_VECTOR); + struct mce_info *mi = mce_find_info(); + + if (!mi) + mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); + pfn = mi->paddr >> PAGE_SHIFT; + + clear_thread_flag(TIF_MCE_NOTIFY); + + pr_err("Uncorrected hardware memory error in user-access at %llx", + mi->paddr); + if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { + pr_err("Memory error not recovered"); + force_sig(SIGBUS, current); + } + mce_clear_info(mi); } +/* + * Action optional processing happens here (picking up + * from the list of faulting pages that do_machine_check() + * placed into the "ring"). + */ static void mce_process_work(struct work_struct *dummy) { - mce_notify_process(); + unsigned long pfn; + + while (mce_ring_get(&pfn)) + memory_failure(pfn, MCE_VECTOR, 0); } #ifdef CONFIG_X86_MCE_INTEL @@ -1211,8 +1286,6 @@ int mce_notify_irq(void) /* Not more than two messages every minute */ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - clear_thread_flag(TIF_MCE_NOTIFY); - if (test_and_clear_bit(0, &mce_need_notify)) { /* wake processes polling /dev/mcelog */ wake_up_interruptible(&mce_chrdev_wait); @@ -1541,6 +1614,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize) /* Error or no more MCE record */ if (rc <= 0) { mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; return rc; } rc = -EFAULT; @@ -1859,7 +1938,7 @@ static struct bus_type mce_subsys = { .dev_name = "machinecheck", }; -struct device *mce_device[CONFIG_NR_CPUS]; +DEFINE_PER_CPU(struct device *, mce_device); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -2038,7 +2117,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) goto error2; } cpumask_set_cpu(cpu, mce_device_initialized); - mce_device[cpu] = dev; + per_cpu(mce_device, cpu) = dev; return 0; error2: @@ -2055,7 +2134,7 @@ error: static __cpuinit void mce_device_remove(unsigned int cpu) { - struct device *dev = mce_device[cpu]; + struct device *dev = per_cpu(mce_device, cpu); int i; if (!cpumask_test_cpu(cpu, mce_device_initialized)) @@ -2069,7 +2148,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) device_unregister(dev); cpumask_clear_cpu(cpu, mce_device_initialized); - mce_device[cpu] = NULL; + per_cpu(mce_device, cpu) = NULL; } /* Make sure there are no machine checks on offlined CPUs. */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index e4eeaaf..99b5717 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -523,7 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; - struct device *dev = mce_device[cpu]; + struct device *dev = per_cpu(mce_device, cpu); char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -587,7 +587,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - dev = mce_device[i]; + dev = per_cpu(mce_device, i); if (dev) err = sysfs_create_link(&dev->kobj,b->kobj, name); if (err) @@ -667,7 +667,8 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&mce_device[cpu]->kobj, name); + dev = per_cpu(mce_device, cpu); + sysfs_remove_link(&dev->kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -679,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - dev = mce_device[i]; + dev = per_cpu(mce_device, i); if (dev) sysfs_remove_link(&dev->kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4ef8104..f02672a 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -643,14 +643,14 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) /* Prefer fixed purpose counters */ if (x86_pmu.num_counters_fixed) { idx = X86_PMC_IDX_FIXED; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index c7f64e6..addf9e8 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,6 +40,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 642f75a..11891ca 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -62,16 +62,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!userbuf) { memcpy(buf, (vaddr + offset), csize); - kunmap_atomic(vaddr, KM_PTE0); + kunmap_atomic(vaddr); } else { if (!kdump_buf_page) { printk(KERN_WARNING "Kdump: Kdump buffer page not" " allocated\n"); - kunmap_atomic(vaddr, KM_PTE0); + kunmap_atomic(vaddr); return -EFAULT; } copy_page(kdump_buf_page, vaddr); - kunmap_atomic(vaddr, KM_PTE0); + kunmap_atomic(vaddr); if (copy_to_user(buf, (kdump_buf_page + offset), csize)) return -EFAULT; } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 5282179..3ae2ced 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -4,6 +4,7 @@ #include <linux/bootmem.h> #include <linux/export.h> #include <linux/io.h> +#include <linux/irqdomain.h> #include <linux/interrupt.h> #include <linux/list.h> #include <linux/of.h> @@ -17,64 +18,14 @@ #include <linux/initrd.h> #include <asm/hpet.h> -#include <asm/irq_controller.h> #include <asm/apic.h> #include <asm/pci_x86.h> __initdata u64 initial_dtb; char __initdata cmd_line[COMMAND_LINE_SIZE]; -static LIST_HEAD(irq_domains); -static DEFINE_RAW_SPINLOCK(big_irq_lock); int __initdata of_ioapic; -#ifdef CONFIG_X86_IO_APIC -static void add_interrupt_host(struct irq_domain *ih) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&big_irq_lock, flags); - list_add(&ih->l, &irq_domains); - raw_spin_unlock_irqrestore(&big_irq_lock, flags); -} -#endif - -static struct irq_domain *get_ih_from_node(struct device_node *controller) -{ - struct irq_domain *ih, *found = NULL; - unsigned long flags; - - raw_spin_lock_irqsave(&big_irq_lock, flags); - list_for_each_entry(ih, &irq_domains, l) { - if (ih->controller == controller) { - found = ih; - break; - } - } - raw_spin_unlock_irqrestore(&big_irq_lock, flags); - return found; -} - -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - struct irq_domain *ih; - u32 virq, type; - int ret; - - ih = get_ih_from_node(controller); - if (!ih) - return 0; - ret = ih->xlate(ih, intspec, intsize, &virq, &type); - if (ret) - return 0; - if (type == IRQ_TYPE_NONE) - return virq; - irq_set_irq_type(virq, type); - return virq; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); - unsigned long pci_address_to_pio(phys_addr_t address) { /* @@ -354,36 +305,43 @@ static struct of_ioapic_type of_ioapic_type[] = }, }; -static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, - u32 *out_hwirq, u32 *out_type) +static int ioapic_xlate(struct irq_domain *domain, + struct device_node *controller, + const u32 *intspec, u32 intsize, + irq_hw_number_t *out_hwirq, u32 *out_type) { - struct mp_ioapic_gsi *gsi_cfg; struct io_apic_irq_attr attr; struct of_ioapic_type *it; - u32 line, idx, type; + u32 line, idx; + int rc; - if (intsize < 2) + if (WARN_ON(intsize < 2)) return -EINVAL; - line = *intspec; - idx = (u32) id->priv; - gsi_cfg = mp_ioapic_gsi_routing(idx); - *out_hwirq = line + gsi_cfg->gsi_base; - - intspec++; - type = *intspec; + line = intspec[0]; - if (type >= ARRAY_SIZE(of_ioapic_type)) + if (intspec[1] >= ARRAY_SIZE(of_ioapic_type)) return -EINVAL; - it = of_ioapic_type + type; - *out_type = it->out_type; + it = &of_ioapic_type[intspec[1]]; + idx = (u32) domain->host_data; set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); - return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr); + rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line), + cpu_to_node(0), &attr); + if (rc) + return rc; + + *out_hwirq = line; + *out_type = it->out_type; + return 0; } +const struct irq_domain_ops ioapic_irq_domain_ops = { + .xlate = ioapic_xlate, +}; + static void __init ioapic_add_ofnode(struct device_node *np) { struct resource r; @@ -399,13 +357,14 @@ static void __init ioapic_add_ofnode(struct device_node *np) for (i = 0; i < nr_ioapics; i++) { if (r.start == mpc_ioapic_addr(i)) { struct irq_domain *id; + struct mp_ioapic_gsi *gsi_cfg; + + gsi_cfg = mp_ioapic_gsi_routing(i); - id = kzalloc(sizeof(*id), GFP_KERNEL); + id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0, + &ioapic_irq_domain_ops, + (void*)i); BUG_ON(!id); - id->controller = np; - id->xlate = ioapic_xlate; - id->priv = (void *)i; - add_interrupt_host(id); return; } } diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index c99f9ed..88ec912 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -87,7 +87,7 @@ void show_registers(struct pt_regs *regs) int i; print_modules(); - __show_regs(regs, 0); + __show_regs(regs, !user_mode_vm(regs)); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", TASK_COMM_LEN, current->comm, task_pid_nr(current), diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 79d97e6..7b784f4 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -98,12 +98,6 @@ #endif .endm -#ifdef CONFIG_VM86 -#define resume_userspace_sig check_userspace -#else -#define resume_userspace_sig resume_userspace -#endif - /* * User gs save/restore * @@ -327,10 +321,19 @@ ret_from_exception: preempt_stop(CLBR_ANY) ret_from_intr: GET_THREAD_INFO(%ebp) -check_userspace: +resume_userspace_sig: +#ifdef CONFIG_VM86 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS movb PT_CS(%esp), %al andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else + /* + * We can be coming here from a syscall done in the kernel space, + * e.g. a failed kernel_execve(). + */ + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif cmpl $USER_RPL, %eax jb resume_kernel # not returning to v8086 or userspace diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1333d98..734ebd1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -320,7 +320,7 @@ ENDPROC(native_usergs_sysret64) movq %rsp, %rsi leaq -RBP(%rsp),%rdi /* arg1 for handler */ - testl $3, CS(%rdi) + testl $3, CS-RBP(%rsi) je 1f SWAPGS /* @@ -330,11 +330,10 @@ ENDPROC(native_usergs_sysret64) * moving irq_enter into assembly, which would be too much work) */ 1: incl PER_CPU_VAR(irq_count) - jne 2f - mov PER_CPU_VAR(irq_stack_ptr),%rsp + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp CFI_DEF_CFA_REGISTER rsi -2: /* Store previous stack value */ + /* Store previous stack value */ pushq %rsi CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ 0x77 /* DW_OP_breg7 */, 0, \ @@ -813,7 +812,7 @@ ret_from_intr: /* Restore saved previous stack */ popq %rsi - CFI_DEF_CFA_REGISTER rsi + CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ leaq ARGOFFSET-RBP(%rsi), %rsp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET @@ -1530,6 +1529,7 @@ ENTRY(nmi) /* Use %rdx as out temp variable throughout */ pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 /* * If %cs was not the kernel segment, then the NMI triggered in user @@ -1554,6 +1554,7 @@ ENTRY(nmi) */ lea 6*8(%rsp), %rdx test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + CFI_REMEMBER_STATE nested_nmi: /* @@ -1585,10 +1586,12 @@ nested_nmi: nested_nmi_out: popq_cfi %rdx + CFI_RESTORE rdx /* No need to check faults here */ INTERRUPT_RETURN + CFI_RESTORE_STATE first_nmi: /* * Because nested NMIs will use the pushed location that we @@ -1620,10 +1623,15 @@ first_nmi: * | pt_regs | * +-------------------------+ * - * The saved RIP is used to fix up the copied RIP that a nested - * NMI may zero out. The original stack frame and the temp storage + * The saved stack frame is used to fix up the copied stack frame + * that a nested NMI may change to make the interrupted NMI iret jump + * to the repeat_nmi. The original stack frame and the temp storage * is also used by nested NMIs and can not be trusted on exit. */ + /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ + movq (%rsp), %rdx + CFI_RESTORE rdx + /* Set the NMI executing variable on the stack. */ pushq_cfi $1 @@ -1631,22 +1639,39 @@ first_nmi: .rept 5 pushq_cfi 6*8(%rsp) .endr + CFI_DEF_CFA_OFFSET SS+8-RIP + + /* Everything up to here is safe from nested NMIs */ + + /* + * If there was a nested NMI, the first NMI's iret will return + * here. But NMIs are still enabled and we can take another + * nested NMI. The nested NMI checks the interrupted RIP to see + * if it is between repeat_nmi and end_repeat_nmi, and if so + * it will just return, as we are about to repeat an NMI anyway. + * This makes it safe to copy to the stack frame that a nested + * NMI will update. + */ +repeat_nmi: + /* + * Update the stack variable to say we are still in NMI (the update + * is benign for the non-repeat case, where 1 was pushed just above + * to this very stack slot). + */ + movq $1, 5*8(%rsp) /* Make another copy, this one may be modified by nested NMIs */ .rept 5 pushq_cfi 4*8(%rsp) .endr - - /* Do not pop rdx, nested NMIs will corrupt it */ - movq 11*8(%rsp), %rdx + CFI_DEF_CFA_OFFSET SS+8-RIP +end_repeat_nmi: /* * Everything below this point can be preempted by a nested - * NMI if the first NMI took an exception. Repeated NMIs - * caused by an exception and nested NMI will start here, and - * can still be preempted by another NMI. + * NMI if the first NMI took an exception and reset our iret stack + * so that we repeat another NMI. */ -restart_nmi: pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1675,26 +1700,6 @@ nmi_restore: CFI_ENDPROC END(nmi) - /* - * If an NMI hit an iret because of an exception or breakpoint, - * it can lose its NMI context, and a nested NMI may come in. - * In that case, the nested NMI will change the preempted NMI's - * stack to jump to here when it does the final iret. - */ -repeat_nmi: - INTR_FRAME - /* Update the stack variable to say we are still in NMI */ - movq $1, 5*8(%rsp) - - /* copy the saved stack back to copy stack */ - .rept 5 - pushq_cfi 4*8(%rsp) - .endr - - jmp restart_nmi - CFI_ENDPROC -end_repeat_nmi: - ENTRY(ignore_sysret) CFI_STARTPROC mov $-ENOSYS,%eax diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 739d859..7734bcb 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -16,6 +16,7 @@ #include <asm/uaccess.h> #include <asm/ptrace.h> #include <asm/i387.h> +#include <asm/fpu-internal.h> #include <asm/user.h> #ifdef CONFIG_X86_64 @@ -32,6 +33,86 @@ # define user32_fxsr_struct user_fxsr_struct #endif +/* + * Were we in an interrupt that interrupted kernel mode? + * + * We can do a kernel_fpu_begin/end() pair *ONLY* if that + * pair does nothing at all: the thread must not have fpu (so + * that we don't try to save the FPU state), and TS must + * be set (so that the clts/stts pair does nothing that is + * visible in the interrupted kernel thread). + */ +static inline bool interrupted_kernel_fpu_idle(void) +{ + return !__thread_has_fpu(current) && + (read_cr0() & X86_CR0_TS); +} + +/* + * Were we in user mode (or vm86 mode) when we were + * interrupted? + * + * Doing kernel_fpu_begin/end() is ok if we are running + * in an interrupt context from user mode - we'll just + * save the FPU state as required. + */ +static inline bool interrupted_user_mode(void) +{ + struct pt_regs *regs = get_irq_regs(); + return regs && user_mode_vm(regs); +} + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + * + * It's always ok in process context (ie "not interrupt") + * but it is sometimes ok even from an irq. + */ +bool irq_fpu_usable(void) +{ + return !in_interrupt() || + interrupted_user_mode() || + interrupted_kernel_fpu_idle(); +} +EXPORT_SYMBOL(irq_fpu_usable); + +void kernel_fpu_begin(void) +{ + struct task_struct *me = current; + + WARN_ON_ONCE(!irq_fpu_usable()); + preempt_disable(); + if (__thread_has_fpu(me)) { + __save_init_fpu(me); + __thread_clear_has_fpu(me); + /* We do 'stts()' in kernel_fpu_end() */ + } else { + percpu_write(fpu_owner_task, NULL); + clts(); + } +} +EXPORT_SYMBOL(kernel_fpu_begin); + +void kernel_fpu_end(void) +{ + stts(); + preempt_enable(); +} +EXPORT_SYMBOL(kernel_fpu_end); + +void unlazy_fpu(struct task_struct *tsk) +{ + preempt_disable(); + if (__thread_has_fpu(tsk)) { + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + } else + tsk->fpu_counter = 0; + preempt_enable(); +} +EXPORT_SYMBOL(unlazy_fpu); + #ifdef CONFIG_MATH_EMULATION # define HAVE_HWFP (boot_cpu_data.hard_math) #else @@ -44,7 +125,7 @@ EXPORT_SYMBOL_GPL(xstate_size); unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); static struct i387_fxsave_struct fx_scratch __cpuinitdata; -void __cpuinit mxcsr_feature_mask_init(void) +static void __cpuinit mxcsr_feature_mask_init(void) { unsigned long mask = 0; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 313fb5c..43e2b1cf 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -306,10 +306,10 @@ void __init native_init_IRQ(void) * us. (some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + i = FIRST_EXTERNAL_VECTOR; + for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ - if (!test_bit(i, used_vectors)) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); + set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } if (!acpi_ioapic && !of_ioapic) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index faba577..fdc37b3 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -67,8 +67,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { "ss", 4, offsetof(struct pt_regs, ss) }, { "ds", 4, offsetof(struct pt_regs, ds) }, { "es", 4, offsetof(struct pt_regs, es) }, - { "fs", 4, -1 }, - { "gs", 4, -1 }, #else { "ax", 8, offsetof(struct pt_regs, ax) }, { "bx", 8, offsetof(struct pt_regs, bx) }, @@ -90,7 +88,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { "flags", 4, offsetof(struct pt_regs, flags) }, { "cs", 4, offsetof(struct pt_regs, cs) }, { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, -1 }, + { "es", 4, -1 }, #endif + { "fs", 4, -1 }, + { "gs", 4, -1 }, }; int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fda91c3..87a0f86 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -86,6 +86,7 @@ #include <asm/microcode.h> #include <asm/processor.h> +#include <asm/cpu_device_id.h> MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); @@ -504,6 +505,20 @@ static struct notifier_block __refdata mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; +#ifdef MODULE +/* Autoload on Intel and AMD systems */ +static const struct x86_cpu_id microcode_id[] = { +#ifdef CONFIG_MICROCODE_INTEL + { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif +#ifdef CONFIG_MICROCODE_AMD + { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif + {} +}; +MODULE_DEVICE_TABLE(x86cpu, microcode_id); +#endif + static int __init microcode_init(void) { struct cpuinfo_x86 *c = &cpu_data(0); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 0d01a8e..2c39dcd 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -12,6 +12,7 @@ #include <linux/smp.h> #include <linux/cpumask.h> #include <linux/delay.h> +#include <linux/init.h> #include <asm/apic.h> #include <asm/nmi.h> @@ -20,35 +21,35 @@ #define FAILURE 1 #define TIMEOUT 2 -static int nmi_fail; +static int __initdata nmi_fail; /* check to see if NMI IPIs work on this machine */ -static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly; +static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata; -static int testcase_total; -static int testcase_successes; -static int expected_testcase_failures; -static int unexpected_testcase_failures; -static int unexpected_testcase_unknowns; +static int __initdata testcase_total; +static int __initdata testcase_successes; +static int __initdata expected_testcase_failures; +static int __initdata unexpected_testcase_failures; +static int __initdata unexpected_testcase_unknowns; -static int nmi_unk_cb(unsigned int val, struct pt_regs *regs) +static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) { unexpected_testcase_unknowns++; return NMI_HANDLED; } -static void init_nmi_testsuite(void) +static void __init init_nmi_testsuite(void) { /* trap all the unknown NMIs we may generate */ register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); } -static void cleanup_nmi_testsuite(void) +static void __init cleanup_nmi_testsuite(void) { unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); } -static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) +static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) { int cpu = raw_smp_processor_id(); @@ -58,7 +59,7 @@ static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) return NMI_DONE; } -static void test_nmi_ipi(struct cpumask *mask) +static void __init test_nmi_ipi(struct cpumask *mask) { unsigned long timeout; @@ -86,7 +87,7 @@ static void test_nmi_ipi(struct cpumask *mask) return; } -static void remote_ipi(void) +static void __init remote_ipi(void) { cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); @@ -94,19 +95,19 @@ static void remote_ipi(void) test_nmi_ipi(to_cpumask(nmi_ipi_mask)); } -static void local_ipi(void) +static void __init local_ipi(void) { cpumask_clear(to_cpumask(nmi_ipi_mask)); cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); test_nmi_ipi(to_cpumask(nmi_ipi_mask)); } -static void reset_nmi(void) +static void __init reset_nmi(void) { nmi_fail = 0; } -static void dotest(void (*testcase_fn)(void), int expected) +static void __init dotest(void (*testcase_fn)(void), int expected) { testcase_fn(); /* @@ -131,12 +132,12 @@ static void dotest(void (*testcase_fn)(void), int expected) reset_nmi(); } -static inline void print_testname(const char *testname) +static inline void __init print_testname(const char *testname) { printk("%12s:", testname); } -void nmi_selftest(void) +void __init nmi_selftest(void) { init_nmi_testsuite(); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ada2f99..9c57c02 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -26,6 +26,7 @@ #include <asm/bug.h> #include <asm/paravirt.h> +#include <asm/debugreg.h> #include <asm/desc.h> #include <asm/setup.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1c4d769..28e5e06 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -262,10 +262,11 @@ rootfs_initcall(pci_iommu_init); static __devinit void via_no_dac(struct pci_dev *dev) { - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + if (forbid_dac == 0) { dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); forbid_dac = 1; } } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, 8, via_no_dac); #endif diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 34e06e8..0bc72e2 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -12,6 +12,7 @@ #include <linux/pci.h> #include <linux/export.h> +#include <asm/probe_roms.h> #include <asm/pci-direct.h> #include <asm/e820.h> #include <asm/mmzone.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 44eefde..14baf78 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -21,6 +21,7 @@ #include <asm/idle.h> #include <asm/uaccess.h> #include <asm/i387.h> +#include <asm/fpu-internal.h> #include <asm/debugreg.h> struct kmem_cache *task_xstate_cachep; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 49888fe..9d7d484 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -45,6 +45,7 @@ #include <asm/ldt.h> #include <asm/processor.h> #include <asm/i387.h> +#include <asm/fpu-internal.h> #include <asm/desc.h> #ifdef CONFIG_MATH_EMULATION #include <asm/math_emu.h> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e34257c..292da13 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -43,6 +43,7 @@ #include <asm/system.h> #include <asm/processor.h> #include <asm/i387.h> +#include <asm/fpu-internal.h> #include <asm/mmu_context.h> #include <asm/prctl.h> #include <asm/desc.h> @@ -340,6 +341,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); + current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; percpu_write(old_rsp, new_sp); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 5026738..78f05e4 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -27,6 +27,7 @@ #include <asm/system.h> #include <asm/processor.h> #include <asm/i387.h> +#include <asm/fpu-internal.h> #include <asm/debugreg.h> #include <asm/ldt.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d7d5099..8863888 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -749,10 +749,16 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - EFI_LOADER_SIGNATURE, 4)) { + "EL32", 4)) { efi_enabled = 1; - efi_memblock_x86_reserve_range(); + efi_64bit = false; + } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL64", 4)) { + efi_enabled = 1; + efi_64bit = true; } + if (efi_enabled && efi_memblock_x86_reserve_range()) + efi_enabled = 0; #endif x86_init.oem.arch_setup(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 46a01bdc..25edcfc 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -24,6 +24,7 @@ #include <asm/processor.h> #include <asm/ucontext.h> #include <asm/i387.h> +#include <asm/fpu-internal.h> #include <asm/vdso.h> #include <asm/mce.h> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58f7816..e578a79 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -727,8 +727,6 @@ do_rest: * the targeted processor. */ - printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip); - atomic_set(&init_deasserted, 0); if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { @@ -778,9 +776,10 @@ do_rest: schedule(); } - if (cpumask_test_cpu(cpu, cpu_callin_mask)) + if (cpumask_test_cpu(cpu, cpu_callin_mask)) { + print_cpu_msr(&cpu_data(cpu)); pr_debug("CPU%d: has booted.\n", cpu); - else { + } else { boot_error = 1; if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) == 0xA5A5A5A5) @@ -834,7 +833,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || !physid_isset(apicid, phys_cpu_present_map) || - (!x2apic_mode && apicid >= 255)) { + !apic->apic_id_valid(apicid)) { printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 0514890..ef59642 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; + unsigned long addr = addr0, start_addr; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -223,25 +223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, mm->free_area_cache = mm->mmap_base; } +try_again: /* either no address requested or can't fit in requested address hole */ - addr = mm->free_area_cache; - - /* make sure it can fit in the remaining address space */ - if (addr > len) { - unsigned long tmp_addr = align_addr(addr - len, filp, - ALIGN_TOPDOWN); - - vma = find_vma(mm, tmp_addr); - if (!vma || tmp_addr + len <= vma->vm_start) - /* remember the address as a hint for next time */ - return mm->free_area_cache = tmp_addr; - } - - if (mm->mmap_base < len) - goto bottomup; + start_addr = addr = mm->free_area_cache; - addr = mm->mmap_base-len; + if (addr < len) + goto fail; + addr -= len; do { addr = align_addr(addr, filp, ALIGN_TOPDOWN); @@ -263,6 +252,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, addr = vma->vm_start-len; } while (len < vma->vm_start); +fail: + /* + * if hint left us with no space for the requested + * mapping then try again: + */ + if (start_addr != mm->mmap_base) { + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = 0; + goto try_again; + } + bottomup: /* * A failed mmap() very likely causes application failure, diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4bbe04d..ec61d4c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -54,6 +54,7 @@ #include <asm/traps.h> #include <asm/desc.h> #include <asm/i387.h> +#include <asm/fpu-internal.h> #include <asm/mce.h> #include <asm/mach_traps.h> diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index b466cab..328cb37 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) spinlock_t *ptl; int i; + down_write(&mm->mmap_sem); pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; @@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) } pte_unmap_unlock(pte, ptl); out: + up_write(&mm->mmap_sem); flush_tlb(); } diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 7110911..e62728e 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -6,6 +6,7 @@ #include <linux/bootmem.h> #include <linux/compat.h> #include <asm/i387.h> +#include <asm/fpu-internal.h> #ifdef CONFIG_IA32_EMULATION #include <asm/sigcontext32.h> #endif |