diff options
Diffstat (limited to 'arch/x86/kernel')
56 files changed, 1106 insertions, 584 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e77b220..0925676 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o obj-$(CONFIG_X86_MRST) += mrst.o microcode-y := microcode_core.o diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index 580b4e2..28595d6 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -104,7 +104,7 @@ _start: movl %eax, %ecx orl %edx, %ecx jz 1f - movl $0xc0000080, %ecx + movl $MSR_EFER, %ecx wrmsr 1: diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index fcc3c61f..33cec15 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -2,7 +2,7 @@ * sleep.c - x86-specific ACPI sleep support. * * Copyright (C) 2001-2003 Patrick Mochel - * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz> */ #include <linux/acpi.h> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7023773..f65ab8b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 *instr = a->instr; BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); + BUG_ON(a->cpuid >= NCAPINTS*32); if (!boot_cpu_has(a->cpuid)) continue; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0d20286..fa044e1 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2572,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, static int amd_iommu_domain_has_cap(struct iommu_domain *domain, unsigned long cap) { + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + return 1; + } + return 0; } @@ -2609,8 +2614,7 @@ int __init amd_iommu_init_passthrough(void) pt_domain->mode |= PAGE_MODE_NONE; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - + for_each_pci_dev(dev) { if (!check_device(&dev->dev)) continue; diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index a353475..8dd7780 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -43,10 +43,11 @@ #include <asm/fixmap.h> #include <asm/apb_timer.h> +#include <asm/mrst.h> #define APBT_MASK CLOCKSOURCE_MASK(32) #define APBT_SHIFT 22 -#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKEVENT_RATING 110 #define APBT_CLOCKSOURCE_RATING 250 #define APBT_MIN_DELTA_USEC 200 @@ -83,8 +84,6 @@ struct apbt_dev { char name[10]; }; -int disable_apbt_percpu __cpuinitdata; - static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); #ifdef CONFIG_SMP @@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = { }; /* - * if user does not want to use per CPU apb timer, just give it a lower rating - * than local apic timer and skip the late per cpu timer init. - */ -static inline int __init setup_x86_mrst_timer(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - disable_apbt_percpu = 0; - else if (strcmp("lapic_and_apbt", arg) == 0) - disable_apbt_percpu = 1; - else { - pr_warning("X86 MRST timer option %s not recognised" - " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; -} -__setup("x86_mrst_timer=", setup_x86_mrst_timer); - -/* * start count down from 0xffff_ffff. this is done by toggling the enable bit * then load initial load count to ~0. */ @@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void) adev->num = smp_processor_id(); memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); - if (disable_apbt_percpu) { + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; global_clock_event = &adev->evt; printk(KERN_DEBUG "%s clockevent registered as global\n", @@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n, static __init int apbt_late_init(void) { - if (disable_apbt_percpu || !apb_timer_block_enabled) + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT || + !apb_timer_block_enabled) return 0; /* This notifier should be called after workqueue is ready */ hotcpu_notifier(apbt_cpuhp_notify, -20); @@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode, int timer_num; struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + BUG_ON(!apbt_virt_address); + timer_num = adev->num; pr_debug("%s CPU %d timer %d mode=%d\n", __func__, first_cpu(*evt->cpumask), timer_num, mode); @@ -676,7 +655,7 @@ void __init apbt_time_init(void) } #ifdef CONFIG_SMP /* kernel cmdline disable apb timer, so we will use lapic timers */ - if (disable_apbt_percpu) { + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { printk(KERN_INFO "apbt: disabled per cpu timer\n"); return; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b5d8b0b..a2e0caf 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void) * or BIOS forget to put that in reserved. * try to update e820 to make that region as reserved. */ - u32 agp_aper_base = 0, agp_aper_order = 0; + u32 agp_aper_order = 0; int i, fix, slot, valid_agp = 0; u32 ctl; u32 aper_size = 0, aper_order = 0, last_aper_order = 0; @@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void) return; /* This is mostly duplicate of iommu_hole_init */ - agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + search_agp_bridge(&agp_aper_order, &valid_agp); fix = 0; for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a96489e..980508c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -460,7 +460,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask) } /* - * Setup the local APIC timer for this CPU. Copy the initilized values + * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. */ static void __cpuinit setup_APIC_timer(void) diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 425e53a..8593582 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -129,7 +129,6 @@ int es7000_plat; * GSI override for ES7000 platforms. */ -static unsigned int base; static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e41ed24..4dc0084 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3397,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) cfg = desc->chip_data; - read_msi_msg_desc(desc, &msg); + get_cached_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index c4f9182..4c9c67b 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -140,7 +140,7 @@ * is now the way life works). * Fix thinko in suspend() (wrong return). * Notify drivers on critical suspend. - * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz> + * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz> * modified by sfr). * Disable interrupts while we are suspended (Andy Henroid * <andy_henroid@yahoo.com> fixed by sfr). diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3a785da..3f0ebe4 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,11 +12,11 @@ endif nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) -obj-y := intel_cacheinfo.o addon_cpuid_features.o +obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o -obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o +obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o obj-$(CONFIG_CPU_SUP_INTEL) += intel.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e485825..60a57b1 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } - if (c->x86 == 0x10 || c->x86 == 0x11) + if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* get apicid instead of initial apic id from cpuid */ @@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) num_cache_leaves = 3; } - if (c->x86 >= 0xf && c->x86 <= 0x11) + if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has_xmm2) { @@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) fam10h_check_enable_mmcfg(); } - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { + if (c == &boot_cpu_data && c->x86 >= 0xf) { unsigned long long tseg; /* @@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { }; cpu_dev_register(amd_cpu_dev); + +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const + * int[] in arch/x86/include/asm/processor.h. + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); +EXPORT_SYMBOL_GPL(amd_erratum_400); + +const int amd_erratum_383[] = + AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); +EXPORT_SYMBOL_GPL(amd_erratum_383); + +bool cpu_has_amd_erratum(const int *erratum) +{ + struct cpuinfo_x86 *cpu = ¤t_cpu_data; + int osvw_id = *erratum++; + u32 range; + u32 ms; + + /* + * If called early enough that current_cpu_data hasn't been initialized + * yet, fall back to boot_cpu_data. + */ + if (cpu->x86 == 0) + cpu = &boot_cpu_data; + + if (cpu->x86_vendor != X86_VENDOR_AMD) + return false; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 8) | cpu->x86_mask; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} + +EXPORT_SYMBOL_GPL(cpu_has_amd_erratum); diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c deleted file mode 100644 index 2056ccf..0000000 --- a/arch/x86/kernel/cpu/cmpxchg.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * cmpxchg*() fallbacks for CPU not supporting these instructions - */ - -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/module.h> - -#ifndef CONFIG_X86_CMPXCHG -unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) -{ - u8 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u8 *)ptr; - if (prev == old) - *(u8 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u8); - -unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) -{ - u16 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u16 *)ptr; - if (prev == old) - *(u16 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u16); - -unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) -{ - u32 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u32 *)ptr; - if (prev == old) - *(u32 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u32); -#endif - -#ifndef CONFIG_X86_CMPXCHG64 -unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) -{ - u64 prev; - unsigned long flags; - - /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u64 *)ptr; - if (prev == old) - *(u64 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_486_u64); -#endif - diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 68e4a6f..490dac6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int __init x86_xsave_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); return 1; } __setup("noxsave", x86_xsave_setup); +static int __init x86_xsaveopt_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; +} +__setup("noxsaveopt", x86_xsaveopt_setup); + #ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; @@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[4] = excap; } + /* Additional Intel-defined flags: level 0x00000007 */ + if (c->cpuid_level >= 0x00000007) { + u32 eax, ebx, ecx, edx; + + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + + if (eax > 0) + c->x86_capability[9] = ebx; + } + /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; @@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); + init_scattered_cpuid_features(c); } static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - init_scattered_cpuid_features(c); detect_nopl(c); } @@ -1192,6 +1210,7 @@ void __cpuinit cpu_init(void) dbg_restore_debug_regs(); fpu_init(); + xsave_init(); raw_local_save_flags(kernel_eflags); @@ -1252,12 +1271,7 @@ void __cpuinit cpu_init(void) clear_used_math(); mxcsr_feature_mask_init(); - /* - * Boot processor to setup the FP and extended state context info. - */ - if (smp_processor_id() == boot_cpu_id) - init_thread_xstate(); - + fpu_init(); xsave_init(); } #endif diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index cee5263..246cd3a 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -348,7 +348,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, freqs.old = perf->states[perf->state].core_frequency * 1000; freqs.new = data->freq_table[next_state].frequency; - for_each_cpu(i, cmd.mask) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -364,7 +364,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - for_each_cpu(i, cmd.mask) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 16e3483..32974cf 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -169,12 +169,9 @@ static int gx_freq_mult[16] = { * Low Level chipset interface * ****************************************************************/ static struct pci_device_id gx_chipset_tbl[] __initdata = { - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, - PCI_ANY_ID, PCI_ANY_ID }, - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, - PCI_ANY_ID, PCI_ANY_ID }, - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, - PCI_ANY_ID, PCI_ANY_ID }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, { 0, }, }; @@ -199,7 +196,7 @@ static __init struct pci_dev *gx_detect_chipset(void) } /* detect which companion chip is used */ - while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { + for_each_pci_dev(gx_pci) { if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) return gx_pci; } diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 7e7eea4..03162da 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -426,7 +426,7 @@ static int guess_fsb(int mult) } -static int __init longhaul_get_ranges(void) +static int __cpuinit longhaul_get_ranges(void) { unsigned int i, j, k = 0; unsigned int ratio; @@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void) } -static void __init longhaul_setup_voltagescaling(void) +static void __cpuinit longhaul_setup_voltagescaling(void) { union msr_longhaul longhaul; struct mV_pos minvid, maxvid, vid; @@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void) return 0; } -static int __init longhaul_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) { struct cpuinfo_x86 *c = &cpu_data(0); char *cpuname = NULL; diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h index e2360a4..cbf48fb 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.h +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h @@ -56,7 +56,7 @@ union msr_longhaul { /* * VIA C3 Samuel 1 & Samuel 2 (stepping 0) */ -static const int __initdata samuel1_mults[16] = { +static const int __cpuinitdata samuel1_mults[16] = { -1, /* 0000 -> RESERVED */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = { -1, /* 1111 -> RESERVED */ }; -static const int __initdata samuel1_eblcr[16] = { +static const int __cpuinitdata samuel1_eblcr[16] = { 50, /* 0000 -> RESERVED */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = { /* * VIA C3 Samuel2 Stepping 1->15 */ -static const int __initdata samuel2_eblcr[16] = { +static const int __cpuinitdata samuel2_eblcr[16] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = { /* * VIA C3 Ezra */ -static const int __initdata ezra_mults[16] = { +static const int __cpuinitdata ezra_mults[16] = { 100, /* 0000 -> 10.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = { 120, /* 1111 -> 12.0x */ }; -static const int __initdata ezra_eblcr[16] = { +static const int __cpuinitdata ezra_eblcr[16] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = { /* * VIA C3 (Ezra-T) [C5M]. */ -static const int __initdata ezrat_mults[32] = { +static const int __cpuinitdata ezrat_mults[32] = { 100, /* 0000 -> 10.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = { -1, /* 1111 -> RESERVED (12.0x) */ }; -static const int __initdata ezrat_eblcr[32] = { +static const int __cpuinitdata ezrat_eblcr[32] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = { /* * VIA C3 Nehemiah */ -static const int __initdata nehemiah_mults[32] = { +static const int __cpuinitdata nehemiah_mults[32] = { 100, /* 0000 -> 10.0x */ -1, /* 0001 -> 16.0x */ 40, /* 0010 -> 4.0x */ @@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = { -1, /* 1111 -> 12.0x */ }; -static const int __initdata nehemiah_eblcr[32] = { +static const int __cpuinitdata nehemiah_eblcr[32] = { 50, /* 0000 -> 5.0x */ 160, /* 0001 -> 16.0x */ 40, /* 0010 -> 4.0x */ @@ -315,7 +315,7 @@ struct mV_pos { unsigned short pos; }; -static const struct mV_pos __initdata vrm85_mV[32] = { +static const struct mV_pos __cpuinitdata vrm85_mV[32] = { {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, @@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = { {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} }; -static const unsigned char __initdata mV_vrm85[32] = { +static const unsigned char __cpuinitdata mV_vrm85[32] = { 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 }; -static const struct mV_pos __initdata mobilevrm_mV[32] = { +static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, @@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = { {675, 3}, {650, 2}, {625, 1}, {600, 0} }; -static const unsigned char __initdata mV_mobilevrm[32] = { +static const unsigned char __cpuinitdata mV_mobilevrm[32] = { 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index e7b559d..fc09f14 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu) * TMTA rules: * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) */ -static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, - unsigned int *high_freq) +static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, + unsigned int *high_freq) { u32 msr_lo, msr_hi; u32 save_lo, save_hi; @@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, } -static int __init longrun_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) { int result = 0; diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 7b8a8ba..bd1cac7 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) } } - if (c->x86 != 0xF) { - if (!cpu_has(c, X86_FEATURE_EST)) - printk(KERN_WARNING PFX "Unknown CPU. " - "Please send an e-mail to " - "<cpufreq@vger.kernel.org>\n"); + if (c->x86 != 0xF) return 0; - } /* on P-4s, the TSC runs with constant frequency independent whether * throttling is active or not. */ diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index a36de5b..994230d 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -110,7 +110,7 @@ struct pcc_cpu { u32 output_offset; }; -static struct pcc_cpu *pcc_cpu_info; +static struct pcc_cpu __percpu *pcc_cpu_info; static int pcc_cpufreq_verify(struct cpufreq_policy *policy) { diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 9a97116..4a45fd6 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy) * We will then get the same kind of behaviour already tested under * the "well-known" other OS. */ -static int __init fixup_sgtc(void) +static int __cpuinit fixup_sgtc(void) { unsigned int sgtc; unsigned int m; @@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu) } -static int __init acer_cpufreq_pst(const struct dmi_system_id *d) +static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) { printk(KERN_WARNING PFX "%s laptop with broken PST tables in BIOS detected.\n", @@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d) * A BIOS update is all that can save them. * Mention this, and disable cpufreq. */ -static struct dmi_system_id __initdata powernow_dmi_table[] = { +static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { { .callback = acer_cpufreq_pst, .ident = "Acer Aspire", @@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = { { } }; -static int __init powernow_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) { union msr_fidvidstatus fidvidstatus; int result; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 3e90cce..491977b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -9,7 +9,7 @@ * Based on the powernow-k7.c module written by Dave Jones. * (C) 2003 Dave Jones on behalf of SuSE Labs * (C) 2004 Dominik Brodowski <linux@brodo.de> - * (C) 2004 Pavel Machek <pavel@suse.cz> + * (C) 2004 Pavel Machek <pavel@ucw.cz> * Licensed under the terms of the GNU GPL License version 2. * Based upon datasheets & sample CPUs kindly provided by AMD. * @@ -806,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data) * www.amd.com */ printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); + printk(KERN_ERR PFX "Make sure that your BIOS is up to date" + " and Cool'N'Quiet support is enabled in BIOS setup\n"); return -ENODEV; } @@ -910,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, { int i; u32 hi = 0, lo = 0; - rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); - data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; + rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); + data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; for (i = 0; i < data->acpi_data.state_count; i++) { u32 index; diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index dd531cc..8095f86 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,6 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { &x86_hyper_vmware, &x86_hyper_ms_hyperv, +#ifdef CONFIG_XEN_PVHVM + &x86_hyper_xen_hvm, +#endif }; const struct hypervisor_x86 *x86_hyper; diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 33eae20..898c2f4 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) return l3; } -static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, + int index) { int node; @@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) this_leaf->l3 = l3_caches[node]; } +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. + */ +int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, unsigned int slot) { - struct pci_dev *dev = this_leaf->l3->dev; - unsigned int reg = 0; + int index; if (!this_leaf->l3 || !this_leaf->l3->can_disable) return -EINVAL; - if (!dev) - return -EINVAL; + index = amd_get_l3_disable_slot(this_leaf->l3, slot); + if (index >= 0) + return sprintf(buf, "%d\n", index); - pci_read_config_dword(dev, 0x1BC + slot * 4, ®); - return sprintf(buf, "0x%08x\n", reg); + return sprintf(buf, "FREE\n"); } #define SHOW_CACHE_DISABLE(slot) \ @@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, } } - -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, - unsigned int slot) +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, + unsigned long index) { - struct pci_dev *dev = this_leaf->l3->dev; - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - unsigned long val = 0; + int ret = 0; #define SUBCACHE_MASK (3UL << 20) #define SUBCACHE_INDEX 0xfff - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + /* + * check whether this slot is already used or + * the index is already disabled + */ + ret = amd_get_l3_disable_slot(l3, slot); + if (ret >= 0) return -EINVAL; + /* + * check whether the other slot has disabled the + * same index already + */ + if (index == amd_get_l3_disable_slot(l3, !slot)) + return -EINVAL; + + /* do not allow writes outside of allowed bits */ + if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((index & SUBCACHE_INDEX) > l3->indices)) + return -EINVAL; + + amd_l3_disable_index(l3, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, + unsigned int slot) +{ + unsigned long val = 0; + int cpu, err = 0; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!dev) + if (!this_leaf->l3 || !this_leaf->l3->can_disable) return -EINVAL; - if (strict_strtoul(buf, 10, &val) < 0) - return -EINVAL; + cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - /* do not allow writes outside of allowed bits */ - if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((val & SUBCACHE_INDEX) > this_leaf->l3->indices)) + if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - amd_l3_disable_index(this_leaf->l3, cpu, slot, val); - + err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); + if (err) { + if (err == -EEXIST) + printk(KERN_WARNING "L3 disable slot %d in use!\n", + slot); + return err; + } return count; } @@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, #else /* CONFIG_CPU_SUP_AMD */ static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) { }; #endif /* CONFIG_CPU_SUP_AMD */ @@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { amd_cpuid4(index, &eax, &ebx, &ecx); - amd_check_l3_disable(index, this_leaf); + amd_check_l3_disable(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc425..ed41562 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -51,7 +51,7 @@ static DEFINE_MUTEX(mce_read_mutex); #define rcu_dereference_check_mce(p) \ - rcu_dereference_check((p), \ + rcu_dereference_index_check((p), \ rcu_read_lock_sched_held() || \ lockdep_is_held(&mce_read_mutex)) @@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); static int default_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { - pr_emerg("No human readable MCE decoding support on this CPU type.\n"); - pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); + pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); + pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); return NOTIFY_STOP; } @@ -211,11 +211,11 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { - pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); if (m->ip) { - pr_emerg("RIP%s %02x:<%016Lx> ", + pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", m->cs, m->ip); @@ -224,14 +224,14 @@ static void print_mce(struct mce *m) pr_cont("\n"); } - pr_emerg("TSC %llx ", m->tsc); + pr_emerg(HW_ERR "TSC %llx ", m->tsc); if (m->addr) pr_cont("ADDR %llx ", m->addr); if (m->misc) pr_cont("MISC %llx ", m->misc); pr_cont("\n"); - pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); /* @@ -241,16 +241,6 @@ static void print_mce(struct mce *m) atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); } -static void print_mce_head(void) -{ - pr_emerg("\nHARDWARE ERROR\n"); -} - -static void print_mce_tail(void) -{ - pr_emerg("This is not a software problem!\n"); -} - #define PANIC_TIMEOUT 5 /* 5 seconds */ static atomic_t mce_paniced; @@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp) if (atomic_inc_return(&mce_fake_paniced) > 1) return; } - print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { struct mce *m = &mcelog.entry[i]; @@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp) apei_err = apei_write_mce(final); } if (cpu_missing) - printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); - print_mce_tail(); + pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); if (exp) - printk(KERN_EMERG "Machine check: %s\n", exp); + pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mce_panic_timeout; panic(msg); } else - printk(KERN_EMERG "Fake kernel panic: %s\n", msg); + pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -600,6 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { mce_log(&m); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); add_taint(TAINT_MACHINE_CHECK); } @@ -1220,7 +1209,7 @@ int mce_notify_irq(void) schedule_work(&mce_trigger_work); if (__ratelimit(&ratelimit)) - printk(KERN_INFO "Machine check events logged\n"); + pr_info(HW_ERR "Machine check events logged\n"); return 1; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 62b48e4..6fcd093 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot) rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ - if (val & CMCI_EN) { + if (val & MCI_CTL2_CMCI_EN) { if (test_and_clear_bit(i, owned) && !boot) print_update("SHD", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; } - val |= CMCI_EN | CMCI_THRESHOLD; + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & CMCI_EN) { + if (val & MCI_CTL2_CMCI_EN) { if (!test_and_set_bit(i, owned) && !boot) print_update("CMCI", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); @@ -155,7 +156,7 @@ void cmci_clear(void) continue; /* Disable CMCI */ rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index e1a0a3b..c2a8b26 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -34,15 +34,25 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) +#define THERMAL_THROTTLING_EVENT 0 +#define POWER_LIMIT_EVENT 1 + /* - * Current thermal throttling state: + * Current thermal event state: */ -struct thermal_state { - bool is_throttled; - +struct _thermal_state { + bool new_event; + int event; u64 next_check; - unsigned long throttle_count; - unsigned long last_throttle_count; + unsigned long count; + unsigned long last_count; +}; + +struct thermal_state { + struct _thermal_state core_throttle; + struct _thermal_state core_power_limit; + struct _thermal_state package_throttle; + struct _thermal_state package_power_limit; }; static DEFINE_PER_CPU(struct thermal_state, thermal_state); @@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + static SYSDEV_ATTR(_name, 0444, \ + therm_throt_sysdev_show_##_name, \ + NULL) \ -#define define_therm_throt_sysdev_show_func(name) \ +#define define_therm_throt_sysdev_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##name( \ +static ssize_t therm_throt_sysdev_show_##event##_##name( \ struct sys_device *dev, \ struct sysdev_attribute *attr, \ char *buf) \ @@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \ ssize_t ret; \ \ preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) \ + if (cpu_online(cpu)) { \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).name); \ - else \ + per_cpu(thermal_state, cpu).event.name); \ + } else \ ret = 0; \ preempt_enable(); \ \ return ret; \ } -define_therm_throt_sysdev_show_func(throttle_count); -define_therm_throt_sysdev_one_ro(throttle_count); +define_therm_throt_sysdev_show_func(core_throttle, count); +define_therm_throt_sysdev_one_ro(core_throttle_count); + +define_therm_throt_sysdev_show_func(core_power_limit, count); +define_therm_throt_sysdev_one_ro(core_power_limit_count); + +define_therm_throt_sysdev_show_func(package_throttle, count); +define_therm_throt_sysdev_one_ro(package_throttle_count); + +define_therm_throt_sysdev_show_func(package_power_limit, count); +define_therm_throt_sysdev_one_ro(package_power_limit_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_throttle_count.attr, + &attr_core_throttle_count.attr, NULL }; -static struct attribute_group thermal_throttle_attr_group = { +static struct attribute_group thermal_attr_group = { .attrs = thermal_throttle_attrs, .name = "thermal_throttle" }; #endif /* CONFIG_SYSFS */ +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 + /*** * therm_throt_process - Process thermal throttling event from interrupt * @curr: Whether the condition is current or not (boolean), since the @@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -static int therm_throt_process(bool is_throttled) +static int therm_throt_process(bool new_event, int event, int level) { - struct thermal_state *state; - unsigned int this_cpu; - bool was_throttled; + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + bool old_event; u64 now; + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - this_cpu = smp_processor_id(); now = get_jiffies_64(); - state = &per_cpu(thermal_state, this_cpu); + if (level == CORE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->core_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->core_power_limit; + else + return 0; + } else if (level == PACKAGE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->package_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->package_power_limit; + else + return 0; + } else + return 0; - was_throttled = state->is_throttled; - state->is_throttled = is_throttled; + old_event = state->new_event; + state->new_event = new_event; - if (is_throttled) - state->throttle_count++; + if (new_event) + state->count++; if (time_before64(now, state->next_check) && - state->throttle_count != state->last_throttle_count) + state->count != state->last_count) return 0; state->next_check = now + CHECK_INTERVAL; - state->last_throttle_count = state->throttle_count; + state->last_count = state->count; /* if we just entered the thermal event */ - if (is_throttled) { - printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); + if (new_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); + else + printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); add_taint(TAINT_MACHINE_CHECK); return 1; } - if (was_throttled) { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); + if (old_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); + else + printk(KERN_INFO "CPU%d: %s power limit normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); return 1; } @@ -149,13 +204,32 @@ static int therm_throt_process(bool is_throttled) /* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) { - return sysfs_create_group(&sys_dev->kobj, - &thermal_throttle_attr_group); + int err; + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); + if (err) + return err; + + if (cpu_has(c, X86_FEATURE_PLN)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_core_power_limit_count.attr, + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PTS)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_throttle_count.attr, + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PLN)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_power_limit_count.attr, + thermal_attr_group.name); + + return err; } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); + sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); } /* Mutex protecting device creation against CPU hotplug: */ @@ -226,14 +300,50 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ +/* + * Set up the most two significant bit to notify mce log that this thermal + * event type. + * This is a temp solution. May be changed in the future with mce log + * infrasture. + */ +#define CORE_THROTTLED (0) +#define CORE_POWER_LIMIT ((__u64)1 << 62) +#define PACKAGE_THROTTLED ((__u64)2 << 62) +#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) + /* Thermal transition interrupt handler */ static void intel_thermal_interrupt(void) { __u64 msr_val; + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) - mce_log_therm_throt_event(msr_val); + + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + CORE_LEVEL) != 0) + mce_log_therm_throt_event(CORE_THROTTLED | msr_val); + + if (cpu_has(c, X86_FEATURE_PLN)) + if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + CORE_LEVEL) != 0) + mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); + + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + PACKAGE_LEVEL) != 0) + mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); + if (cpu_has(c, X86_FEATURE_PLN)) + if (therm_throt_process(msr_val & + PACKAGE_THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + PACKAGE_LEVEL) != 0) + mce_log_therm_throt_event(PACKAGE_POWER_LIMIT + | msr_val); + } } static void unexpected_thermal_interrupt(void) @@ -335,8 +445,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PLN)) + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); + if (cpu_has(c, X86_FEATURE_PLN)) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE + | PACKAGE_THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); + } smp_thermal_vector = intel_thermal_interrupt; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 16f41bb..d944bf6 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,6 +18,7 @@ #include <asm/mshyperv.h> struct ms_hyperv_info ms_hyperv; +EXPORT_SYMBOL_GPL(ms_hyperv); static bool __init ms_hyperv_platform(void) { diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 06130b5..c5f59d0 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i) unsigned long gran_base, chunk_base, lose_base; char gran_factor, chunk_factor, lose_factor; - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor); + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor); + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor); pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", result[i].bad ? "*BAD*" : " ", diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fd31a44..7d28d7d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, { unsigned int mask_lo, mask_hi, base_lo, base_hi; unsigned int tmp, hi; - int cpu; /* * get_mtrr doesn't need to update mtrr_state, also it could be called * from any cpu, so try to print it out directly. */ - cpu = get_cpu(); + get_cpu(); rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 79556bd..01c0f3e 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -35,6 +35,7 @@ #include <linux/types.h> /* FIXME: kvm_para.h needs this */ +#include <linux/stop_machine.h> #include <linux/kvm_para.h> #include <linux/uaccess.h> #include <linux/module.h> @@ -143,22 +144,28 @@ struct set_mtrr_data { mtrr_type smp_type; }; +static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work); + /** - * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. * @info: pointer to mtrr configuration data * * Returns nothing. */ -static void ipi_handler(void *info) +static int mtrr_work_handler(void *info) { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; unsigned long flags; + atomic_dec(&data->count); + while (!atomic_read(&data->gate)) + cpu_relax(); + local_irq_save(flags); atomic_dec(&data->count); - while (!atomic_read(&data->gate)) + while (atomic_read(&data->gate)) cpu_relax(); /* The master has cleared me to execute */ @@ -173,12 +180,13 @@ static void ipi_handler(void *info) } atomic_dec(&data->count); - while (atomic_read(&data->gate)) + while (!atomic_read(&data->gate)) cpu_relax(); atomic_dec(&data->count); local_irq_restore(flags); #endif + return 0; } static inline int types_compatible(mtrr_type type1, mtrr_type type2) @@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: * - * 1. Send IPI to do the following: + * 1. Queue work to do the following on all processors: * 2. Disable Interrupts * 3. Wait for all procs to do so * 4. Enter no-fill cache mode @@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * 15. Enable interrupts. * * What does that mean for us? Well, first we set data.count to the number - * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait - * until it hits 0 and proceed. We set the data.gate flag and reset data.count. - * Meanwhile, they are waiting for that flag to be set. Once it's set, each + * of CPUs. As each CPU announces that it started the rendezvous handler by + * decrementing the count, We reset data.count and set the data.gate flag + * allowing all the cpu's to proceed with the work. As each cpu disables + * interrupts, it'll decrement data.count once. We wait until it hits 0 and + * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they + * are waiting for that flag to be cleared. Once it's cleared, each * CPU goes through the transition of updating MTRRs. * The CPU vendors may each do it differently, * so we call mtrr_if->set() callback and let them take care of it. * When they're done, they again decrement data->count and wait for data.gate - * to be reset. + * to be set. * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag * Everyone then enables interrupts and we all continue on. * @@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ { struct set_mtrr_data data; unsigned long flags; + int cpu; + + preempt_disable(); data.smp_reg = reg; data.smp_base = base; @@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ atomic_set(&data.gate, 0); /* Start the ball rolling on other CPUs */ - if (smp_call_function(ipi_handler, &data, 0) != 0) - panic("mtrr: timed out waiting for other CPUs\n"); + for_each_online_cpu(cpu) { + struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu); + + if (cpu == smp_processor_id()) + continue; + + stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work); + } - local_irq_save(flags); while (atomic_read(&data.count)) cpu_relax(); @@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ smp_wmb(); atomic_set(&data.gate, 1); + local_irq_save(flags); + + while (atomic_read(&data.count)) + cpu_relax(); + + /* Ok, reset count and toggle gate */ + atomic_set(&data.count, num_booting_cpus() - 1); + smp_wmb(); + atomic_set(&data.gate, 0); + /* Do our MTRR business */ /* @@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate, 0); + atomic_set(&data.gate, 1); /* * Wait here for everyone to have seen the gate change @@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ cpu_relax(); local_irq_restore(flags); + preempt_enable(); } /** diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c new file mode 100644 index 0000000..34b4dad --- /dev/null +++ b/arch/x86/kernel/cpu/scattered.c @@ -0,0 +1,63 @@ +/* + * Routines to indentify additional cpu features that are scattered in + * cpuid space. + */ +#include <linux/cpu.h> + +#include <asm/pat.h> +#include <asm/processor.h> + +#include <asm/apic.h> + +struct cpuid_bit { + u16 feature; + u8 reg; + u8 bit; + u32 level; + u32 sub_leaf; +}; + +enum cpuid_regs { + CR_EAX = 0, + CR_ECX, + CR_EDX, + CR_EBX +}; + +void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) +{ + u32 max_level; + u32 regs[4]; + const struct cpuid_bit *cb; + + static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, + { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, + { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, + { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { 0, 0, 0, 0, 0 } + }; + + for (cb = cpuid_bits; cb->feature; cb++) { + + /* Verify that the level is valid */ + max_level = cpuid_eax(cb->level & 0xffff0000); + if (max_level < cb->level || + max_level > (cb->level | 0xffff)) + continue; + + cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], + ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); + + if (regs[cb->reg] & (1 << cb->bit)) + set_cpu_cap(c, cb->feature); + } +} diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c index 10fa568..4397e98 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/topology.c @@ -1,62 +1,14 @@ /* - * Routines to indentify additional cpu features that are scattered in - * cpuid space. + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. */ -#include <linux/cpu.h> +#include <linux/cpu.h> +#include <asm/apic.h> #include <asm/pat.h> #include <asm/processor.h> -#include <asm/apic.h> - -struct cpuid_bit { - u16 feature; - u8 reg; - u8 bit; - u32 level; -}; - -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX -}; - -void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) -{ - u32 max_level; - u32 regs[4]; - const struct cpuid_bit *cb; - - static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, - { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 }, - { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, - { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, - { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, - { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, - { 0, 0, 0, 0 } - }; - - for (cb = cpuid_bits; cb->feature; cb++) { - - /* Verify that the level is valid */ - max_level = cpuid_eax(cb->level & 0xffff0000); - if (max_level < cb->level || - max_level > (cb->level | 0xffff)) - continue; - - cpuid(cb->level, ®s[CR_EAX], ®s[CR_EBX], - ®s[CR_ECX], ®s[CR_EDX]); - - if (regs[cb->reg] & (1 << cb->bit)) - set_cpu_cap(c, cb->feature); - } -} - /* leaf 0xb SMT level */ #define SMT_LEVEL 0 diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index b9d1ff5..227b0448 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -51,7 +51,7 @@ static inline int __vmware_platform(void) static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz; + uint64_t tsc_hz, lpj; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); @@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void) printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", (unsigned long) tsc_hz / 1000, (unsigned long) tsc_hz % 1000); + + if (!preset_lpj) { + lpj = ((u64)tsc_hz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; + } + return tsc_hz; } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141..227d009 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -611,14 +611,14 @@ ldt_ss: * compensating for the offset by changing to the ESPFIX segment with * a base address that matches for the difference. */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) mov %esp, %edx /* load kernel esp */ mov PT_OLDESP(%esp), %eax /* load userspace esp */ mov %dx, %ax /* eax: new kernel esp */ sub %eax, %edx /* offset (low word is 0) */ - PER_CPU(gdt_page, %ebx) shr $16, %edx - mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ - mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ pushl $__ESPFIX_SS CFI_ADJUST_CFA_OFFSET 4 push %eax /* new kernel esp */ @@ -791,9 +791,8 @@ ptregs_clone: * normal stack and adjusts ESP with the matching offset. */ /* fixup the stack */ - PER_CPU(gdt_page, %ebx) - mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ - mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS @@ -914,7 +913,7 @@ ENTRY(simd_coprocessor_error) .balign 4 .long 661b .long 663f - .byte X86_FEATURE_XMM + .word X86_FEATURE_XMM .byte 662b-661b .byte 664f-663f .previous @@ -1166,6 +1165,9 @@ ENTRY(xen_failsafe_callback) .previous ENDPROC(xen_failsafe_callback) +BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, + xen_evtchn_do_upcall) + #endif /* CONFIG_XEN */ #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 4db7c4d..c5ea5cd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1065,6 +1065,7 @@ ENTRY(\sym) END(\sym) .endm +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) INTR_FRAME @@ -1076,10 +1077,9 @@ ENTRY(\sym) TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ - PER_CPU(init_tss, %r12) - subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) + subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) call \do_sym - addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) + addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) @@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback) CFI_ENDPROC END(xen_failsafe_callback) +apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ + xen_hvm_callback_vector xen_evtchn_do_upcall + #endif /* CONFIG_XEN */ /* diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index b2e2460..784360c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -20,7 +20,7 @@ static void __init i386_default_early_setup(void) { - /* Initilize 32bit specific setup functions */ + /* Initialize 32bit specific setup functions */ x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 37c3d4b..ff4c453 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -131,6 +131,12 @@ ENTRY(startup_32) movsl 1: +#ifdef CONFIG_OLPC_OPENFIRMWARE + /* save OFW's pgdir table for later use when calling into OFW */ + movl %cr3, %eax + movl %eax, pa(olpc_ofw_pgd) +#endif + #ifdef CONFIG_PARAVIRT /* This is can only trip for a broken bootloader... */ cmpw $0x207, pa(boot_params + BP_version) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3d1e6f1..239046b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -234,9 +234,8 @@ ENTRY(secondary_startup_64) * init data section till per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx - movq initial_gs(%rip),%rax - movq %rax,%rdx - shrq $32,%rdx + movl initial_gs(%rip),%eax + movl initial_gs+4(%rip),%edx wrmsr /* esi is pointer to real mode structure with interesting info. diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba390d7..351f9c0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -16,7 +16,6 @@ #include <asm/hpet.h> #define HPET_MASK CLOCKSOURCE_MASK(32) -#define HPET_SHIFT 22 /* FSEC = 10^-15 NSEC = 10^-9 */ @@ -583,7 +582,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) * scaled math multiplication factor for nanosecond to hpet tick * conversion. */ - hpet_freq = 1000000000000000ULL; + hpet_freq = FSEC_PER_SEC; do_div(hpet_freq, hpet_period); evt->mult = div_sc((unsigned long) hpet_freq, NSEC_PER_SEC, evt->shift); @@ -787,7 +786,6 @@ static struct clocksource clocksource_hpet = { .rating = 250, .read = read_hpet, .mask = HPET_MASK, - .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, #ifdef CONFIG_X86_64 @@ -798,6 +796,7 @@ static struct clocksource clocksource_hpet = { static int hpet_clocksource_register(void) { u64 start, now; + u64 hpet_freq; cycle_t t1; /* Start the counter */ @@ -832,9 +831,15 @@ static int hpet_clocksource_register(void) * mult = (hpet_period * 2^shift)/10^6 * mult = (hpet_period << shift)/FSEC_PER_NSEC */ - clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); - clocksource_register(&clocksource_hpet); + /* Need to convert hpet_period (fsec/cyc) to cyc/sec: + * + * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) + * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period + */ + hpet_freq = FSEC_PER_SEC; + do_div(hpet_freq, hpet_period); + clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); return 0; } diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 86cef6b..1f11f5c 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void) stts(); } -void __cpuinit init_thread_xstate(void) +static void __cpuinit init_thread_xstate(void) { + /* + * Note that xstate_size might be overwriten later during + * xsave_init(). + */ + if (!HAVE_HWFP) { xstate_size = sizeof(struct i387_soft_struct); return; } - if (cpu_has_xsave) { - xsave_cntxt_init(); - return; - } - if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); #ifdef CONFIG_X86_32 @@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void) * Called at bootup to set up the initial FPU state that is later cloned * into all processes. */ + void __cpuinit fpu_init(void) { unsigned long oldcr0 = read_cr0(); @@ -93,21 +94,26 @@ void __cpuinit fpu_init(void) write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ - /* - * Boot processor to setup the FP and extended state context info. - */ if (!smp_processor_id()) init_thread_xstate(); - xsave_init(); mxcsr_feature_mask_init(); /* clean state in init */ current_thread_info()->status = 0; clear_used_math(); } -#endif /* CONFIG_X86_64 */ -static void fpu_finit(struct fpu *fpu) +#else /* CONFIG_X86_64 */ + +void __cpuinit fpu_init(void) +{ + if (!smp_processor_id()) + init_thread_xstate(); +} + +#endif /* CONFIG_X86_32 */ + +void fpu_finit(struct fpu *fpu) { #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { @@ -132,6 +138,7 @@ static void fpu_finit(struct fpu *fpu) fp->fos = 0xffff0000u; } } +EXPORT_SYMBOL_GPL(fpu_finit); /* * The _current_ task is using the FPU for the first time @@ -190,6 +197,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.fpu.state->fxsave, 0, -1); } @@ -207,6 +216,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.fpu.state->fxsave, 0, -1); @@ -446,6 +457,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, -1); } + sanitize_i387_state(target); + if (kbuf && pos == 0 && count == sizeof(env)) { convert_from_fxsr(kbuf, target); return 0; @@ -467,6 +480,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); @@ -533,6 +548,9 @@ static int save_i387_xsave(void __user *buf) struct _fpstate_ia32 __user *fx = buf; int err = 0; + + sanitize_i387_state(tsk); + /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context. diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 01ab17a..ef10940 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -49,55 +49,94 @@ #include <asm/system.h> #include <asm/apic.h> -/** - * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs - * @gdb_regs: A pointer to hold the registers in the order GDB wants. - * @regs: The &struct pt_regs of the current process. - * - * Convert the pt_regs in @regs into the format for registers that - * GDB expects, stored in @gdb_regs. - */ -void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { -#ifndef CONFIG_X86_32 - u32 *gdb_regs32 = (u32 *)gdb_regs; +#ifdef CONFIG_X86_32 + { "ax", 4, offsetof(struct pt_regs, ax) }, + { "cx", 4, offsetof(struct pt_regs, cx) }, + { "dx", 4, offsetof(struct pt_regs, dx) }, + { "bx", 4, offsetof(struct pt_regs, bx) }, + { "sp", 4, offsetof(struct pt_regs, sp) }, + { "bp", 4, offsetof(struct pt_regs, bp) }, + { "si", 4, offsetof(struct pt_regs, si) }, + { "di", 4, offsetof(struct pt_regs, di) }, + { "ip", 4, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, offsetof(struct pt_regs, ds) }, + { "es", 4, offsetof(struct pt_regs, es) }, + { "fs", 4, -1 }, + { "gs", 4, -1 }, +#else + { "ax", 8, offsetof(struct pt_regs, ax) }, + { "bx", 8, offsetof(struct pt_regs, bx) }, + { "cx", 8, offsetof(struct pt_regs, cx) }, + { "dx", 8, offsetof(struct pt_regs, dx) }, + { "si", 8, offsetof(struct pt_regs, dx) }, + { "di", 8, offsetof(struct pt_regs, di) }, + { "bp", 8, offsetof(struct pt_regs, bp) }, + { "sp", 8, offsetof(struct pt_regs, sp) }, + { "r8", 8, offsetof(struct pt_regs, r8) }, + { "r9", 8, offsetof(struct pt_regs, r9) }, + { "r10", 8, offsetof(struct pt_regs, r10) }, + { "r11", 8, offsetof(struct pt_regs, r11) }, + { "r12", 8, offsetof(struct pt_regs, r12) }, + { "r13", 8, offsetof(struct pt_regs, r13) }, + { "r14", 8, offsetof(struct pt_regs, r14) }, + { "r15", 8, offsetof(struct pt_regs, r15) }, + { "ip", 8, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, #endif - gdb_regs[GDB_AX] = regs->ax; - gdb_regs[GDB_BX] = regs->bx; - gdb_regs[GDB_CX] = regs->cx; - gdb_regs[GDB_DX] = regs->dx; - gdb_regs[GDB_SI] = regs->si; - gdb_regs[GDB_DI] = regs->di; - gdb_regs[GDB_BP] = regs->bp; - gdb_regs[GDB_PC] = regs->ip; +}; + +int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) +{ + if ( #ifdef CONFIG_X86_32 - gdb_regs[GDB_PS] = regs->flags; - gdb_regs[GDB_DS] = regs->ds; - gdb_regs[GDB_ES] = regs->es; - gdb_regs[GDB_CS] = regs->cs; - gdb_regs[GDB_FS] = 0xFFFF; - gdb_regs[GDB_GS] = 0xFFFF; - if (user_mode_vm(regs)) { - gdb_regs[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = regs->sp; - } else { - gdb_regs[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + regno == GDB_SS || regno == GDB_FS || regno == GDB_GS || +#endif + regno == GDB_SP || regno == GDB_ORIG_AX) + return 0; + + if (dbg_reg_def[regno].offset != -1) + memcpy((void *)regs + dbg_reg_def[regno].offset, mem, + dbg_reg_def[regno].size); + return 0; +} + +char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) +{ + if (regno == GDB_ORIG_AX) { + memcpy(mem, ®s->orig_ax, sizeof(regs->orig_ax)); + return "orig_ax"; } -#else - gdb_regs[GDB_R8] = regs->r8; - gdb_regs[GDB_R9] = regs->r9; - gdb_regs[GDB_R10] = regs->r10; - gdb_regs[GDB_R11] = regs->r11; - gdb_regs[GDB_R12] = regs->r12; - gdb_regs[GDB_R13] = regs->r13; - gdb_regs[GDB_R14] = regs->r14; - gdb_regs[GDB_R15] = regs->r15; - gdb_regs32[GDB_PS] = regs->flags; - gdb_regs32[GDB_CS] = regs->cs; - gdb_regs32[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + if (regno >= DBG_MAX_REG_NUM || regno < 0) + return NULL; + + if (dbg_reg_def[regno].offset != -1) + memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, + dbg_reg_def[regno].size); + + switch (regno) { +#ifdef CONFIG_X86_32 + case GDB_SS: + if (!user_mode_vm(regs)) + *(unsigned long *)mem = __KERNEL_DS; + break; + case GDB_SP: + if (!user_mode_vm(regs)) + *(unsigned long *)mem = kernel_stack_pointer(regs); + break; + case GDB_GS: + case GDB_FS: + *(unsigned long *)mem = 0xFFFF; + break; #endif + } + return dbg_reg_def[regno].name; } /** @@ -150,54 +189,13 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_SP] = p->thread.sp; } -/** - * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs. - * @gdb_regs: A pointer to hold the registers we've received from GDB. - * @regs: A pointer to a &struct pt_regs to hold these values in. - * - * Convert the GDB regs in @gdb_regs into the pt_regs, and store them - * in @regs. - */ -void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) -{ -#ifndef CONFIG_X86_32 - u32 *gdb_regs32 = (u32 *)gdb_regs; -#endif - regs->ax = gdb_regs[GDB_AX]; - regs->bx = gdb_regs[GDB_BX]; - regs->cx = gdb_regs[GDB_CX]; - regs->dx = gdb_regs[GDB_DX]; - regs->si = gdb_regs[GDB_SI]; - regs->di = gdb_regs[GDB_DI]; - regs->bp = gdb_regs[GDB_BP]; - regs->ip = gdb_regs[GDB_PC]; -#ifdef CONFIG_X86_32 - regs->flags = gdb_regs[GDB_PS]; - regs->ds = gdb_regs[GDB_DS]; - regs->es = gdb_regs[GDB_ES]; - regs->cs = gdb_regs[GDB_CS]; -#else - regs->r8 = gdb_regs[GDB_R8]; - regs->r9 = gdb_regs[GDB_R9]; - regs->r10 = gdb_regs[GDB_R10]; - regs->r11 = gdb_regs[GDB_R11]; - regs->r12 = gdb_regs[GDB_R12]; - regs->r13 = gdb_regs[GDB_R13]; - regs->r14 = gdb_regs[GDB_R14]; - regs->r15 = gdb_regs[GDB_R15]; - regs->flags = gdb_regs32[GDB_PS]; - regs->cs = gdb_regs32[GDB_CS]; - regs->ss = gdb_regs32[GDB_SS]; -#endif -} - static struct hw_breakpoint { unsigned enabled; unsigned long addr; int len; int type; struct perf_event **pev; -} breakinfo[4]; +} breakinfo[HBP_NUM]; static unsigned long early_dr7; @@ -205,7 +203,7 @@ static void kgdb_correct_hw_break(void) { int breakno; - for (breakno = 0; breakno < 4; breakno++) { + for (breakno = 0; breakno < HBP_NUM; breakno++) { struct perf_event *bp; struct arch_hw_breakpoint *info; int val; @@ -292,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { int i; - for (i = 0; i < 4; i++) + for (i = 0; i < HBP_NUM; i++) if (breakinfo[i].addr == addr && breakinfo[i].enabled) break; - if (i == 4) + if (i == HBP_NUM) return -1; if (hw_break_release_slot(i)) { @@ -313,7 +311,7 @@ static void kgdb_remove_all_hw_break(void) int cpu = raw_smp_processor_id(); struct perf_event *bp; - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; bp = *per_cpu_ptr(breakinfo[i].pev, cpu); @@ -333,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { int i; - for (i = 0; i < 4; i++) + for (i = 0; i < HBP_NUM; i++) if (!breakinfo[i].enabled) break; - if (i == 4) + if (i == HBP_NUM) return -1; switch (bptype) { @@ -397,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) /* Disable hardware debugging while we are in kgdb: */ set_debugreg(0UL, 7); - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; if (dbg_is_early) { @@ -458,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, { unsigned long addr; char *ptr; - int newPC; switch (remcomInBuffer[0]) { case 'c': @@ -469,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, linux_regs->ip = addr; case 'D': case 'k': - newPC = linux_regs->ip; - /* clear the trace bit */ linux_regs->flags &= ~X86_EFLAGS_TF; atomic_set(&kgdb_cpu_doing_single_step, -1); @@ -645,7 +640,7 @@ void kgdb_arch_late(void) attr.bp_len = HW_BREAKPOINT_LEN_1; attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (breakinfo[i].pev) continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 5915e0b3..79ae681 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -25,8 +25,34 @@ #include <asm/i8259.h> #include <asm/apb_timer.h> +/* + * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, + * cmdline option x86_mrst_timer can be used to override the configuration + * to prefer one or the other. + * at runtime, there are basically three timer configurations: + * 1. per cpu apbt clock only + * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only + * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. + * + * by default (without cmdline option), platform code first detects cpu type + * to see if we are on lincroft or penwell, then set up both lapic or apbt + * clocks accordingly. + * i.e. by default, medfield uses configuration #2, moorestown uses #1. + * config #3 is supported but not recommended on medfield. + * + * rating and feature summary: + * lapic (with C3STOP) --------- 100 + * apbt (always-on) ------------ 110 + * lapic (always-on,ARAT) ------ 150 + */ + +__cpuinitdata enum mrst_timer_options mrst_timer_options; + static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +enum mrst_cpu_type __mrst_cpu_chip; +EXPORT_SYMBOL_GPL(__mrst_cpu_chip); + int sfi_mtimer_num; struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; @@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) return 0; } -/* - * the secondary clock in Moorestown can be APBT or LAPIC clock, default to - * APBT but cmdline option can also override it. - */ -static void __cpuinit mrst_setup_secondary_clock(void) -{ - /* restore default lapic clock if disabled by cmdline */ - if (disable_apbt_percpu) - return setup_secondary_APIC_clock(); - apbt_setup_secondary_clock(); -} - static unsigned long __init mrst_calibrate_tsc(void) { unsigned long flags, fast_calibrate; @@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void) void __init mrst_time_init(void) { + switch (mrst_timer_options) { + case MRST_TIMER_APBT_ONLY: + break; + case MRST_TIMER_LAPIC_APBT: + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + break; + default: + if (!boot_cpu_has(X86_FEATURE_ARAT)) + break; + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + return; + } + /* we need at least one APB timer */ sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); pre_init_apic_IRQ0(); apbt_time_init(); @@ -205,16 +234,21 @@ void __init mrst_rtc_init(void) sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); } -/* - * if we use per cpu apb timer, the bootclock already setup. if we use lapic - * timer and one apbt timer for broadcast, we need to set up lapic boot clock. - */ -static void __init mrst_setup_boot_clock(void) +void __cpuinit mrst_arch_setup(void) { - pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); - if (disable_apbt_percpu) - setup_boot_APIC_clock(); -}; + if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) + __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; + else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + else { + pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + } + pr_debug("Moorestown CPU %s identified\n", + (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? + "Lincroft" : "Penwell"); +} /* MID systems don't have i8042 controller */ static int mrst_i8042_detect(void) @@ -232,11 +266,13 @@ void __init x86_mrst_early_setup(void) x86_init.resources.reserve_resources = x86_init_noop; x86_init.timers.timer_init = mrst_time_init; - x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; + x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_init.irqs.pre_vector_init = x86_init_noop; - x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; + x86_init.oem.arch_setup = mrst_arch_setup; + + x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; x86_platform.calibrate_tsc = mrst_calibrate_tsc; x86_platform.i8042_detect = mrst_i8042_detect; @@ -250,3 +286,26 @@ void __init x86_mrst_early_setup(void) x86_init.mpparse.get_smp_config = x86_init_uint_noop; } + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. + */ +static inline int __init setup_x86_mrst_timer(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + mrst_timer_options = MRST_TIMER_APBT_ONLY; + else if (strcmp("lapic_and_apbt", arg) == 0) + mrst_timer_options = MRST_TIMER_LAPIC_APBT; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; +} +__setup("x86_mrst_timer=", setup_x86_mrst_timer); diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 8297160..0e0cdde 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -21,10 +21,7 @@ #include <asm/geode.h> #include <asm/setup.h> #include <asm/olpc.h> - -#ifdef CONFIG_OPEN_FIRMWARE -#include <asm/ofw.h> -#endif +#include <asm/olpc_ofw.h> struct olpc_platform_t olpc_platform_info; EXPORT_SYMBOL_GPL(olpc_platform_info); @@ -145,7 +142,7 @@ restart: * The OBF flag will sometimes misbehave due to what we believe * is a hardware quirk.. */ - printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); + pr_devel("olpc-ec: running cmd 0x%x\n", cmd); outb(cmd, 0x6c); if (wait_on_ibf(0x6c, 0)) { @@ -162,8 +159,7 @@ restart: " EC accept data!\n"); goto err; } - printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", - inbuf[i]); + pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); outb(inbuf[i], 0x68); } } @@ -176,8 +172,7 @@ restart: goto restart; } outbuf[i] = inb(0x68); - printk(KERN_DEBUG "olpc-ec: received 0x%x\n", - outbuf[i]); + pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); } } @@ -188,14 +183,15 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -#ifdef CONFIG_OPEN_FIRMWARE +#ifdef CONFIG_OLPC_OPENFIRMWARE static void __init platform_detect(void) { size_t propsize; __be32 rev; + const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; + void *res[] = { &propsize }; - if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, - &propsize) || propsize != 4) { + if (olpc_ofw("getprop", args, res) || propsize != 4) { printk(KERN_ERR "ofw: getprop call failed!\n"); rev = cpu_to_be32(0); } diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c new file mode 100644 index 0000000..3218aa7 --- /dev/null +++ b/arch/x86/kernel/olpc_ofw.c @@ -0,0 +1,106 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <asm/page.h> +#include <asm/setup.h> +#include <asm/io.h> +#include <asm/pgtable.h> +#include <asm/olpc_ofw.h> + +/* address of OFW callback interface; will be NULL if OFW isn't found */ +static int (*olpc_ofw_cif)(int *); + +/* page dir entry containing OFW's pgdir table; filled in by head_32.S */ +u32 olpc_ofw_pgd __initdata; + +static DEFINE_SPINLOCK(ofw_lock); + +#define MAXARGS 10 + +void __init setup_olpc_ofw_pgd(void) +{ + pgd_t *base, *ofw_pde; + + if (!olpc_ofw_cif) + return; + + /* fetch OFW's PDE */ + base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); + if (!base) { + printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n"); + olpc_ofw_cif = NULL; + return; + } + ofw_pde = &base[OLPC_OFW_PDE_NR]; + + /* install OFW's PDE permanently into the kernel's pgtable */ + set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); + /* implicit optimization barrier here due to uninline function return */ + + early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); +} + +int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, + void **res) +{ + int ofw_args[MAXARGS + 3]; + unsigned long flags; + int ret, i, *p; + + BUG_ON(nr_args + nr_res > MAXARGS); + + if (!olpc_ofw_cif) + return -EIO; + + ofw_args[0] = (int)name; + ofw_args[1] = nr_args; + ofw_args[2] = nr_res; + + p = &ofw_args[3]; + for (i = 0; i < nr_args; i++, p++) + *p = (int)args[i]; + + /* call into ofw */ + spin_lock_irqsave(&ofw_lock, flags); + ret = olpc_ofw_cif(ofw_args); + spin_unlock_irqrestore(&ofw_lock, flags); + + if (!ret) { + for (i = 0; i < nr_res; i++, p++) + *((int *)res[i]) = *p; + } + + return ret; +} +EXPORT_SYMBOL_GPL(__olpc_ofw); + +/* OFW cif _should_ be above this address */ +#define OFW_MIN 0xff000000 + +/* OFW starts on a 1MB boundary */ +#define OFW_BOUND (1<<20) + +void __init olpc_ofw_detect(void) +{ + struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header; + unsigned long start; + + /* ensure OFW booted us by checking for "OFW " string */ + if (hdr->ofw_magic != OLPC_OFW_SIG) + return; + + olpc_ofw_cif = (int (*)(int *))hdr->cif_handler; + + if ((unsigned long)olpc_ofw_cif < OFW_MIN) { + printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n", + (unsigned long)olpc_ofw_cif); + olpc_ofw_cif = NULL; + return; + } + + /* determine where OFW starts in memory */ + start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND); + printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n", + (unsigned long)olpc_ofw_cif, (-start) >> 20); + reserve_top_address(-start); +} diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 4b7e3d8..9f07cfc 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -13,6 +13,7 @@ #include <asm/calgary.h> #include <asm/amd_iommu.h> #include <asm/x86_init.h> +#include <asm/xen/swiotlb-xen.h> static int forbid_dac __read_mostly; @@ -132,7 +133,7 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); - if (pci_swiotlb_detect()) + if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) goto out; gart_iommu_hole_init(); @@ -144,6 +145,8 @@ void __init pci_iommu_alloc(void) /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); out: + pci_xen_swiotlb_init(); + pci_swiotlb_init(); } @@ -296,7 +299,7 @@ static int __init pci_iommu_init(void) #endif x86_init.iommu.iommu_init(); - if (swiotlb) { + if (swiotlb || xen_swiotlb) { printk(KERN_INFO "PCI-DMA: " "Using software bounce buffering for IO (SWIOTLB)\n"); swiotlb_print_info(); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 787572d..d401f1d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,6 +28,7 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +EXPORT_SYMBOL_GPL(task_xstate_cachep); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { @@ -525,44 +526,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } -/* - * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. - * For more information see - * - Erratum #400 for NPT family 0xf and family 0x10 CPUs - * - Erratum #365 for family 0x11 (not affected because C1e not in use) - */ -static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) -{ - u64 val; - if (c->x86_vendor != X86_VENDOR_AMD) - goto no_c1e_idle; - - /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0F && c->x86_model >= 0x40) - return 1; - - if (c->x86 == 0x10) { - /* - * check OSVW bit for CPUs that are not affected - * by erratum #400 - */ - if (cpu_has(c, X86_FEATURE_OSVW)) { - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); - if (val >= 2) { - rdmsrl(MSR_AMD64_OSVW_STATUS, val); - if (!(val & BIT(1))) - goto no_c1e_idle; - } - } - return 1; - } - -no_c1e_idle: - return 0; -} +bool c1e_detected; +EXPORT_SYMBOL(c1e_detected); static cpumask_var_t c1e_mask; -static int c1e_detected; void c1e_remove_cpu(int cpu) { @@ -584,12 +551,12 @@ static void c1e_idle(void) u32 lo, hi; rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (lo & K8_INTP_C1E_ACTIVE_MASK) { - c1e_detected = 1; + c1e_detected = true; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); - set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } @@ -638,7 +605,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) */ printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; - } else if (check_c1e_idle(c)) { + } else if (cpu_has_amd_erratum(amd_erratum_400)) { + /* E400: APIC timer interrupt does not wake up CPU from C1e */ printk(KERN_INFO "using C1E aware idle routine\n"); pm_idle = c1e_idle; } else diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4ae4ac..b008e78 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -102,6 +102,7 @@ #include <asm/paravirt.h> #include <asm/hypervisor.h> +#include <asm/olpc_ofw.h> #include <asm/percpu.h> #include <asm/topology.h> @@ -736,10 +737,15 @@ void __init setup_arch(char **cmdline_p) /* VMI may relocate the fixmap; do this before touching ioremap area */ vmi_init(); + /* OFW also may relocate the fixmap */ + olpc_ofw_detect(); + early_trap_init(); early_cpu_init(); early_ioremap_init(); + setup_olpc_ofw_pgd(); + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; edid_info = boot_params.edid_info; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c4f33b2..a5e928b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) goto do_rest; } - if (!keventd_up() || current_is_keventd()) - c_idle.work.func(&c_idle.work); - else { - schedule_work(&c_idle.work); - wait_for_completion(&c_idle.done); - } + schedule_work(&c_idle.work); + wait_for_completion(&c_idle.done); if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); @@ -816,6 +812,13 @@ do_rest: if (cpumask_test_cpu(cpu, cpu_callin_mask)) break; /* It has booted */ udelay(100); + /* + * Allow other tasks to run while we wait for the + * AP to come online. This also gives a chance + * for the MTRR work(triggered by the AP coming online) + * to be completed in the stop machine context. + */ + schedule(); } if (cpumask_test_cpu(cpu, cpu_callin_mask)) diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b37293..b35786d 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -337,3 +337,6 @@ ENTRY(sys_call_table) .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open .long sys_recvmmsg + .long sys_fanotify_init + .long sys_fanotify_mark + .long sys_prlimit64 /* 340 */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9faf91a..ce8e502 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = { .read = read_tsc, .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), - .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, #ifdef CONFIG_X86_64 @@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void) static void __init init_tsc_clocksource(void) { - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ @@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } - clocksource_register(&clocksource_tsc); + clocksource_register_khz(&clocksource_tsc, tsc_khz); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S index 45b6f8a..56a8c2a 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu_64.S @@ -31,6 +31,7 @@ */ #include <asm/cpufeature.h> +#include <asm/msr-index.h> verify_cpu: pushfl # Save caller passed flags @@ -88,7 +89,7 @@ verify_cpu_sse_test: je verify_cpu_sse_ok test %di,%di jz verify_cpu_no_longmode # only try to force SSE on AMD - movl $0xc0010015,%ecx # HWCR + movl $MSR_K7_HWCR,%ecx rdmsr btr $15,%eax # enable SSE wrmsr diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1c0c6ab..dcbb28c 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -73,8 +73,8 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, - u32 mult) +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) { unsigned long flags; @@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - struct timeval tv; + unsigned seq; time_t result; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - vgettimeofday(&tv, NULL); - result = tv.tv_sec; + do { + seq = read_seqbegin(&__vsyscall_gtod_data.lock); + + result = __vsyscall_gtod_data.wall_time_sec; + + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + if (t) *t = result; return result; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 37e68fc..9c253bd 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -16,11 +16,88 @@ */ u64 pcntxt_mask; +/* + * Represents init state for the supported extended state. + */ +static struct xsave_struct *init_xstate_buf; + struct _fpx_sw_bytes fx_sw_reserved; #ifdef CONFIG_IA32_EMULATION struct _fpx_sw_bytes fx_sw_reserved_ia32; #endif +static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; + +/* + * If a processor implementation discern that a processor state component is + * in its initialized state it may modify the corresponding bit in the + * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory + * layout in the case of xsaveopt. While presenting the xstate information to + * the user, we always ensure that the memory layout of a feature will be in + * the init state if the corresponding header bit is zero. This is to ensure + * that the user doesn't see some stale state in the memory layout during + * signal handling, debugging etc. + */ +void __sanitize_i387_state(struct task_struct *tsk) +{ + u64 xstate_bv; + int feature_bit = 0x2; + struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; + + if (!fx) + return; + + BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); + + xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; + + /* + * None of the feature bits are in init state. So nothing else + * to do for us, as the memory layout is upto date. + */ + if ((xstate_bv & pcntxt_mask) == pcntxt_mask) + return; + + /* + * FP is in init state + */ + if (!(xstate_bv & XSTATE_FP)) { + fx->cwd = 0x37f; + fx->swd = 0; + fx->twd = 0; + fx->fop = 0; + fx->rip = 0; + fx->rdp = 0; + memset(&fx->st_space[0], 0, 128); + } + + /* + * SSE is in init state + */ + if (!(xstate_bv & XSTATE_SSE)) + memset(&fx->xmm_space[0], 0, 256); + + xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; + + /* + * Update all the other memory layouts for which the corresponding + * header bit is in the init state. + */ + while (xstate_bv) { + if (xstate_bv & 0x1) { + int offset = xstate_offsets[feature_bit]; + int size = xstate_sizes[feature_bit]; + + memcpy(((void *) fx) + offset, + ((void *) init_xstate_buf) + offset, + size); + } + + xstate_bv >>= 1; + feature_bit++; + } +} + /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. @@ -36,15 +113,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], sizeof(struct _fpx_sw_bytes)); - if (err) - return err; + return -EFAULT; /* * First Magic check failed. */ if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) - return -1; + return -EINVAL; /* * Check for error scenarios. @@ -52,19 +128,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, if (fx_sw_user->xstate_size < min_xstate_size || fx_sw_user->xstate_size > xstate_size || fx_sw_user->xstate_size > fx_sw_user->extended_size) - return -1; + return -EINVAL; err = __get_user(magic2, (__u32 *) (((void *)fpstate) + fx_sw_user->extended_size - FP_XSTATE_MAGIC2_SIZE)); + if (err) + return err; /* * Check for the presence of second magic word at the end of memory * layout. This detects the case where the user just copied the legacy * fpstate layout with out copying the extended state information * in the memory layout. */ - if (err || magic2 != FP_XSTATE_MAGIC2) - return -1; + if (magic2 != FP_XSTATE_MAGIC2) + return -EFAULT; return 0; } @@ -91,14 +169,6 @@ int save_i387_xstate(void __user *buf) return 0; if (task_thread_info(tsk)->status & TS_USEDFPU) { - /* - * Start with clearing the user buffer. This will present a - * clean context for the bytes not touched by the fxsave/xsave. - */ - err = __clear_user(buf, sig_xstate_size); - if (err) - return err; - if (use_xsave()) err = xsave_user(buf); else @@ -109,6 +179,7 @@ int save_i387_xstate(void __user *buf) task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); } else { + sanitize_i387_state(tsk); if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, xstate_size)) return -1; @@ -184,8 +255,8 @@ static int restore_user_xstate(void __user *buf) * init the state skipped by the user. */ mask = pcntxt_mask & ~mask; - - xrstor_state(init_xstate_buf, mask); + if (unlikely(mask)) + xrstor_state(init_xstate_buf, mask); return 0; @@ -274,11 +345,6 @@ static void prepare_fx_sw_frame(void) #endif } -/* - * Represents init state for the supported extended state. - */ -struct xsave_struct *init_xstate_buf; - #ifdef CONFIG_X86_64 unsigned int sig_xstate_size = sizeof(struct _fpstate); #endif @@ -286,37 +352,77 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -void __cpuinit xsave_init(void) +static inline void xstate_enable(void) { - if (!cpu_has_xsave) - return; - set_in_cr4(X86_CR4_OSXSAVE); - - /* - * Enable all the features that the HW is capable of - * and the Linux kernel is aware of. - */ xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } /* + * Record the offsets and sizes of different state managed by the xsave + * memory layout. + */ +static void __init setup_xstate_features(void) +{ + int eax, ebx, ecx, edx, leaf = 0x2; + + xstate_features = fls64(pcntxt_mask); + xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); + xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); + + do { + cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); + + if (eax == 0) + break; + + xstate_offsets[leaf] = ebx; + xstate_sizes[leaf] = eax; + + leaf++; + } while (1); +} + +/* * setup the xstate image representing the init state */ static void __init setup_xstate_init(void) { + setup_xstate_features(); + + /* + * Setup init_xstate_buf to represent the init state of + * all the features managed by the xsave + */ init_xstate_buf = alloc_bootmem(xstate_size); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; + + clts(); + /* + * Init all the features state with header_bv being 0x0 + */ + xrstor_state(init_xstate_buf, -1); + /* + * Dump the init state again. This is to identify the init state + * of any feature which is not represented by all zero's. + */ + xsave_state(init_xstate_buf, -1); + stts(); } /* * Enable and initialize the xsave feature. */ -void __ref xsave_cntxt_init(void) +static void __init xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; - cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); + return; + } + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); pcntxt_mask = eax + ((u64)edx << 32); if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { @@ -329,12 +435,13 @@ void __ref xsave_cntxt_init(void) * Support only the state known to OS. */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - xsave_init(); + + xstate_enable(); /* * Recompute the context size for enabled features */ - cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); xstate_size = ebx; update_regset_xstate_info(xstate_size, pcntxt_mask); @@ -346,3 +453,23 @@ void __ref xsave_cntxt_init(void) "cntxt size 0x%x\n", pcntxt_mask, xstate_size); } + +/* + * For the very first instance, this calls xstate_enable_boot_cpu(); + * for all subsequent instances, this calls xstate_enable(). + * + * This is somewhat obfuscated due to the lack of powerful enough + * overrides for the section checks. + */ +void __cpuinit xsave_init(void) +{ + static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; + void (*this_func)(void); + + if (!cpu_has_xsave) + return; + + this_func = next_func; + next_func = xstate_enable; + this_func(); +} |