diff options
Diffstat (limited to 'arch/x86/kernel')
105 files changed, 4135 insertions, 2751 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f3477bb..832cb83 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -24,6 +24,10 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) CFLAGS_paravirt.o := $(nostackp) +GCOV_PROFILE_vsyscall_64.o := n +GCOV_PROFILE_hpet.o := n +GCOV_PROFILE_tsc.o := n +GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o @@ -48,6 +52,7 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o +obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6310861..67e929b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -44,11 +44,7 @@ static int __initdata acpi_force = 0; u32 acpi_rsdt_forced; -#ifdef CONFIG_ACPI -int acpi_disabled = 0; -#else -int acpi_disabled = 1; -#endif +int acpi_disabled; EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 @@ -122,72 +118,6 @@ void __init __acpi_unmap_table(char *map, unsigned long size) early_iounmap(map, size); } -#ifdef CONFIG_PCI_MMCONFIG - -static int acpi_mcfg_64bit_base_addr __initdata = FALSE; - -/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ -struct acpi_mcfg_allocation *pci_mmcfg_config; -int pci_mmcfg_config_num; - -static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) -{ - if (!strcmp(mcfg->header.oem_id, "SGI")) - acpi_mcfg_64bit_base_addr = TRUE; - - return 0; -} - -int __init acpi_parse_mcfg(struct acpi_table_header *header) -{ - struct acpi_table_mcfg *mcfg; - unsigned long i; - int config_size; - - if (!header) - return -EINVAL; - - mcfg = (struct acpi_table_mcfg *)header; - - /* how many config structures do we have */ - pci_mmcfg_config_num = 0; - i = header->length - sizeof(struct acpi_table_mcfg); - while (i >= sizeof(struct acpi_mcfg_allocation)) { - ++pci_mmcfg_config_num; - i -= sizeof(struct acpi_mcfg_allocation); - }; - if (pci_mmcfg_config_num == 0) { - printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); - return -ENODEV; - } - - config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); - pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); - if (!pci_mmcfg_config) { - printk(KERN_WARNING PREFIX - "No memory for MCFG config tables\n"); - return -ENOMEM; - } - - memcpy(pci_mmcfg_config, &mcfg[1], config_size); - - acpi_mcfg_oem_check(mcfg); - - for (i = 0; i < pci_mmcfg_config_num; ++i) { - if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && - !acpi_mcfg_64bit_base_addr) { - printk(KERN_ERR PREFIX - "MMCONFIG not in low 4GB of memory\n"); - kfree(pci_mmcfg_config); - pci_mmcfg_config_num = 0; - return -ENODEV; - } - } - - return 0; -} -#endif /* CONFIG_PCI_MMCONFIG */ - #ifdef CONFIG_X86_LOCAL_APIC static int __init acpi_parse_madt(struct acpi_table_header *table) { @@ -903,106 +833,6 @@ static int __init acpi_parse_madt_lapic_entries(void) extern int es7000_plat; #endif -static struct { - int gsi_base; - int gsi_end; -} mp_ioapic_routing[MAX_IO_APICS]; - -int mp_find_ioapic(int gsi) -{ - int i = 0; - - /* Find the IOAPIC that manages this GSI. */ - for (i = 0; i < nr_ioapics; i++) { - if ((gsi >= mp_ioapic_routing[i].gsi_base) - && (gsi <= mp_ioapic_routing[i].gsi_end)) - return i; - } - - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - return -1; -} - -int mp_find_ioapic_pin(int ioapic, int gsi) -{ - if (WARN_ON(ioapic == -1)) - return -1; - if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end)) - return -1; - - return gsi - mp_ioapic_routing[ioapic].gsi_base; -} - -static u8 __init uniq_ioapic_id(u8 id) -{ -#ifdef CONFIG_X86_32 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return io_apic_get_unique_id(nr_ioapics, id); - else - return id; -#else - int i; - DECLARE_BITMAP(used, 256); - bitmap_zero(used, 256); - for (i = 0; i < nr_ioapics; i++) { - struct mpc_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->apicid, used); - } - if (!test_bit(id, used)) - return id; - return find_first_zero_bit(used, 256); -#endif -} - -static int bad_ioapic(unsigned long address) -{ - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " - "(found %d)\n", MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!\n"); - } - if (!address) { - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" - " found in table, skipping!\n"); - return 1; - } - return 0; -} - -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) -{ - int idx = 0; - - if (bad_ioapic(address)) - return; - - idx = nr_ioapics; - - mp_ioapics[idx].type = MP_IOAPIC; - mp_ioapics[idx].flags = MPC_APIC_USABLE; - mp_ioapics[idx].apicaddr = address; - - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].apicid = uniq_ioapic_id(id); - mp_ioapics[idx].apicver = io_apic_get_version(idx); - - /* - * Build basic GSI lookup table to facilitate gsi->io_apic lookups - * and to prevent reprogramming of IOAPIC pins (PCI GSIs). - */ - mp_ioapic_routing[idx].gsi_base = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + - io_apic_get_redir_entries(idx); - - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, - mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, - mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); - - nr_ioapics++; -} - int __init acpi_probe_gsi(void) { int idx; @@ -1017,7 +847,7 @@ int __init acpi_probe_gsi(void) max_gsi = 0; for (idx = 0; idx < nr_ioapics; idx++) { - gsi = mp_ioapic_routing[idx].gsi_end; + gsi = mp_gsi_routing[idx].gsi_end; if (gsi > max_gsi) max_gsi = gsi; @@ -1249,9 +1079,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) * If MPS is present, it will handle them, * otherwise the system will stay in PIC mode */ - if (acpi_disabled || acpi_noirq) { + if (acpi_disabled || acpi_noirq) return -ENODEV; - } if (!cpu_has_apic) return -ENODEV; @@ -1519,14 +1348,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, { .callback = force_acpi_ht, - .ident = "ASUS P4B266", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "P4B266"), - }, - }, - { - .callback = force_acpi_ht, .ident = "ASUS P2B-DS", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index bbbe4bb..8c44c23 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -34,12 +34,22 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, flags->bm_check = 1; else if (c->x86_vendor == X86_VENDOR_INTEL) { /* - * Today all CPUs that support C3 share cache. - * TBD: This needs to look at cache shared map, once - * multi-core detection patch makes to the base. + * Today all MP CPUs that support C3 share cache. + * And caches should not be flushed by software while + * entering C3 type state. */ flags->bm_check = 1; } + + /* + * On all recent Intel platforms, ARB_DISABLE is a nop. + * So, set bm_control to zero to indicate that ARB_DISABLE + * is not required while entering C3 type state on + * P4, Core and beyond CPUs + */ + if (c->x86_vendor == X86_VENDOR_INTEL && + (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14))) + flags->bm_control = 0; } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index 7c074ee..d296f4a 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -72,6 +72,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) return; } + /* Initialize _PDC data based on the CPU vendor */ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) { @@ -85,3 +86,15 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) } EXPORT_SYMBOL(arch_acpi_processor_init_pdc); + +void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr) +{ + if (pr->pdc) { + kfree(pr->pdc->pointer->buffer.pointer); + kfree(pr->pdc->pointer); + kfree(pr->pdc); + pr->pdc = NULL; + } +} + +EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc); diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile index 167bc16..6a564ac 100644 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ b/arch/x86/kernel/acpi/realmode/Makefile @@ -42,6 +42,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \ $(call cc-option, -mpreferred-stack-boundary=2) KBUILD_CFLAGS += $(call cc-option, -m32) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ +GCOV_PROFILE := n WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f576587..de7353c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2,6 +2,7 @@ #include <linux/sched.h> #include <linux/mutex.h> #include <linux/list.h> +#include <linux/stringify.h> #include <linux/kprobes.h> #include <linux/mm.h> #include <linux/vmalloc.h> @@ -32,7 +33,7 @@ __setup("smp-alt-boot", bootonly); #define smp_alt_once 1 #endif -static int debug_alternative; +static int __initdata_or_module debug_alternative; static int __init debug_alt(char *str) { @@ -51,7 +52,7 @@ static int __init setup_noreplace_smp(char *str) __setup("noreplace-smp", setup_noreplace_smp); #ifdef CONFIG_PARAVIRT -static int noreplace_paravirt = 0; +static int __initdata_or_module noreplace_paravirt = 0; static int __init setup_noreplace_paravirt(char *str) { @@ -64,16 +65,17 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); #define DPRINTK(fmt, args...) if (debug_alternative) \ printk(KERN_DEBUG fmt, args) -#ifdef GENERIC_NOP1 +#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) /* Use inline assembly to define this because the nops are defined as inline assembly strings in the include files and we cannot get them easily into strings. */ -asm("\t.section .rodata, \"a\"\nintelnops: " +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 GENERIC_NOP7 GENERIC_NOP8 "\t.previous"); extern const unsigned char intelnops[]; -static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +intel_nops[ASM_NOP_MAX+1] = { NULL, intelnops, intelnops + 1, @@ -87,12 +89,13 @@ static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { #endif #ifdef K8_NOP1 -asm("\t.section .rodata, \"a\"\nk8nops: " +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 K8_NOP7 K8_NOP8 "\t.previous"); extern const unsigned char k8nops[]; -static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +k8_nops[ASM_NOP_MAX+1] = { NULL, k8nops, k8nops + 1, @@ -105,13 +108,14 @@ static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { }; #endif -#ifdef K7_NOP1 -asm("\t.section .rodata, \"a\"\nk7nops: " +#if defined(K7_NOP1) && !defined(CONFIG_X86_64) +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 K7_NOP7 K7_NOP8 "\t.previous"); extern const unsigned char k7nops[]; -static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +k7_nops[ASM_NOP_MAX+1] = { NULL, k7nops, k7nops + 1, @@ -125,12 +129,13 @@ static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { #endif #ifdef P6_NOP1 -asm("\t.section .rodata, \"a\"\np6nops: " +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 P6_NOP7 P6_NOP8 "\t.previous"); extern const unsigned char p6nops[]; -static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +p6_nops[ASM_NOP_MAX+1] = { NULL, p6nops, p6nops + 1, @@ -146,7 +151,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { #ifdef CONFIG_X86_64 extern char __vsyscall_0; -const unsigned char *const *find_nop_table(void) +static const unsigned char *const *__init_or_module find_nop_table(void) { if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_has(X86_FEATURE_NOPL)) @@ -157,7 +162,7 @@ const unsigned char *const *find_nop_table(void) #else /* CONFIG_X86_64 */ -const unsigned char *const *find_nop_table(void) +static const unsigned char *const *__init_or_module find_nop_table(void) { if (boot_cpu_has(X86_FEATURE_K8)) return k8_nops; @@ -172,7 +177,7 @@ const unsigned char *const *find_nop_table(void) #endif /* CONFIG_X86_64 */ /* Use this to add nops to a buffer, then text_poke the whole buffer. */ -void add_nops(void *insns, unsigned int len) +static void __init_or_module add_nops(void *insns, unsigned int len) { const unsigned char *const *noptable = find_nop_table(); @@ -185,10 +190,10 @@ void add_nops(void *insns, unsigned int len) len -= noplen; } } -EXPORT_SYMBOL_GPL(add_nops); extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern u8 *__smp_locks[], *__smp_locks_end[]; +static void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with @@ -196,7 +201,8 @@ extern u8 *__smp_locks[], *__smp_locks_end[]; APs have less capabilities than the boot processor are not handled. Tough. Make sure you disable such features by hand. */ -void apply_alternatives(struct alt_instr *start, struct alt_instr *end) +void __init_or_module apply_alternatives(struct alt_instr *start, + struct alt_instr *end) { struct alt_instr *a; char insnbuf[MAX_PATCH_LEN]; @@ -279,9 +285,10 @@ static LIST_HEAD(smp_alt_modules); static DEFINE_MUTEX(smp_alt); static int smp_mode = 1; /* protected by smp_alt */ -void alternatives_smp_module_add(struct module *mod, char *name, - void *locks, void *locks_end, - void *text, void *text_end) +void __init_or_module alternatives_smp_module_add(struct module *mod, + char *name, + void *locks, void *locks_end, + void *text, void *text_end) { struct smp_alt_module *smp; @@ -317,7 +324,7 @@ void alternatives_smp_module_add(struct module *mod, char *name, mutex_unlock(&smp_alt); } -void alternatives_smp_module_del(struct module *mod) +void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; @@ -386,8 +393,8 @@ void alternatives_smp_switch(int smp) #endif #ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) +void __init_or_module apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) { struct paravirt_patch_site *p; char insnbuf[MAX_PATCH_LEN]; @@ -485,13 +492,14 @@ void __init alternative_instructions(void) * instructions. And on the local CPU you need to be protected again NMI or MCE * handlers seeing an inconsistent instruction while you patch. */ -void *text_poke_early(void *addr, const void *opcode, size_t len) +static void *__init_or_module text_poke_early(void *addr, const void *opcode, + size_t len) { unsigned long flags; local_irq_save(flags); memcpy(addr, opcode, len); - local_irq_restore(flags); sync_core(); + local_irq_restore(flags); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ return addr; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 9372f04..98f230f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -41,9 +41,13 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); static LIST_HEAD(iommu_pd_list); static DEFINE_SPINLOCK(iommu_pd_list_lock); -#ifdef CONFIG_IOMMU_API +/* + * Domain for untranslated devices - only allocated + * if iommu=pt passed on kernel cmd line. + */ +static struct protection_domain *pt_domain; + static struct iommu_ops amd_iommu_ops; -#endif /* * general struct to manage commands send to an IOMMU @@ -55,16 +59,16 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); static struct dma_ops_domain *find_protection_domain(u16 devid); -static u64* alloc_pte(struct protection_domain *dom, - unsigned long address, u64 - **pte_page, gfp_t gfp); +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, int end_lvl, + u64 **pte_page, gfp_t gfp); static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages); - -#ifndef BUS_NOTIFY_UNBOUND_DRIVER -#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 -#endif +static void reset_iommu_command_buffer(struct amd_iommu *iommu); +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size); +static void update_domain(struct protection_domain *domain); #ifdef CONFIG_AMD_IOMMU_STATS @@ -138,7 +142,25 @@ static int iommu_has_npcache(struct amd_iommu *iommu) * ****************************************************************************/ -static void iommu_print_event(void *__evt) +static void dump_dte_entry(u16 devid) +{ + int i; + + for (i = 0; i < 8; ++i) + pr_err("AMD-Vi: DTE[%d]: %08x\n", i, + amd_iommu_dev_table[devid].data[i]); +} + +static void dump_command(unsigned long phys_addr) +{ + struct iommu_cmd *cmd = phys_to_virt(phys_addr); + int i; + + for (i = 0; i < 4; ++i) + pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]); +} + +static void iommu_print_event(struct amd_iommu *iommu, void *__evt) { u32 *event = __evt; int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; @@ -147,7 +169,7 @@ static void iommu_print_event(void *__evt) int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; u64 address = (u64)(((u64)event[3]) << 32) | event[2]; - printk(KERN_ERR "AMD IOMMU: Event logged ["); + printk(KERN_ERR "AMD-Vi: Event logged ["); switch (type) { case EVENT_TYPE_ILL_DEV: @@ -155,6 +177,7 @@ static void iommu_print_event(void *__evt) "address=0x%016llx flags=0x%04x]\n", PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), address, flags); + dump_dte_entry(devid); break; case EVENT_TYPE_IO_FAULT: printk("IO_PAGE_FAULT device=%02x:%02x.%x " @@ -176,6 +199,8 @@ static void iommu_print_event(void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + reset_iommu_command_buffer(iommu); + dump_command(address); break; case EVENT_TYPE_CMD_HARD_ERR: printk("COMMAND_HARDWARE_ERROR address=0x%016llx " @@ -209,7 +234,7 @@ static void iommu_poll_events(struct amd_iommu *iommu) tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); while (head != tail) { - iommu_print_event(iommu->evt_buf + head); + iommu_print_event(iommu, iommu->evt_buf + head); head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; } @@ -296,8 +321,11 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu) status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - if (unlikely(i == EXIT_LOOP_COUNT)) - panic("AMD IOMMU: Completion wait loop failed\n"); + if (unlikely(i == EXIT_LOOP_COUNT)) { + spin_unlock(&iommu->lock); + reset_iommu_command_buffer(iommu); + spin_lock(&iommu->lock); + } } /* @@ -445,47 +473,78 @@ static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) } /* + * This function flushes one domain on one IOMMU + */ +static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) +{ + struct iommu_cmd cmd; + unsigned long flags; + + __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + domid, 1, 1); + + spin_lock_irqsave(&iommu->lock, flags); + __iommu_queue_command(iommu, &cmd); + __iommu_completion_wait(iommu); + __iommu_wait_for_completion(iommu); + spin_unlock_irqrestore(&iommu->lock, flags); +} + +static void flush_all_domains_on_iommu(struct amd_iommu *iommu) +{ + int i; + + for (i = 1; i < MAX_DOMAIN_ID; ++i) { + if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + continue; + flush_domain_on_iommu(iommu, i); + } + +} + +/* * This function is used to flush the IO/TLB for a given protection domain * on every IOMMU in the system */ static void iommu_flush_domain(u16 domid) { - unsigned long flags; struct amd_iommu *iommu; - struct iommu_cmd cmd; INC_STATS_COUNTER(domain_flush_all); - __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - domid, 1, 1); - - for_each_iommu(iommu) { - spin_lock_irqsave(&iommu->lock, flags); - __iommu_queue_command(iommu, &cmd); - __iommu_completion_wait(iommu); - __iommu_wait_for_completion(iommu); - spin_unlock_irqrestore(&iommu->lock, flags); - } + for_each_iommu(iommu) + flush_domain_on_iommu(iommu, domid); } void amd_iommu_flush_all_domains(void) { + struct amd_iommu *iommu; + + for_each_iommu(iommu) + flush_all_domains_on_iommu(iommu); +} + +static void flush_all_devices_for_iommu(struct amd_iommu *iommu) +{ int i; - for (i = 1; i < MAX_DOMAIN_ID; ++i) { - if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (iommu != amd_iommu_rlookup_table[i]) continue; - iommu_flush_domain(i); + + iommu_queue_inv_dev_entry(iommu, i); + iommu_completion_wait(iommu); } } -void amd_iommu_flush_all_devices(void) +static void flush_devices_by_domain(struct protection_domain *domain) { struct amd_iommu *iommu; int i; for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (amd_iommu_pd_table[i] == NULL) + if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || + (amd_iommu_pd_table[i] != domain)) continue; iommu = amd_iommu_rlookup_table[i]; @@ -497,6 +556,27 @@ void amd_iommu_flush_all_devices(void) } } +static void reset_iommu_command_buffer(struct amd_iommu *iommu) +{ + pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); + + if (iommu->reset_in_progress) + panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + + iommu->reset_in_progress = true; + + amd_iommu_reset_cmd_buffer(iommu); + flush_all_devices_for_iommu(iommu); + flush_all_domains_on_iommu(iommu); + + iommu->reset_in_progress = false; +} + +void amd_iommu_flush_all_devices(void) +{ + flush_devices_by_domain(NULL); +} + /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -514,18 +594,21 @@ void amd_iommu_flush_all_devices(void) static int iommu_map_page(struct protection_domain *dom, unsigned long bus_addr, unsigned long phys_addr, - int prot) + int prot, + int map_size) { u64 __pte, *pte; bus_addr = PAGE_ALIGN(bus_addr); phys_addr = PAGE_ALIGN(phys_addr); - /* only support 512GB address spaces for now */ - if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) + BUG_ON(!PM_ALIGNED(map_size, bus_addr)); + BUG_ON(!PM_ALIGNED(map_size, phys_addr)); + + if (!(prot & IOMMU_PROT_MASK)) return -EINVAL; - pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); + pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL); if (IOMMU_PTE_PRESENT(*pte)) return -EBUSY; @@ -538,29 +621,18 @@ static int iommu_map_page(struct protection_domain *dom, *pte = __pte; + update_domain(dom); + return 0; } static void iommu_unmap_page(struct protection_domain *dom, - unsigned long bus_addr) + unsigned long bus_addr, int map_size) { - u64 *pte; - - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) - return; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) - return; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + u64 *pte = fetch_pte(dom, bus_addr, map_size); - *pte = 0; + if (pte) + *pte = 0; } /* @@ -615,7 +687,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, for (addr = e->address_start; addr < e->address_end; addr += PAGE_SIZE) { - ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot); + ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot, + PM_MAP_4k); if (ret) return ret; /* @@ -670,24 +743,29 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * This function checks if there is a PTE for a given dma address. If * there is one, it returns the pointer to it. */ -static u64* fetch_pte(struct protection_domain *domain, - unsigned long address) +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size) { + int level; u64 *pte; - pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; + while (level > map_size) { + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + level -= 1; - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[PM_LEVEL_INDEX(level, address)]; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { + pte = NULL; + break; + } + } return pte; } @@ -727,7 +805,7 @@ static int alloc_new_range(struct amd_iommu *iommu, u64 *pte, *pte_page; for (i = 0; i < num_ptes; ++i) { - pte = alloc_pte(&dma_dom->domain, address, + pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k, &pte_page, gfp); if (!pte) goto out_free; @@ -760,16 +838,20 @@ static int alloc_new_range(struct amd_iommu *iommu, for (i = dma_dom->aperture[index]->offset; i < dma_dom->aperture_size; i += PAGE_SIZE) { - u64 *pte = fetch_pte(&dma_dom->domain, i); + u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k); if (!pte || !IOMMU_PTE_PRESENT(*pte)) continue; dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); } + update_domain(&dma_dom->domain); + return 0; out_free: + update_domain(&dma_dom->domain); + free_page((unsigned long)dma_dom->aperture[index]->bitmap); kfree(dma_dom->aperture[index]); @@ -1009,7 +1091,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->domain.id = domain_id_alloc(); if (dma_dom->domain.id == 0) goto free_dma_dom; - dma_dom->domain.mode = PAGE_MODE_3_LEVEL; + dma_dom->domain.mode = PAGE_MODE_2_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); dma_dom->domain.flags = PD_DMA_OPS_MASK; dma_dom->domain.priv = dma_dom; @@ -1063,6 +1145,41 @@ static struct protection_domain *domain_for_device(u16 devid) return dom; } +static void set_dte_entry(u16 devid, struct protection_domain *domain) +{ + u64 pte_root = virt_to_phys(domain->pt_root); + + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) + << DEV_ENTRY_MODE_SHIFT; + pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; + + amd_iommu_dev_table[devid].data[2] = domain->id; + amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); + amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + + amd_iommu_pd_table[devid] = domain; +} + +/* + * If a device is not yet associated with a domain, this function does + * assigns it visible for the hardware + */ +static void __attach_device(struct amd_iommu *iommu, + struct protection_domain *domain, + u16 devid) +{ + /* lock domain */ + spin_lock(&domain->lock); + + /* update DTE entry */ + set_dte_entry(devid, domain); + + domain->dev_cnt += 1; + + /* ready */ + spin_unlock(&domain->lock); +} + /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware @@ -1072,27 +1189,16 @@ static void attach_device(struct amd_iommu *iommu, u16 devid) { unsigned long flags; - u64 pte_root = virt_to_phys(domain->pt_root); - - domain->dev_cnt += 1; - - pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) - << DEV_ENTRY_MODE_SHIFT; - pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); - amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); - amd_iommu_dev_table[devid].data[2] = domain->id; - - amd_iommu_pd_table[devid] = domain; + __attach_device(iommu, domain, devid); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - /* - * We might boot into a crash-kernel here. The crashed kernel - * left the caches in the IOMMU dirty. So we have to flush - * here to evict all dirty stuff. - */ + /* + * We might boot into a crash-kernel here. The crashed kernel + * left the caches in the IOMMU dirty. So we have to flush + * here to evict all dirty stuff. + */ iommu_queue_inv_dev_entry(iommu, devid); iommu_flush_tlb_pde(iommu, domain->id); } @@ -1119,6 +1225,15 @@ static void __detach_device(struct protection_domain *domain, u16 devid) /* ready */ spin_unlock(&domain->lock); + + /* + * If we run in passthrough mode the device must be assigned to the + * passthrough domain if it is detached from any other domain + */ + if (iommu_pass_through) { + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + __attach_device(iommu, pt_domain, devid); + } } /* @@ -1164,6 +1279,8 @@ static int device_change_notifier(struct notifier_block *nb, case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) goto out; + if (iommu_pass_through) + break; detach_device(domain, devid); break; case BUS_NOTIFY_ADD_DEVICE: @@ -1192,7 +1309,7 @@ out: return 0; } -struct notifier_block device_nb = { +static struct notifier_block device_nb = { .notifier_call = device_change_notifier, }; @@ -1292,39 +1409,91 @@ static int get_device_resources(struct device *dev, return 1; } +static void update_device_table(struct protection_domain *domain) +{ + unsigned long flags; + int i; + + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (amd_iommu_pd_table[i] != domain) + continue; + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + set_dte_entry(i, domain); + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + } +} + +static void update_domain(struct protection_domain *domain) +{ + if (!domain->updated) + return; + + update_device_table(domain); + flush_devices_by_domain(domain); + iommu_flush_domain(domain->id); + + domain->updated = false; +} + /* - * If the pte_page is not yet allocated this function is called + * This function is used to add another level to an IO page table. Adding + * another level increases the size of the address space by 9 bits to a size up + * to 64 bits. */ -static u64* alloc_pte(struct protection_domain *dom, - unsigned long address, u64 **pte_page, gfp_t gfp) +static bool increase_address_space(struct protection_domain *domain, + gfp_t gfp) +{ + u64 *pte; + + if (domain->mode == PAGE_MODE_6_LEVEL) + /* address space already 64 bit large */ + return false; + + pte = (void *)get_zeroed_page(gfp); + if (!pte) + return false; + + *pte = PM_LEVEL_PDE(domain->mode, + virt_to_phys(domain->pt_root)); + domain->pt_root = pte; + domain->mode += 1; + domain->updated = true; + + return true; +} + +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, + int end_lvl, + u64 **pte_page, + gfp_t gfp) { u64 *pte, *page; + int level; - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; + while (address > PM_LEVEL_SIZE(domain->mode)) + increase_address_space(domain, gfp); - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = IOMMU_L2_PDE(virt_to_phys(page)); - } + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + while (level > end_lvl) { + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); + } - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = IOMMU_L1_PDE(virt_to_phys(page)); - } + level -= 1; - pte = IOMMU_PTE_PAGE(*pte); + pte = IOMMU_PTE_PAGE(*pte); - if (pte_page) - *pte_page = pte; + if (pte_page && level == end_lvl) + *pte_page = pte; - pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + pte = &pte[PM_LEVEL_INDEX(level, address)]; + } return pte; } @@ -1344,10 +1513,13 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; if (!pte) { - pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); + pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page, + GFP_ATOMIC); aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; } else - pte += IOMMU_PTE_L0_INDEX(address); + pte += PM_LEVEL_INDEX(0, address); + + update_domain(&dom->domain); return pte; } @@ -1409,7 +1581,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, if (!pte) return; - pte += IOMMU_PTE_L0_INDEX(address); + pte += PM_LEVEL_INDEX(0, address); WARN_ON(!*pte); @@ -1763,7 +1935,7 @@ static void *alloc_coherent(struct device *dev, size_t size, flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) - return 0; + return NULL; paddr = virt_to_phys(virt_addr); @@ -1988,19 +2160,47 @@ static void cleanup_domain(struct protection_domain *domain) write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } -static int amd_iommu_domain_init(struct iommu_domain *dom) +static void protection_domain_free(struct protection_domain *domain) +{ + if (!domain) + return; + + if (domain->id) + domain_id_free(domain->id); + + kfree(domain); +} + +static struct protection_domain *protection_domain_alloc(void) { struct protection_domain *domain; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) - return -ENOMEM; + return NULL; spin_lock_init(&domain->lock); - domain->mode = PAGE_MODE_3_LEVEL; domain->id = domain_id_alloc(); if (!domain->id) + goto out_err; + + return domain; + +out_err: + kfree(domain); + + return NULL; +} + +static int amd_iommu_domain_init(struct iommu_domain *dom) +{ + struct protection_domain *domain; + + domain = protection_domain_alloc(); + if (!domain) goto out_free; + + domain->mode = PAGE_MODE_3_LEVEL; domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); if (!domain->pt_root) goto out_free; @@ -2010,7 +2210,7 @@ static int amd_iommu_domain_init(struct iommu_domain *dom) return 0; out_free: - kfree(domain); + protection_domain_free(domain); return -ENOMEM; } @@ -2115,7 +2315,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom, paddr &= PAGE_MASK; for (i = 0; i < npages; ++i) { - ret = iommu_map_page(domain, iova, paddr, prot); + ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); if (ret) return ret; @@ -2136,7 +2336,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova &= PAGE_MASK; for (i = 0; i < npages; ++i) { - iommu_unmap_page(domain, iova); + iommu_unmap_page(domain, iova, PM_MAP_4k); iova += PAGE_SIZE; } @@ -2151,21 +2351,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, phys_addr_t paddr; u64 *pte; - pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)]; - - if (!IOMMU_PTE_PRESENT(*pte)) - return 0; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(iova)]; - - if (!IOMMU_PTE_PRESENT(*pte)) - return 0; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(iova)]; + pte = fetch_pte(domain, iova, PM_MAP_4k); - if (!IOMMU_PTE_PRESENT(*pte)) + if (!pte || !IOMMU_PTE_PRESENT(*pte)) return 0; paddr = *pte & IOMMU_PAGE_MASK; @@ -2191,3 +2379,46 @@ static struct iommu_ops amd_iommu_ops = { .domain_has_cap = amd_iommu_domain_has_cap, }; +/***************************************************************************** + * + * The next functions do a basic initialization of IOMMU for pass through + * mode + * + * In passthrough mode the IOMMU is initialized and enabled but not used for + * DMA-API translation. + * + *****************************************************************************/ + +int __init amd_iommu_init_passthrough(void) +{ + struct pci_dev *dev = NULL; + u16 devid, devid2; + + /* allocate passthroug domain */ + pt_domain = protection_domain_alloc(); + if (!pt_domain) + return -ENOMEM; + + pt_domain->mode |= PAGE_MODE_NONE; + + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + struct amd_iommu *iommu; + + devid = calc_devid(dev->bus->number, dev->devfn); + if (devid > amd_iommu_last_bdf) + continue; + + devid2 = amd_iommu_alias_table[devid]; + + iommu = amd_iommu_rlookup_table[devid2]; + if (!iommu) + continue; + + __attach_device(iommu, pt_domain, devid); + __attach_device(iommu, pt_domain, devid2); + } + + pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); + + return 0; +} diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 10b2acc..b4b61d4 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -252,7 +252,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) /* Function to enable the hardware */ static void iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", + printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", dev_name(&iommu->dev->dev), iommu->cap_ptr); iommu_feature_enable(iommu, CONTROL_IOMMU_EN); @@ -435,6 +435,20 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) } /* + * This function resets the command buffer if the IOMMU stopped fetching + * commands from it. + */ +void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu) +{ + iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); + + writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + + iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); +} + +/* * This function writes the command buffer address to the hardware and * enables it. */ @@ -450,11 +464,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu) memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, &entry, sizeof(entry)); - /* set head and tail to zero manually */ - writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); - writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - - iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); + amd_iommu_reset_cmd_buffer(iommu); } static void __init free_command_buffer(struct amd_iommu *iommu) @@ -472,6 +482,8 @@ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) if (iommu->evt_buf == NULL) return NULL; + iommu->evt_buf_size = EVT_BUFFER_SIZE; + return iommu->evt_buf; } @@ -691,6 +703,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, devid = e->devid; devid_to = e->ext >> 8; + set_dev_entry_from_acpi(iommu, devid , e->flags, 0); set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; @@ -749,11 +762,13 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { - if (alias) + if (alias) { amd_iommu_alias_table[dev_i] = devid_to; - set_dev_entry_from_acpi(iommu, - amd_iommu_alias_table[dev_i], - flags, ext_flags); + set_dev_entry_from_acpi(iommu, + devid_to, flags, ext_flags); + } + set_dev_entry_from_acpi(iommu, dev_i, + flags, ext_flags); } break; default: @@ -853,7 +868,7 @@ static int __init init_iommu_all(struct acpi_table_header *table) switch (*p) { case ACPI_IVHD_TYPE: - DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " + DUMP_printk("device: %02x:%02x.%01x cap: %04x " "seg: %d flags: %01x info %04x\n", PCI_BUS(h->devid), PCI_SLOT(h->devid), PCI_FUNC(h->devid), h->cap_ptr, @@ -897,7 +912,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) r = request_irq(iommu->dev->irq, amd_iommu_int_handler, IRQF_SAMPLE_RANDOM, - "AMD IOMMU", + "AMD-Vi", NULL); if (r) { @@ -1145,7 +1160,7 @@ int __init amd_iommu_init(void) if (no_iommu) { - printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n"); + printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); return 0; } @@ -1237,22 +1252,28 @@ int __init amd_iommu_init(void) if (ret) goto free; - ret = amd_iommu_init_dma_ops(); + if (iommu_pass_through) + ret = amd_iommu_init_passthrough(); + else + ret = amd_iommu_init_dma_ops(); if (ret) goto free; enable_iommus(); - printk(KERN_INFO "AMD IOMMU: device isolation "); + if (iommu_pass_through) + goto out; + + printk(KERN_INFO "AMD-Vi: device isolation "); if (amd_iommu_isolate) printk("enabled\n"); else printk("disabled\n"); if (amd_iommu_unmap_flush) - printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n"); + printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); else - printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n"); + printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); out: return ret; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 676debf..128111d 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -20,6 +20,7 @@ #include <linux/bitops.h> #include <linux/ioport.h> #include <linux/suspend.h> +#include <linux/kmemleak.h> #include <asm/e820.h> #include <asm/io.h> #include <asm/iommu.h> @@ -94,6 +95,11 @@ static u32 __init allocate_aperture(void) * code for safe */ p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(p); if (!p || __pa(p)+aper_size > 0xffffffff) { printk(KERN_ERR "Cannot allocate aperture memory hole (%p,%uK)\n", diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8c7c042..159740d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -49,6 +49,7 @@ #include <asm/mtrr.h> #include <asm/smp.h> #include <asm/mce.h> +#include <asm/kvm_para.h> unsigned int num_processors; @@ -140,7 +141,6 @@ int x2apic_mode; #ifdef CONFIG_X86_X2APIC /* x2apic enabled before OS handover */ static int x2apic_preenabled; -static int disable_x2apic; static __init int setup_nox2apic(char *str) { if (x2apic_enabled()) { @@ -149,7 +149,6 @@ static __init int setup_nox2apic(char *str) return 0; } - disable_x2apic = 1; setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; } @@ -1363,52 +1362,80 @@ void enable_x2apic(void) } #endif /* CONFIG_X86_X2APIC */ -void __init enable_IR_x2apic(void) +int __init enable_IR(void) { #ifdef CONFIG_INTR_REMAP - int ret; - unsigned long flags; - struct IO_APIC_route_entry **ioapic_entries = NULL; - - ret = dmar_table_init(); - if (ret) { - pr_debug("dmar_table_init() failed with %d:\n", ret); - goto ir_failed; - } - if (!intr_remapping_supported()) { pr_debug("intr-remapping not supported\n"); - goto ir_failed; + return 0; } - if (!x2apic_preenabled && skip_ioapic_setup) { pr_info("Skipped enabling intr-remap because of skipping " "io-apic setup\n"); - return; + return 0; } + if (enable_intr_remapping(x2apic_supported())) + return 0; + + pr_info("Enabled Interrupt-remapping\n"); + + return 1; + +#endif + return 0; +} + +void __init enable_IR_x2apic(void) +{ + unsigned long flags; + struct IO_APIC_route_entry **ioapic_entries = NULL; + int ret, x2apic_enabled = 0; + int dmar_table_init_ret = 0; + +#ifdef CONFIG_INTR_REMAP + dmar_table_init_ret = dmar_table_init(); + if (dmar_table_init_ret) + pr_debug("dmar_table_init() failed with %d:\n", + dmar_table_init_ret); +#endif + ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { - pr_info("Allocate ioapic_entries failed: %d\n", ret); - goto end; + pr_err("Allocate ioapic_entries failed\n"); + goto out; } ret = save_IO_APIC_setup(ioapic_entries); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); - goto end; + goto out; } local_irq_save(flags); - mask_IO_APIC_setup(ioapic_entries); mask_8259A(); + mask_IO_APIC_setup(ioapic_entries); - ret = enable_intr_remapping(x2apic_supported()); - if (ret) - goto end_restore; + if (dmar_table_init_ret) + ret = 0; + else + ret = enable_IR(); - pr_info("Enabled Interrupt-remapping\n"); + if (!ret) { + /* IR is required if there is APIC ID > 255 even when running + * under KVM + */ + if (max_physical_apicid > 255 || !kvm_para_available()) + goto nox2apic; + /* + * without IR all CPUs can be addressed by IOAPIC/MSI + * only in physical mode + */ + x2apic_force_phys(); + } + + x2apic_enabled = 1; if (x2apic_supported() && !x2apic_mode) { x2apic_mode = 1; @@ -1416,41 +1443,25 @@ void __init enable_IR_x2apic(void) pr_info("Enabled x2apic\n"); } -end_restore: - if (ret) - /* - * IR enabling failed - */ +nox2apic: + if (!ret) /* IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - unmask_8259A(); local_irq_restore(flags); -end: +out: if (ioapic_entries) free_ioapic_entries(ioapic_entries); - if (!ret) + if (x2apic_enabled) return; -ir_failed: if (x2apic_preenabled) - panic("x2apic enabled by bios. But IR enabling failed"); + panic("x2apic: enabled by BIOS but kernel init failed."); else if (cpu_has_x2apic) - pr_info("Not enabling x2apic,Intr-remapping\n"); -#else - if (!cpu_has_x2apic) - return; - - if (x2apic_preenabled) - panic("x2apic enabled prior OS handover," - " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); -#endif - - return; + pr_info("Not enabling x2apic, Intr-remapping init failed.\n"); } - #ifdef CONFIG_X86_64 /* * Detect and enable local APICs on non-SMP boards. @@ -1551,8 +1562,6 @@ no_apic: #ifdef CONFIG_X86_64 void __init early_init_lapic_mapping(void) { - unsigned long phys_addr; - /* * If no local APIC can be found then go out * : it means there is no mpatable and MADT @@ -1560,11 +1569,9 @@ void __init early_init_lapic_mapping(void) if (!smp_found_config) return; - phys_addr = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, phys_addr); + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, phys_addr); + APIC_BASE, mp_lapic_addr); /* * Fetch the APIC ID of the BSP in case we have a @@ -1653,7 +1660,6 @@ int __init APIC_init_uniprocessor(void) APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } #endif diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 69328ac..89174f8 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -167,7 +167,7 @@ static int es7000_apic_is_cluster(void) { /* MPENTIUMIII */ if (boot_cpu_data.x86 == 6 && - (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) + (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11)) return 1; return 0; @@ -652,7 +652,8 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem, return ret && es7000_apic_is_cluster(); } -struct apic apic_es7000_cluster = { +/* We've been warned by a false positive warning.Use __refdata to keep calm. */ +struct apic __refdata apic_es7000_cluster = { .name = "es7000", .probe = probe_es7000, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 29d0752..3c8f9e7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -66,6 +66,8 @@ #include <asm/apic.h> #define __apicdebuginit(type) static type __init +#define for_each_irq_pin(entry, head) \ + for (entry = head; entry; entry = entry->next) /* * Is the SiS APIC rmw bug present ? @@ -85,6 +87,9 @@ int nr_ioapic_registers[MAX_IO_APICS]; struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; +/* IO APIC gsi routing info */ +struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; + /* MP IRQ source entries */ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; @@ -116,15 +121,6 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -struct irq_pin_list; - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - struct irq_pin_list { int apic, pin; struct irq_pin_list *next; @@ -139,6 +135,11 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) return pin; } +/* + * This is performance-critical, we want to do it O(1) + * + * Most irqs are mapped 1:1 with pins. + */ struct irq_cfg { struct irq_pin_list *irq_2_pin; cpumask_var_t domain; @@ -414,13 +415,10 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - entry = cfg->irq_2_pin; - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; int pin; - if (!entry) - break; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ @@ -428,9 +426,6 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) spin_unlock_irqrestore(&ioapic_lock, flags); return true; } - if (!entry->next) - break; - entry = entry->next; } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -498,72 +493,68 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int +add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) { - struct irq_pin_list *entry; + struct irq_pin_list **last, *entry; - entry = cfg->irq_2_pin; - if (!entry) { - entry = get_one_free_irq_2_pin(node); - if (!entry) { - printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", - apic, pin); - return; - } - cfg->irq_2_pin = entry; - entry->apic = apic; - entry->pin = pin; - return; - } - - while (entry->next) { - /* not again, please */ + /* don't allow duplicates */ + last = &cfg->irq_2_pin; + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == apic && entry->pin == pin) - return; - - entry = entry->next; + return 0; + last = &entry->next; } - entry->next = get_one_free_irq_2_pin(node); - entry = entry->next; + entry = get_one_free_irq_2_pin(node); + if (!entry) { + printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); + return -ENOMEM; + } entry->apic = apic; entry->pin = pin; + + *last = entry; + return 0; +} + +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +{ + if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) + panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } /* * Reroute an IRQ to a different pin. */ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, - int oldapic, int oldpin, - int newapic, int newpin) + int oldapic, int oldpin, + int newapic, int newpin) { - struct irq_pin_list *entry = cfg->irq_2_pin; - int replaced = 0; + struct irq_pin_list *entry; - while (entry) { + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; - replaced = 1; /* every one is different, right? */ - break; + return; } - entry = entry->next; } - /* why? call replace before add? */ - if (!replaced) - add_pin_to_irq_node(cfg, node, newapic, newpin); + /* old apic/pin didn't exist, so just add new ones */ + add_pin_to_irq_node(cfg, node, newapic, newpin); } -static inline void io_apic_modify_irq(struct irq_cfg *cfg, - int mask_and, int mask_or, - void (*final)(struct irq_pin_list *entry)) +static void io_apic_modify_irq(struct irq_cfg *cfg, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) { int pin; struct irq_pin_list *entry; - for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin * 2); @@ -580,7 +571,6 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } -#ifdef CONFIG_X86_64 static void io_apic_sync(struct irq_pin_list *entry) { /* @@ -596,11 +586,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -#else /* CONFIG_X86_32 */ -static void __mask_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); -} static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { @@ -613,7 +598,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } -#endif /* CONFIG_X86_32 */ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { @@ -1414,6 +1398,9 @@ int setup_ioapic_entry(int apic_id, int irq, irte.vector = vector; irte.dest_id = IRTE_DEST(destination); + /* Set source-id of interrupt request */ + set_ioapic_sid(&irte, apic_id); + modify_irte(irq, &irte); ir_entry->index2 = (index >> 15) & 0x1; @@ -1699,12 +1686,8 @@ __apicdebuginit(void) print_IO_APIC(void) if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", irq); - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = entry->next; - } printk("\n"); } @@ -1713,25 +1696,19 @@ __apicdebuginit(void) print_IO_APIC(void) return; } -__apicdebuginit(void) print_APIC_bitfield(int base) +__apicdebuginit(void) print_APIC_field(int base) { - unsigned int v; - int i, j; + int i; if (apic_verbosity == APIC_QUIET) return; - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1<<j)) - printk("1"); - else - printk("0"); - } - printk("\n"); - } + printk(KERN_DEBUG); + + for (i = 0; i < 8; i++) + printk(KERN_CONT "%08x", apic_read(base + i*0x10)); + + printk(KERN_CONT "\n"); } __apicdebuginit(void) print_local_APIC(void *dummy) @@ -1742,7 +1719,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy) if (apic_verbosity == APIC_QUIET) return; - printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", + printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); @@ -1783,11 +1760,11 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); printk(KERN_DEBUG "... APIC ISR field:\n"); - print_APIC_bitfield(APIC_ISR); + print_APIC_field(APIC_ISR); printk(KERN_DEBUG "... APIC TMR field:\n"); - print_APIC_bitfield(APIC_TMR); + print_APIC_field(APIC_TMR); printk(KERN_DEBUG "... APIC IRR field:\n"); - print_APIC_bitfield(APIC_IRR); + print_APIC_field(APIC_IRR); if (APIC_INTEGRATED(ver)) { /* !82489DX */ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ @@ -2004,7 +1981,9 @@ void disable_IO_APIC(void) /* * Use virtual wire A mode when interrupt remapping is enabled. */ - disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); + if (cpu_has_apic) + disconnect_bsp_APIC(!intr_remapping_enabled && + ioapic_i8259.pin != -1); } #ifdef CONFIG_X86_32 @@ -2212,7 +2191,6 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } -#ifdef CONFIG_X86_64 static int ioapic_retrigger_irq(unsigned int irq) { @@ -2225,14 +2203,6 @@ static int ioapic_retrigger_irq(unsigned int irq) return 1; } -#else -static int ioapic_retrigger_irq(unsigned int irq) -{ - apic->send_IPI_self(irq_cfg(irq)->vector); - - return 1; -} -#endif /* * Level and edge triggered IO-APIC interrupts need different handling, @@ -2270,13 +2240,9 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq struct irq_pin_list *entry; u8 vector = cfg->vector; - entry = cfg->irq_2_pin; - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; - if (!entry) - break; - apic = entry->apic; pin = entry->pin; /* @@ -2289,9 +2255,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; } } @@ -2516,11 +2479,8 @@ atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - -#ifdef CONFIG_X86_32 unsigned long v; int i; -#endif struct irq_cfg *cfg; int do_unmask_irq = 0; @@ -2533,31 +2493,28 @@ static void ack_apic_level(unsigned int irq) } #endif -#ifdef CONFIG_X86_32 /* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ cfg = desc->chip_data; i = cfg->vector; - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); -#endif /* * We must acknowledge the irq before we move it or the acknowledge will @@ -2599,7 +2556,7 @@ static void ack_apic_level(unsigned int irq) unmask_IO_APIC_irq_desc(desc); } -#ifdef CONFIG_X86_32 + /* Tail end of version 0x11 I/O APIC bug workaround */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); @@ -2607,26 +2564,15 @@ static void ack_apic_level(unsigned int irq) __unmask_and_level_IO_APIC_irq(cfg); spin_unlock(&ioapic_lock); } -#endif } #ifdef CONFIG_INTR_REMAP static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { - int apic, pin; struct irq_pin_list *entry; - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } + for_each_irq_pin(entry, cfg->irq_2_pin) + io_apic_eoi(entry->apic, entry->pin); } static void @@ -3242,8 +3188,7 @@ void destroy_irq(unsigned int irq) cfg = desc->chip_data; dynamic_irq_cleanup(irq); /* connect back irq_cfg */ - if (desc) - desc->chip_data = cfg; + desc->chip_data = cfg; free_irte(irq); spin_lock_irqsave(&vector_lock, flags); @@ -3288,6 +3233,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); + /* Set source-id of interrupt request */ + set_msi_sid(&irte, pdev); + modify_irte(irq, &irte); msg->address_hi = MSI_ADDR_BASE_HI; @@ -3791,6 +3739,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + return irq; } @@ -3907,7 +3858,11 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, */ if (irq >= NR_IRQS_LEGACY) { cfg = desc->chip_data; - add_pin_to_irq_node(cfg, node, ioapic, pin); + if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { + printk(KERN_INFO "can not add pin %d for irq %d\n", + pin, irq); + return 0; + } } setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); @@ -3936,11 +3891,28 @@ int io_apic_set_pci_routing(struct device *dev, int irq, return __io_apic_set_pci_routing(dev, irq, irq_attr); } -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ +u8 __init io_apic_unique_id(u8 id) +{ +#ifdef CONFIG_X86_32 + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else + return id; +#else + int i; + DECLARE_BITMAP(used, 256); -#ifdef CONFIG_ACPI + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +#endif +} #ifdef CONFIG_X86_32 int __init io_apic_get_unique_id(int ioapic, int apic_id) @@ -4049,8 +4021,6 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) return 0; } -#endif /* CONFIG_ACPI */ - /* * This function currently is only a helper for the i386 smp boot process where * we need to reprogram the ioredtbls to cater for the cpus which have come online @@ -4104,7 +4074,7 @@ void __init setup_ioapic_dest(void) static struct resource *ioapic_resources; -static struct resource * __init ioapic_setup_resources(void) +static struct resource * __init ioapic_setup_resources(int nr_ioapics) { unsigned long n; struct resource *res; @@ -4120,15 +4090,13 @@ static struct resource * __init ioapic_setup_resources(void) mem = alloc_bootmem(n); res = (void *)mem; - if (mem != NULL) { - mem += sizeof(struct resource) * nr_ioapics; + mem += sizeof(struct resource) * nr_ioapics; - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; } ioapic_resources = res; @@ -4142,7 +4110,7 @@ void __init ioapic_init_mappings(void) struct resource *ioapic_res; int i; - ioapic_res = ioapic_setup_resources(); + ioapic_res = ioapic_setup_resources(nr_ioapics); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].apicaddr; @@ -4171,36 +4139,99 @@ fake_ioapic_page: __fix_to_virt(idx), ioapic_phys); idx++; - if (ioapic_res != NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; } } -static int __init ioapic_insert_resources(void) +void __init ioapic_insert_resources(void) { int i; struct resource *r = ioapic_resources; if (!r) { - if (nr_ioapics > 0) { + if (nr_ioapics > 0) printk(KERN_ERR "IO APIC resources couldn't be allocated.\n"); - return -1; - } - return 0; + return; } for (i = 0; i < nr_ioapics; i++) { insert_resource(&iomem_resource, r); r++; } +} + +int mp_find_ioapic(int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_gsi_routing[i].gsi_base) + && (gsi <= mp_gsi_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + return -1; +} + +int mp_find_ioapic_pin(int ioapic, int gsi) +{ + if (WARN_ON(ioapic == -1)) + return -1; + if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end)) + return -1; + + return gsi - mp_gsi_routing[ioapic].gsi_base; +} +static int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " + "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); + return 1; + } + if (!address) { + printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } return 0; } -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. */ -late_initcall(ioapic_insert_resources); +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) +{ + int idx = 0; + + if (bad_ioapic(address)) + return; + + idx = nr_ioapics; + + mp_ioapics[idx].type = MP_IOAPIC; + mp_ioapics[idx].flags = MPC_APIC_USABLE; + mp_ioapics[idx].apicaddr = address; + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + mp_ioapics[idx].apicid = io_apic_unique_id(id); + mp_ioapics[idx].apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ + mp_gsi_routing[idx].gsi_base = gsi_base; + mp_gsi_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, + mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, + mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end); + + nr_ioapics++; +} diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index dbf5445..08385e0 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -106,6 +106,9 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) unsigned long mask = cpumask_bits(cpumask)[0]; unsigned long flags; + if (WARN_ONCE(!mask, "empty IPI mask")) + return; + local_irq_save(flags); WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); __default_send_IPI_dest_field(mask, vector, apic->dest_logical); @@ -150,7 +153,7 @@ int safe_smp_processor_id(void) { int apicid, cpuid; - if (!boot_cpu_has(X86_FEATURE_APIC)) + if (!cpu_has_apic) return 0; apicid = hard_smp_processor_id(); diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index f422728..cb66a22 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -39,7 +39,7 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -static cpumask_var_t backtrace_mask; +static cpumask_t backtrace_mask __read_mostly; /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled @@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void) if (!prev_nmi_count) goto error; - alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO); printk(KERN_INFO "Testing NMI watchdog ... "); #ifdef CONFIG_SMP @@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) } /* We can be called before check_nmi_watchdog, hence NULL check. */ - if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) { + if (cpumask_test_cpu(cpu, &backtrace_mask)) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); dump_stack(); spin_unlock(&lock); - cpumask_clear_cpu(cpu, backtrace_mask); + cpumask_clear_cpu(cpu, &backtrace_mask); + + rc = 1; } /* Could check oops_in_progress here too, but it's safer not to */ @@ -552,14 +554,18 @@ int do_nmi_callback(struct pt_regs *regs, int cpu) return 0; } -void __trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(void) { int i; - cpumask_copy(backtrace_mask, cpu_online_mask); + cpumask_copy(&backtrace_mask, cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(backtrace_mask)) + if (cpumask_empty(&backtrace_mask)) break; mdelay(1); } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 533e59c..ca96e68 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -493,7 +493,8 @@ static void numaq_setup_portio_remap(void) (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); } -struct apic apic_numaq = { +/* Use __refdata to keep false positive warning calm. */ +struct apic __refdata apic_numaq = { .name = "NUMAQ", .probe = probe_numaq, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 440a8bc..0c0182c 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -20,23 +20,12 @@ #include <asm/apic.h> #include <asm/setup.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <asm/mpspec.h> -#include <asm/fixmap.h> -#include <asm/apicdef.h> -#include <linux/kernel.h> -#include <linux/string.h> #include <linux/smp.h> -#include <linux/init.h> #include <asm/ipi.h> -#include <linux/smp.h> -#include <linux/init.h> #include <linux/interrupt.h> #include <asm/acpi.h> #include <asm/e820.h> -#include <asm/setup.h> #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index bc3e880..65edc18 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -44,17 +44,22 @@ static struct apic *apic_probe[] __initdata = { NULL, }; +static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return hard_smp_processor_id() >> index_msb; +} + /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ void __init default_setup_apic_routing(void) { #ifdef CONFIG_X86_X2APIC - if (x2apic_mode && (apic != &apic_x2apic_phys && + if (x2apic_mode #ifdef CONFIG_X86_UV - apic != &apic_x2apic_uv_x && + && apic != &apic_x2apic_uv_x #endif - apic != &apic_x2apic_cluster)) { + ) { if (x2apic_phys) apic = &apic_x2apic_phys; else @@ -69,6 +74,11 @@ void __init default_setup_apic_routing(void) printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } + if (is_vsmp_box()) { + /* need to update phys_pkg_id */ + apic->phys_pkg_id = apicid_phys_pkg_id; + } + /* * Now that apic routing model is selected, configure the * fault handling for intr remapping. diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 344eee4..eafdfbd1 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -44,7 +44,6 @@ #include <asm/ipi.h> #include <linux/kernel.h> #include <linux/string.h> -#include <linux/init.h> #include <linux/gfp.h> #include <linux/smp.h> diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 8e4cbb2..a5371ec 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -17,11 +17,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return x2apic_enabled(); } -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - +/* + * need to use more than cpu 0, because we need more vectors when + * MSI-X are used. + */ static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } /* @@ -170,7 +172,7 @@ static unsigned long set_apic_id(unsigned int id) static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) { - return current_cpu_data.initial_apicid >> index_msb; + return initial_apicid >> index_msb; } static void x2apic_send_IPI_self(int vector) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a284359..a8989aa 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - +/* + * need to use more than cpu 0, because we need more vectors when + * MSI-X are used. + */ static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) @@ -162,7 +164,7 @@ static unsigned long set_apic_id(unsigned int id) static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) { - return current_cpu_data.initial_apicid >> index_msb; + return initial_apicid >> index_msb; } static void x2apic_send_IPI_self(int vector) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 096d19a..6011593 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -46,7 +46,7 @@ static int early_get_nodeid(void) return node_id.s.node_id; } -static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strcmp(oem_id, "SGI")) { if (!strcmp(oem_table_id, "UVL")) @@ -253,7 +253,7 @@ static void uv_send_IPI_self(int vector) apic_write(APIC_SELF_IPI, vector); } -struct apic apic_x2apic_uv_x = { +struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", .probe = NULL, @@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = { .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 1, /* logical */ + .irq_dest_mode = 0, /* physical */ .target_cpus = uv_target_cpus, .disable_esr = 0, @@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) BUG(); } -static __init void map_low_mmrs(void) -{ - init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); - init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); -} - enum map_type {map_wb, map_uc}; static __init void map_high(char *id, unsigned long base, int shift, @@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode) map_high("GRU", gru.s.base, shift, max_pnode, map_wb); } -static __init void map_config_high(int max_pnode) -{ - union uvh_rh_gam_cfg_overlay_config_mmr_u cfg; - int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT; - - cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); - if (cfg.s.enable) - map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); -} - -static __init void map_mmr_high(int max_pnode) -{ - union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; - int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; - - mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); - if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); -} - static __init void map_mmioh_high(int max_pnode) { union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; @@ -566,8 +540,6 @@ void __init uv_system_init(void) unsigned long mmr_base, present, paddr; unsigned short pnode_mask; - map_low_mmrs(); - m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; @@ -591,6 +563,8 @@ void __init uv_system_init(void) bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = kmalloc(bytes, GFP_KERNEL); BUG_ON(!uv_blade_info); + for (blade = 0; blade < uv_num_possible_blades(); blade++) + uv_blade_info[blade].memory_nid = -1; get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); @@ -629,6 +603,9 @@ void __init uv_system_init(void) lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; + /* Any node on the blade, else will contain -1. */ + uv_blade_info[blade].memory_nid = nid; + uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; @@ -662,11 +639,10 @@ void __init uv_system_init(void) pnode = (paddr >> m_val) & pnode_mask; blade = boot_pnode_to_blade(pnode); uv_node_to_blade[nid] = blade; + max_pnode = max(pnode, max_pnode); } map_gru_high(max_pnode); - map_mmr_high(max_pnode); - map_config_high(max_pnode); map_mmioh_high(max_pnode); uv_cpu_init(); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 79302e9..151ace6 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -403,7 +403,15 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); -static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; + +/* + * Set up a segment that references the real mode segment 0x40 + * that extends up to the end of page zero (that we have reserved). + * This is for buggy BIOS's that refer to (real mode) segment 0x40 + * even though they are called in protected mode. + */ +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, + (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); static const char driver_version[] = "1.16ac"; /* no spaces */ @@ -811,7 +819,7 @@ static int apm_do_idle(void) u8 ret = 0; int idled = 0; int polling; - int err; + int err = 0; polling = !!(current_thread_info()->status & TS_POLLING); if (polling) { @@ -2332,15 +2340,6 @@ static int __init apm_init(void) pm_flags |= PM_APM; /* - * Set up a segment that references the real mode segment 0x40 - * that extends up to the end of page zero (that we have reserved). - * This is for buggy BIOS's that refer to (real mode) segment 0x40 - * even though they are called in protected mode. - */ - set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); - _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); - - /* * Set up the long jump entry point to the APM BIOS, which is called * from inline assembly. */ @@ -2358,12 +2357,12 @@ static int __init apm_init(void) * code to that CPU. */ gdt = get_cpu_gdt_table(0); - set_base(gdt[APM_CS >> 3], - __va((unsigned long)apm_info.bios.cseg << 4)); - set_base(gdt[APM_CS_16 >> 3], - __va((unsigned long)apm_info.bios.cseg_16 << 4)); - set_base(gdt[APM_DS >> 3], - __va((unsigned long)apm_info.bios.dseg << 4)); + set_desc_base(&gdt[APM_CS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); + set_desc_base(&gdt[APM_CS_16 >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_desc_base(&gdt[APM_DS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); proc_create("apm", 0, NULL, &apm_file_ops); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 898ecc4..4a6aeed 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -3,6 +3,7 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS #include <linux/crypto.h> #include <linux/sched.h> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3efcb2b..8dd3063 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -7,9 +7,13 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_common.o = -pg endif +# Make sure load_percpu_segment has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_common.o := $(nostackp) + obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o +obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e5b27d8..f32fa71 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -2,7 +2,7 @@ #include <linux/bitops.h> #include <linux/mm.h> -#include <asm/io.h> +#include <linux/io.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> @@ -45,8 +45,8 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) #define CBAR_ENB (0x80000000) #define CBAR_KEY (0X000000CB) if (c->x86_model == 9 || c->x86_model == 10) { - if (inl (CBAR) & CBAR_ENB) - outl (0 | CBAR_KEY, CBAR); + if (inl(CBAR) & CBAR_ENB) + outl(0 | CBAR_KEY, CBAR); } } @@ -87,9 +87,10 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) d = d2-d; if (d > 20*K6_BUG_LOOP) - printk("system stability may be impaired when more than 32 MB are used.\n"); + printk(KERN_CONT + "system stability may be impaired when more than 32 MB are used.\n"); else - printk("probably OK (after B9730xxxx).\n"); + printk(KERN_CONT "probably OK (after B9730xxxx).\n"); printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); } @@ -219,8 +220,9 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { rdmsr(MSR_K7_CLK_CTL, l, h); if ((l & 0xfff00000) != 0x20000000) { - printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, - ((l & 0x000fffff)|0x20000000)); + printk(KERN_INFO + "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); } } @@ -251,6 +253,64 @@ static int __cpuinit nearby_node(int apicid) #endif /* + * Fixup core topology information for AMD multi-node processors. + * Assumption 1: Number of cores in each internal node is the same. + * Assumption 2: Mixed systems with both single-node and dual-node + * processors are not supported. + */ +#ifdef CONFIG_X86_HT +static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_PCI + u32 t, cpn; + u8 n, n_id; + int cpu = smp_processor_id(); + + /* fixup topology information only once for a core */ + if (cpu_has(c, X86_FEATURE_AMD_DCM)) + return; + + /* check for multi-node processor on boot cpu */ + t = read_pci_config(0, 24, 3, 0xe8); + if (!(t & (1 << 29))) + return; + + set_cpu_cap(c, X86_FEATURE_AMD_DCM); + + /* cores per node: each internal node has half the number of cores */ + cpn = c->x86_max_cores >> 1; + + /* even-numbered NB_id of this dual-node processor */ + n = c->phys_proc_id << 1; + + /* + * determine internal node id and assign cores fifty-fifty to + * each node of the dual-node processor + */ + t = read_pci_config(0, 24 + n, 3, 0xe8); + n = (t>>30) & 0x3; + if (n == 0) { + if (c->cpu_core_id < cpn) + n_id = 0; + else + n_id = 1; + } else { + if (c->cpu_core_id < cpn) + n_id = 1; + else + n_id = 0; + } + + /* compute entire NodeID, use llc_shared_map to store sibling info */ + per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id; + + /* fixup core id to be in range from 0 to cpn */ + c->cpu_core_id = c->cpu_core_id % cpn; +#endif +} +#endif + +/* * On a AMD dual core setup the lower bits of the APIC id distingush the cores. * Assumes number of cores is a power of two. */ @@ -258,24 +318,40 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits; + int cpu = smp_processor_id(); bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; + /* fixup topology information on multi-node processors */ + if ((c->x86 == 0x10) && (c->x86_model == 9)) + amd_fixup_dcm(c); #endif } +int amd_get_nb_id(int cpu) +{ + int id = 0; +#ifdef CONFIG_SMP + id = per_cpu(cpu_llc_id, cpu); +#endif + return id; +} +EXPORT_SYMBOL_GPL(amd_get_nb_id); + static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) int cpu = smp_processor_id(); int node; - unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; + unsigned apicid = c->apicid; + + node = per_cpu(cpu_llc_id, cpu); - node = c->phys_proc_id; if (apicid_to_node[apicid] != NUMA_NO_NODE) node = apicid_to_node[apicid]; if (!node_online(node)) { @@ -354,7 +430,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) /* check CPU config space for extended APIC ID */ - if (c->x86 >= 0xf) { + if (cpu_has_apic && c->x86 >= 0xf) { unsigned int val; val = read_pci_config(0, 24, 0, 0x68); if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) @@ -396,11 +472,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) u32 level; level = cpuid_eax(1); - if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* + * Some BIOSes incorrectly force this feature, but only K8 + * revision D (model = 0x14) and later actually support it. + * (AMD Erratum #110, docId: 25759). + */ + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + u64 val; + + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); + if (!rdmsrl_amd_safe(0xc001100d, &val)) { + val &= ~(1ULL << 32); + wrmsrl_amd_safe(0xc001100d, val); + } + } + } if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* get apicid instead of initial apic id from cpuid */ + c->apicid = hard_smp_processor_id(); #else /* @@ -485,27 +580,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. */ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < + printk(KERN_DEBUG "tseg: %010llx\n", tseg); + if ((tseg>>PMD_SHIFT) < (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < + ((tseg>>PMD_SHIFT) < (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) - set_memory_4k((unsigned long)__va(tseg), 1); + (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + set_memory_4k((unsigned long)__va(tseg), 1); } } #endif } #ifdef CONFIG_X86_32 -static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) +static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, + unsigned int size) { /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { - if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */ + /* Duron Rev A0 */ + if (c->x86_model == 3 && c->x86_mask == 0) size = 64; + /* Tbird rev A1/A2 */ if (c->x86_model == 4 && - (c->x86_mask == 0 || c->x86_mask == 1)) /* Tbird rev A1/A2 */ + (c->x86_mask == 0 || c->x86_mask == 1)) size = 256; } return size; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c8e315f..01a2652 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -81,7 +81,7 @@ static void __init check_fpu(void) boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) - printk("Hmm, FPU with FDIV bug.\n"); + printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); } static void __init check_hlt(void) @@ -98,7 +98,7 @@ static void __init check_hlt(void) halt(); halt(); halt(); - printk("OK.\n"); + printk(KERN_CONT "OK.\n"); } /* @@ -122,9 +122,9 @@ static void __init check_popad(void) * CPU hard. Too bad. */ if (res != 12345678) - printk("Buggy.\n"); + printk(KERN_CONT "Buggy.\n"); else - printk("OK.\n"); + printk(KERN_CONT "OK.\n"); #endif } @@ -156,7 +156,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #ifndef CONFIG_SMP - printk("CPU: "); + printk(KERN_INFO "CPU: "); print_cpu_info(&boot_cpu_data); #endif check_config(); diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c index 9a3ed06..04f0fe5 100644 --- a/arch/x86/kernel/cpu/bugs_64.c +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -15,7 +15,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #if !defined(CONFIG_SMP) - printk("CPU: "); + printk(KERN_INFO "CPU: "); print_cpu_info(&boot_cpu_data); #endif alternative_instructions(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5b9cb88..2055fc2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -18,8 +18,8 @@ #include <asm/hypervisor.h> #include <asm/processor.h> #include <asm/sections.h> -#include <asm/topology.h> -#include <asm/cpumask.h> +#include <linux/topology.h> +#include <linux/cpumask.h> #include <asm/pgtable.h> #include <asm/atomic.h> #include <asm/proto.h> @@ -28,13 +28,13 @@ #include <asm/desc.h> #include <asm/i387.h> #include <asm/mtrr.h> -#include <asm/numa.h> +#include <linux/numa.h> #include <asm/asm.h> #include <asm/cpu.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/pat.h> -#include <asm/smp.h> +#include <linux/smp.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> @@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } -static const struct cpu_dev *this_cpu __cpuinitdata; +static void __cpuinit default_init(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + display_cacheinfo(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev __cpuinitconst default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -71,45 +94,45 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) */ - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), #else - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ /* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ /* 32-bit code */ - [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), - [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, - [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), GDT_STACK_CANARY_INIT #endif } }; @@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu) static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_64 - display_cacheinfo(c); -#else - /* Not much we can do here... */ - /* Check if at least it has cpuid */ - if (c->cpuid_level == -1) { - /* No cpuid. It must be an ancient CPU */ - if (c->x86 == 4) - strcpy(c->x86_model_id, "486"); - else if (c->x86 == 3) - strcpy(c->x86_model_id, "386"); - } -#endif -} - -static const struct cpu_dev __cpuinitconst default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", - .c_x86_vendor = X86_VENDOR_UNKNOWN, -}; - static void __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; @@ -487,7 +487,6 @@ out: static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; - static int printed; int i; for (i = 0; i < X86_VENDOR_NUM; i++) { @@ -504,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) } } - if (!printed) { - printed++; - printk(KERN_ERR - "CPU: vendor_id '%s' unknown, using generic init.\n", v); - - printk(KERN_ERR "CPU: Your system may be unstable.\n"); - } + printk_once(KERN_ERR + "CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; @@ -853,9 +848,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) numa_add_cpu(smp_processor_id()); #endif - - /* Cap the iomem address space to what is addressable on all CPUs */ - iomem_resource.end &= (1ULL << c->x86_phys_bits) - 1; } #ifdef CONFIG_X86_64 @@ -990,18 +982,26 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); -DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; +/* + * The following four percpu variables are hot. Align current_task to + * cacheline size such that all four fall in the same cacheline. + */ +DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = + &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(unsigned long, kernel_stack) = (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; EXPORT_PER_CPU_SYMBOL(kernel_stack); +DEFINE_PER_CPU(char *, irq_stack_ptr) = + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + DEFINE_PER_CPU(unsigned int, irq_count) = -1; /* @@ -1016,8 +1016,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { }; static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) - __aligned(PAGE_SIZE); + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); /* May not be marked __init: used by software suspend */ void syscall_init(void) @@ -1050,8 +1049,11 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist); #else /* CONFIG_X86_64 */ +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + #ifdef CONFIG_CC_STACKPROTECTOR -DEFINE_PER_CPU(unsigned long, stack_canary); +DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif /* Make sure %fs and %gs are initialized properly in idle threads */ diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 6b2a52d..dca325c 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -30,8 +30,8 @@ #include <asm/apic.h> #include <asm/desc.h> -static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); -static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); +static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); +static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); static DEFINE_PER_CPU(int, cpu_priv_count); static DEFINE_MUTEX(cpu_debug_lock); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index ae9b5032..4109679 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -60,7 +60,6 @@ enum { }; #define INTEL_MSR_RANGE (0xffff) -#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1) struct acpi_cpufreq_data { struct acpi_processor_performance *acpi_data; @@ -71,11 +70,7 @@ struct acpi_cpufreq_data { static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); -struct acpi_msr_data { - u64 saved_aperf, saved_mperf; -}; - -static DEFINE_PER_CPU(struct acpi_msr_data, msr_data); +static DEFINE_PER_CPU(struct aperfmperf, old_perf); DEFINE_TRACE(power_mark); @@ -244,23 +239,12 @@ static u32 get_cur_val(const struct cpumask *mask) return cmd.val; } -struct perf_pair { - union { - struct { - u32 lo; - u32 hi; - } split; - u64 whole; - } aperf, mperf; -}; - /* Called via smp_call_function_single(), on the target CPU */ static void read_measured_perf_ctrs(void *_cur) { - struct perf_pair *cur = _cur; + struct aperfmperf *am = _cur; - rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi); - rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi); + get_aperfmperf(am); } /* @@ -279,63 +263,17 @@ static void read_measured_perf_ctrs(void *_cur) static unsigned int get_measured_perf(struct cpufreq_policy *policy, unsigned int cpu) { - struct perf_pair readin, cur; - unsigned int perf_percent; + struct aperfmperf perf; + unsigned long ratio; unsigned int retval; - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1)) + if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) return 0; - cur.aperf.whole = readin.aperf.whole - - per_cpu(msr_data, cpu).saved_aperf; - cur.mperf.whole = readin.mperf.whole - - per_cpu(msr_data, cpu).saved_mperf; - per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole; - per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole; - -#ifdef __i386__ - /* - * We dont want to do 64 bit divide with 32 bit kernel - * Get an approximate value. Return failure in case we cannot get - * an approximate value. - */ - if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) { - int shift_count; - u32 h; - - h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi); - shift_count = fls(h); - - cur.aperf.whole >>= shift_count; - cur.mperf.whole >>= shift_count; - } - - if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) { - int shift_count = 7; - cur.aperf.split.lo >>= shift_count; - cur.mperf.split.lo >>= shift_count; - } - - if (cur.aperf.split.lo && cur.mperf.split.lo) - perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo; - else - perf_percent = 0; + ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); + per_cpu(old_perf, cpu) = perf; -#else - if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) { - int shift_count = 7; - cur.aperf.whole >>= shift_count; - cur.mperf.whole >>= shift_count; - } - - if (cur.aperf.whole && cur.mperf.whole) - perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole; - else - perf_percent = 0; - -#endif - - retval = (policy->cpuinfo.max_freq * perf_percent) / 100; + retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; return retval; } @@ -731,12 +669,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) acpi_processor_notify_smm(THIS_MODULE); /* Check for APERF/MPERF support in hardware */ - if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) { - unsigned int ecx; - ecx = cpuid_ecx(6); - if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY) - acpi_cpufreq_driver.getavg = get_measured_perf; - } + if (cpu_has(c, X86_FEATURE_APERFMPERF)) + acpi_cpufreq_driver.getavg = get_measured_perf; dprintk("CPU%u - ACPI performance management activated.\n", cpu); for (i = 0; i < perf->state_count; i++) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index cf52215..2a50ef8 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1,3 +1,4 @@ + /* * (c) 2003-2006 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the @@ -117,20 +118,17 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data) u32 i = 0; if (cpu_family == CPU_HW_PSTATE) { - if (data->currpstate == HW_PSTATE_INVALID) { - /* read (initial) hw pstate if not yet set */ - rdmsr(MSR_PSTATE_STATUS, lo, hi); - i = lo & HW_PSTATE_MASK; - - /* - * a workaround for family 11h erratum 311 might cause - * an "out-of-range Pstate if the core is in Pstate-0 - */ - if (i >= data->numps) - data->currpstate = HW_PSTATE_0; - else - data->currpstate = i; - } + rdmsr(MSR_PSTATE_STATUS, lo, hi); + i = lo & HW_PSTATE_MASK; + data->currpstate = i; + + /* + * a workaround for family 11h erratum 311 might cause + * an "out-of-range Pstate if the core is in Pstate-0 + */ + if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps)) + data->currpstate = HW_PSTATE_0; + return 0; } do { @@ -301,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate) static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid) { - if (core_voltage_pre_transition(data, reqvid)) + if (core_voltage_pre_transition(data, reqvid, reqfid)) return 1; if (core_frequency_transition(data, reqfid)) @@ -329,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data, /* Phase 1 - core voltage transition ... setup voltage */ static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid) + u32 reqvid, u32 reqfid) { u32 rvosteps = data->rvo; u32 savefid = data->currfid; - u32 maxvid, lo; + u32 maxvid, lo, rvomult = 1; dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " "reqvid 0x%x, rvo 0x%x\n", smp_processor_id(), data->currfid, data->currvid, reqvid, data->rvo); + if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) + rvomult = 2; + rvosteps *= rvomult; rdmsr(MSR_FIDVID_STATUS, lo, maxvid); maxvid = 0x1f & (maxvid >> 16); dprintk("ph1 maxvid=0x%x\n", maxvid); @@ -353,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, return 1; } - while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { + while ((rvosteps > 0) && + ((rvomult * data->rvo + data->currvid) > reqvid)) { if (data->currvid == maxvid) { rvosteps = 0; } else { @@ -386,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) u32 vcoreqfid, vcocurrfid, vcofiddiff; u32 fid_interval, savevid = data->currvid; - if ((reqfid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX "ph2: illegal lo-lo transition " - "0x%x 0x%x\n", reqfid, data->currfid); - return 1; - } - if (data->currfid == reqfid) { printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid); @@ -409,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid : vcoreqfid - vcocurrfid; + if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) + vcofiddiff = 0; + while (vcofiddiff > 2) { (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); @@ -510,41 +508,34 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, return 0; } -static int check_supported_cpu(unsigned int cpu) +static void check_supported_cpu(void *_rc) { - cpumask_t oldmask; u32 eax, ebx, ecx, edx; - unsigned int rc = 0; + int *rc = _rc; - oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - - if (smp_processor_id() != cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); - goto out; - } + *rc = -ENODEV; if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) - goto out; + return; eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) - goto out; + return; if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax); - goto out; + return; } eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { printk(KERN_INFO PFX "No frequency change capabilities detected\n"); - goto out; + return; } cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); @@ -552,21 +543,17 @@ static int check_supported_cpu(unsigned int cpu) != P_STATE_TRANSITION_CAPABLE) { printk(KERN_INFO PFX "Power state transitions not supported\n"); - goto out; + return; } } else { /* must be a HW Pstate capable processor */ cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) cpu_family = CPU_HW_PSTATE; else - goto out; + return; } - rc = 1; - -out: - set_cpus_allowed_ptr(current, &oldmask); - return rc; + *rc = 0; } static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, @@ -823,13 +810,14 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) return; - control = data->acpi_data.states[index].control; data->irt = (control - >> IRT_SHIFT) & IRT_MASK; data->rvo = (control >> - RVO_SHIFT) & RVO_MASK; data->exttype = (control - >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; - data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; data->vidmvs = 1 - << ((control >> MVS_SHIFT) & MVS_MASK); data->vstable = - (control >> VST_SHIFT) & VST_MASK; } + control = data->acpi_data.states[index].control; + data->irt = (control >> IRT_SHIFT) & IRT_MASK; + data->rvo = (control >> RVO_SHIFT) & RVO_MASK; + data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; + data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; + data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); + data->vstable = (control >> VST_SHIFT) & VST_MASK; +} static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { @@ -1046,6 +1034,19 @@ static int get_transition_latency(struct powernow_k8_data *data) if (cur_latency > max_latency) max_latency = cur_latency; } + if (max_latency == 0) { + /* + * Fam 11h always returns 0 as transition latency. + * This is intended and means "very fast". While cpufreq core + * and governors currently can handle that gracefully, better + * set it to 1 to avoid problems in the future. + * For all others it's a BIOS bug. + */ + if (!boot_cpu_data.x86 == 0x11) + printk(KERN_ERR FW_WARN PFX "Invalid zero transition " + "latency\n"); + max_latency = 1; + } /* value in usecs, needs to be in nanoseconds */ return 1000 * max_latency; } @@ -1080,20 +1081,12 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, return 0; } - if ((fid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX - "ignoring illegal change in lo freq table-%x to 0x%x\n", - data->currfid, fid); - return 1; - } - dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", smp_processor_id(), fid, vid); freqs.old = find_khz_freq_from_fid(data->currfid); freqs.new = find_khz_freq_from_fid(fid); - for_each_cpu_mask_nr(i, *(data->available_cores)) { + for_each_cpu(i, data->available_cores) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -1101,7 +1094,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, res = transition_fid_vid(data, fid, vid); freqs.new = find_khz_freq_from_fid(data->currfid); - for_each_cpu_mask_nr(i, *(data->available_cores)) { + for_each_cpu(i, data->available_cores) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -1126,7 +1119,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, data->currpstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask_nr(i, *(data->available_cores)) { + for_each_cpu(i, data->available_cores) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -1134,7 +1127,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, res = transition_pstate(data, pstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask_nr(i, *(data->available_cores)) { + for_each_cpu(i, data->available_cores) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -1235,21 +1228,47 @@ static int powernowk8_verify(struct cpufreq_policy *pol) return cpufreq_frequency_table_verify(pol, data->powernow_table); } -static const char ACPI_PSS_BIOS_BUG_MSG[] = - KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" - KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; +struct init_on_cpu { + struct powernow_k8_data *data; + int rc; +}; + +static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu) +{ + struct init_on_cpu *init_on_cpu = _init_on_cpu; + + if (pending_bit_stuck()) { + printk(KERN_ERR PFX "failing init, change pending bit set\n"); + init_on_cpu->rc = -ENODEV; + return; + } + + if (query_current_values_with_pending_wait(init_on_cpu->data)) { + init_on_cpu->rc = -ENODEV; + return; + } + + if (cpu_family == CPU_OPTERON) + fidvid_msr_init(); + + init_on_cpu->rc = 0; +} /* per CPU init entry point to the driver */ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { + static const char ACPI_PSS_BIOS_BUG_MSG[] = + KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" + FW_BUG PFX "Try again with latest BIOS.\n"; struct powernow_k8_data *data; - cpumask_t oldmask; + struct init_on_cpu init_on_cpu; int rc; if (!cpu_online(pol->cpu)) return -ENODEV; - if (!check_supported_cpu(pol->cpu)) + smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1); + if (rc) return -ENODEV; data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); @@ -1289,27 +1308,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) pol->cpuinfo.transition_latency = get_transition_latency(data); /* only run on specific CPU from here on */ - oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); - - if (smp_processor_id() != pol->cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); - goto err_out_unmask; - } - - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing init, change pending bit set\n"); - goto err_out_unmask; - } - - if (query_current_values_with_pending_wait(data)) - goto err_out_unmask; - - if (cpu_family == CPU_OPTERON) - fidvid_msr_init(); - - /* run on any CPU again */ - set_cpus_allowed_ptr(current, &oldmask); + init_on_cpu.data = data; + smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu, + &init_on_cpu, 1); + rc = init_on_cpu.rc; + if (rc != 0) + goto err_out_exit_acpi; if (cpu_family == CPU_HW_PSTATE) cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); @@ -1346,8 +1350,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) return 0; -err_out_unmask: - set_cpus_allowed_ptr(current, &oldmask); +err_out_exit_acpi: powernow_k8_cpu_exit_acpi(data); err_out: @@ -1372,28 +1375,25 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) return 0; } +static void query_values_on_cpu(void *_err) +{ + int *err = _err; + struct powernow_k8_data *data = __get_cpu_var(powernow_data); + + *err = query_current_values_with_pending_wait(data); +} + static unsigned int powernowk8_get(unsigned int cpu) { - struct powernow_k8_data *data; - cpumask_t oldmask = current->cpus_allowed; + struct powernow_k8_data *data = per_cpu(powernow_data, cpu); unsigned int khz = 0; - unsigned int first; - - first = cpumask_first(cpu_core_mask(cpu)); - data = per_cpu(powernow_data, first); + int err; if (!data) return -EINVAL; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - if (smp_processor_id() != cpu) { - printk(KERN_ERR PFX - "limiting to CPU %d failed in powernowk8_get\n", cpu); - set_cpus_allowed_ptr(current, &oldmask); - return 0; - } - - if (query_current_values_with_pending_wait(data)) + smp_call_function_single(cpu, query_values_on_cpu, &err, true); + if (err) goto out; if (cpu_family == CPU_HW_PSTATE) @@ -1404,7 +1404,6 @@ static unsigned int powernowk8_get(unsigned int cpu) out: - set_cpus_allowed_ptr(current, &oldmask); return khz; } @@ -1430,7 +1429,9 @@ static int __cpuinit powernowk8_init(void) unsigned int i, supported_cpus = 0; for_each_online_cpu(i) { - if (check_supported_cpu(i)) + int rc; + smp_call_function_single(i, check_supported_cpu, &rc, 1); + if (rc == 0) supported_cpus++; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index 6c6698f..02ce824 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -215,7 +215,8 @@ struct pst_s { #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) -static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); +static int core_voltage_pre_transition(struct powernow_k8_data *data, + u32 reqvid, u32 regfid); static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); @@ -223,14 +224,3 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); - -#ifdef CONFIG_SMP -static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) -{ -} -#else -static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) -{ - cpu_set(0, cpu_sharedcore_mask[0]); -} -#endif diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 55c831e..8d672ef 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -323,14 +323,8 @@ static unsigned int get_cur_freq(unsigned int cpu) { unsigned l, h; unsigned clock_freq; - cpumask_t saved_mask; - saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - if (smp_processor_id() != cpu) - return 0; - - rdmsr(MSR_IA32_PERF_STATUS, l, h); + rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h); clock_freq = extract_clock(l, cpu, 0); if (unlikely(clock_freq == 0)) { @@ -340,11 +334,9 @@ static unsigned int get_cur_freq(unsigned int cpu) * P-state transition (like TM2). Get the last freq set * in PERF_CTL. */ - rdmsr(MSR_IA32_PERF_CTL, l, h); + rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h); clock_freq = extract_clock(l, cpu, 1); } - - set_cpus_allowed_ptr(current, &saved_mask); return clock_freq; } @@ -467,15 +459,10 @@ static int centrino_target (struct cpufreq_policy *policy, struct cpufreq_freqs freqs; int retval = 0; unsigned int j, k, first_cpu, tmp; - cpumask_var_t saved_mask, covered_cpus; + cpumask_var_t covered_cpus; - if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) - return -ENOMEM; - if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { - free_cpumask_var(saved_mask); + if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) return -ENOMEM; - } - cpumask_copy(saved_mask, ¤t->cpus_allowed); if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { retval = -ENODEV; @@ -493,7 +480,7 @@ static int centrino_target (struct cpufreq_policy *policy, first_cpu = 1; for_each_cpu(j, policy->cpus) { - const struct cpumask *mask; + int good_cpu; /* cpufreq holds the hotplug lock, so we are safe here */ if (!cpu_online(j)) @@ -504,32 +491,30 @@ static int centrino_target (struct cpufreq_policy *policy, * Make sure we are running on CPU that wants to change freq */ if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - mask = policy->cpus; + good_cpu = cpumask_any_and(policy->cpus, + cpu_online_mask); else - mask = cpumask_of(j); + good_cpu = j; - set_cpus_allowed_ptr(current, mask); - preempt_disable(); - if (unlikely(!cpu_isset(smp_processor_id(), *mask))) { + if (good_cpu >= nr_cpu_ids) { dprintk("couldn't limit to CPUs in this domain\n"); retval = -EAGAIN; if (first_cpu) { /* We haven't started the transition yet. */ - goto migrate_end; + goto out; } - preempt_enable(); break; } msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; if (first_cpu) { - rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); + rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h); if (msr == (oldmsr & 0xffff)) { dprintk("no change needed - msr was and needs " "to be %x\n", oldmsr); retval = 0; - goto migrate_end; + goto out; } freqs.old = extract_clock(oldmsr, cpu, 0); @@ -553,14 +538,11 @@ static int centrino_target (struct cpufreq_policy *policy, oldmsr |= msr; } - wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - preempt_enable(); + wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h); + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) break; - } - cpu_set(j, *covered_cpus); - preempt_enable(); + cpumask_set_cpu(j, covered_cpus); } for_each_cpu(k, policy->cpus) { @@ -578,10 +560,8 @@ static int centrino_target (struct cpufreq_policy *policy, * Best effort undo.. */ - for_each_cpu_mask_nr(j, *covered_cpus) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); - wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); - } + for_each_cpu(j, covered_cpus) + wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h); tmp = freqs.new; freqs.new = freqs.old; @@ -593,15 +573,9 @@ static int centrino_target (struct cpufreq_policy *policy, cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } } - set_cpus_allowed_ptr(current, saved_mask); retval = 0; - goto out; -migrate_end: - preempt_enable(); - set_cpus_allowed_ptr(current, saved_mask); out: - free_cpumask_var(saved_mask); free_cpumask_var(covered_cpus); return retval; } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 016c1a4..6911e91 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -89,7 +89,8 @@ static int speedstep_find_register(void) * speedstep_set_state - set the SpeedStep state * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) * - * Tries to change the SpeedStep state. + * Tries to change the SpeedStep state. Can be called from + * smp_call_function_single. */ static void speedstep_set_state(unsigned int state) { @@ -143,6 +144,11 @@ static void speedstep_set_state(unsigned int state) return; } +/* Wrapper for smp_call_function_single. */ +static void _speedstep_set_state(void *_state) +{ + speedstep_set_state(*(unsigned int *)_state); +} /** * speedstep_activate - activate SpeedStep control in the chipset @@ -226,22 +232,28 @@ static unsigned int speedstep_detect_chipset(void) return 0; } -static unsigned int _speedstep_get(const struct cpumask *cpus) -{ +struct get_freq_data { unsigned int speed; - cpumask_t cpus_allowed; - - cpus_allowed = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpus); - speed = speedstep_get_frequency(speedstep_processor); - set_cpus_allowed_ptr(current, &cpus_allowed); - dprintk("detected %u kHz as current frequency\n", speed); - return speed; + unsigned int processor; +}; + +static void get_freq_data(void *_data) +{ + struct get_freq_data *data = _data; + + data->speed = speedstep_get_frequency(data->processor); } static unsigned int speedstep_get(unsigned int cpu) { - return _speedstep_get(cpumask_of(cpu)); + struct get_freq_data data = { .processor = cpu }; + + /* You're supposed to ensure CPU is online. */ + if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) + BUG(); + + dprintk("detected %u kHz as current frequency\n", data.speed); + return data.speed; } /** @@ -257,16 +269,16 @@ static int speedstep_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - unsigned int newstate = 0; + unsigned int newstate = 0, policy_cpu; struct cpufreq_freqs freqs; - cpumask_t cpus_allowed; int i; if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) return -EINVAL; - freqs.old = _speedstep_get(policy->cpus); + policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); + freqs.old = speedstep_get(policy_cpu); freqs.new = speedstep_freqs[newstate].frequency; freqs.cpu = policy->cpu; @@ -276,20 +288,13 @@ static int speedstep_target(struct cpufreq_policy *policy, if (freqs.old == freqs.new) return 0; - cpus_allowed = current->cpus_allowed; - for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } - /* switch to physical CPU where state is to be changed */ - set_cpus_allowed_ptr(current, policy->cpus); - - speedstep_set_state(newstate); - - /* allow to be run on all CPUs */ - set_cpus_allowed_ptr(current, &cpus_allowed); + smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate, + true); for_each_cpu(i, policy->cpus) { freqs.cpu = i; @@ -312,33 +317,43 @@ static int speedstep_verify(struct cpufreq_policy *policy) return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); } +struct get_freqs { + struct cpufreq_policy *policy; + int ret; +}; + +static void get_freqs_on_cpu(void *_get_freqs) +{ + struct get_freqs *get_freqs = _get_freqs; + + get_freqs->ret = + speedstep_get_freqs(speedstep_processor, + &speedstep_freqs[SPEEDSTEP_LOW].frequency, + &speedstep_freqs[SPEEDSTEP_HIGH].frequency, + &get_freqs->policy->cpuinfo.transition_latency, + &speedstep_set_state); +} static int speedstep_cpu_init(struct cpufreq_policy *policy) { - int result = 0; - unsigned int speed; - cpumask_t cpus_allowed; + int result; + unsigned int policy_cpu, speed; + struct get_freqs gf; /* only run on CPU to be set, or on its sibling */ #ifdef CONFIG_SMP cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); #endif - - cpus_allowed = current->cpus_allowed; - set_cpus_allowed_ptr(current, policy->cpus); + policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); /* detect low and high frequency and transition latency */ - result = speedstep_get_freqs(speedstep_processor, - &speedstep_freqs[SPEEDSTEP_LOW].frequency, - &speedstep_freqs[SPEEDSTEP_HIGH].frequency, - &policy->cpuinfo.transition_latency, - &speedstep_set_state); - set_cpus_allowed_ptr(current, &cpus_allowed); - if (result) - return result; + gf.policy = policy; + smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1); + if (gf.ret) + return gf.ret; /* get current speed setting */ - speed = _speedstep_get(policy->cpus); + speed = speedstep_get(policy_cpu); if (!speed) return -EIO; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index 2e3c686..f4c290b 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -226,6 +226,7 @@ static unsigned int pentium4_get_frequency(void) } +/* Warning: may get called from smp_call_function_single. */ unsigned int speedstep_get_frequency(unsigned int processor) { switch (processor) { diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 593171e..19807b8 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -3,10 +3,10 @@ #include <linux/delay.h> #include <linux/pci.h> #include <asm/dma.h> -#include <asm/io.h> +#include <linux/io.h> #include <asm/processor-cyrix.h> #include <asm/processor-flags.h> -#include <asm/timer.h> +#include <linux/timer.h> #include <asm/pci-direct.h> #include <asm/tsc.h> @@ -282,7 +282,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) * The 5510/5520 companion chips have a funky PIT. */ if (vendor == PCI_VENDOR_ID_CYRIX && - (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) + (device == PCI_DEVICE_ID_CYRIX_5510 || + device == PCI_DEVICE_ID_CYRIX_5520)) mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif @@ -299,7 +300,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) * ? : 0x7x * GX1 : 0x8x GX1 datasheet 56 */ - if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) + if ((0x30 <= dir1 && dir1 <= 0x6f) || + (0x80 <= dir1 && dir1 <= 0x8f)) geode_configure(); return; } else { /* MediaGX */ @@ -427,9 +429,12 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */ - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + /* enable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); + /* enable cpuid */ + setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); + /* disable MAPEN */ + setCx86(CX86_CCR3, ccr3); local_irq_restore(flags); } } diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index fb5b86a..93ba8ee 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -28,11 +28,10 @@ static inline void __cpuinit detect_hypervisor_vendor(struct cpuinfo_x86 *c) { - if (vmware_platform()) { + if (vmware_platform()) c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; - } else { + else c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; - } } unsigned long get_hypervisor_tsc_freq(void) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index daed39b..40e1835 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,17 +7,17 @@ #include <linux/sched.h> #include <linux/thread_info.h> #include <linux/module.h> +#include <linux/uaccess.h> #include <asm/processor.h> #include <asm/pgtable.h> #include <asm/msr.h> -#include <asm/uaccess.h> #include <asm/ds.h> #include <asm/bugs.h> #include <asm/cpu.h> #ifdef CONFIG_X86_64 -#include <asm/topology.h> +#include <linux/topology.h> #include <asm/numa_64.h> #endif @@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && c->x86_model < 15) clear_cpu_cap(c, X86_FEATURE_PAT); + +#ifdef CONFIG_KMEMCHECK + /* + * P4s have a "fast strings" feature which causes single- + * stepping REP instructions to only generate a #DB on + * cache-line boundaries. + * + * Ingo Molnar reported a Pentium D (model 6) and a Xeon + * (model 2) with the same problem. + */ + if (c->x86 == 15) { + u64 misc_enable; + + rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + + if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { + printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); + + misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; + wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + } + } +#endif } #ifdef CONFIG_X86_32 @@ -151,7 +174,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_F00F_BUG /* * All current models of Pentium and Pentium with MMX technology CPUs - * have the F0 0F bug, which lets nonprivileged users lock up the system. + * have the F0 0F bug, which lets nonprivileged users lock up the + * system. * Note that the workaround only should be initialized once... */ c->f00f_bug = 0; @@ -184,7 +208,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; - wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); + wrmsr(MSR_IA32_MISC_ENABLE, lo, hi); } } @@ -260,7 +284,7 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) /* Intel has a non-standard dependency on %ecx for this CPUID level. */ cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); if (eax & 0x1f) - return ((eax >> 26) + 1); + return (eax >> 26) + 1; else return 1; } @@ -326,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } + if (c->cpuid_level > 6) { + unsigned ecx = cpuid_ecx(6); + if (ecx & 0x01) + set_cpu_cap(c, X86_FEATURE_APERFMPERF); + } + if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (cpu_has_ds) { diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 789efe2..804c40e 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -3,7 +3,7 @@ * * Changes: * Venkatesh Pallipadi : Adding cache identification through cpuid(4) - * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. + * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ @@ -16,7 +16,7 @@ #include <linux/pci.h> #include <asm/processor.h> -#include <asm/smp.h> +#include <linux/smp.h> #include <asm/k8.h> #define LVL_1_INST 1 @@ -25,14 +25,15 @@ #define LVL_3 4 #define LVL_TRACE 5 -struct _cache_table -{ +struct _cache_table { unsigned char descriptor; char cache_type; short size; }; -/* all the cache descriptor types we care about (no TLB or trace cache entries) */ +/* All the cache descriptor types we care about (no TLB or + trace cache entries) */ + static const struct _cache_table __cpuinitconst cache_table[] = { { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ @@ -105,8 +106,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = }; -enum _cache_type -{ +enum _cache_type { CACHE_TYPE_NULL = 0, CACHE_TYPE_DATA = 1, CACHE_TYPE_INST = 2, @@ -170,31 +170,31 @@ unsigned short num_cache_leaves; Maybe later */ union l1_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 8; - unsigned assoc : 8; - unsigned size_in_kb : 8; + unsigned line_size:8; + unsigned lines_per_tag:8; + unsigned assoc:8; + unsigned size_in_kb:8; }; unsigned val; }; union l2_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 4; - unsigned assoc : 4; - unsigned size_in_kb : 16; + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned size_in_kb:16; }; unsigned val; }; union l3_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 4; - unsigned assoc : 4; - unsigned res : 2; - unsigned size_encoded : 14; + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned res:2; + unsigned size_encoded:14; }; unsigned val; }; @@ -241,7 +241,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, case 0: if (!l1->val) return; - assoc = l1->assoc; + assoc = assocs[l1->assoc]; line_size = l1->line_size; lines_per_tag = l1->lines_per_tag; size_in_kb = l1->size_in_kb; @@ -249,7 +249,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, case 2: if (!l2.val) return; - assoc = l2.assoc; + assoc = assocs[l2.assoc]; line_size = l2.line_size; lines_per_tag = l2.lines_per_tag; /* cpu_data has errata corrections for K7 applied */ @@ -258,10 +258,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, case 3: if (!l3.val) return; - assoc = l3.assoc; + assoc = assocs[l3.assoc]; line_size = l3.line_size; lines_per_tag = l3.lines_per_tag; size_in_kb = l3.size_encoded * 512; + if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { + size_in_kb = size_in_kb >> 1; + assoc = assoc >> 1; + } break; default: return; @@ -270,18 +274,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.is_self_initializing = 1; eax->split.type = types[leaf]; eax->split.level = levels[leaf]; - if (leaf == 3) - eax->split.num_threads_sharing = - current_cpu_data.x86_max_cores - 1; - else - eax->split.num_threads_sharing = 0; + eax->split.num_threads_sharing = 0; eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; - if (assoc == 0xf) + if (assoc == 0xffff) eax->split.is_fully_associative = 1; ebx->split.coherency_line_size = line_size - 1; - ebx->split.ways_of_associativity = assocs[assoc] - 1; + ebx->split.ways_of_associativity = assoc - 1; ebx->split.physical_line_partition = lines_per_tag - 1; ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / (ebx->split.ways_of_associativity + 1) - 1; @@ -350,7 +350,8 @@ static int __cpuinit find_num_cache_leaves(void) unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) { - unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ + /* Cache sizes */ + unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; @@ -377,8 +378,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) retval = cpuid4_cache_lookup_regs(i, &this_leaf); if (retval >= 0) { - switch(this_leaf.eax.split.level) { - case 1: + switch (this_leaf.eax.split.level) { + case 1: if (this_leaf.eax.split.type == CACHE_TYPE_DATA) new_l1d = this_leaf.size/1024; @@ -386,19 +387,20 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) CACHE_TYPE_INST) new_l1i = this_leaf.size/1024; break; - case 2: + case 2: new_l2 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); l2_id = c->apicid >> index_msb; break; - case 3: + case 3: new_l3 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); + index_msb = get_count_order( + num_threads_sharing); l3_id = c->apicid >> index_msb; break; - default: + default: break; } } @@ -421,22 +423,21 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; - for ( i = 0 ; i < n ; i++ ) { + for (i = 0 ; i < n ; i++) { cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); /* If bit 31 is set, this is an unknown format */ - for ( j = 0 ; j < 3 ; j++ ) { - if (regs[j] & (1 << 31)) regs[j] = 0; - } + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; /* Byte 0 is level count, not a descriptor */ - for ( j = 1 ; j < 16 ; j++ ) { + for (j = 1 ; j < 16 ; j++) { unsigned char des = dp[j]; unsigned char k = 0; /* look up this descriptor in the table */ - while (cache_table[k].descriptor != 0) - { + while (cache_table[k].descriptor != 0) { if (cache_table[k].descriptor == des) { if (only_trace && cache_table[k].cache_type != LVL_TRACE) break; @@ -488,14 +489,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) } if (trace) - printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); - else if ( l1i ) - printk (KERN_INFO "CPU: L1 I cache: %dK", l1i); + printk(KERN_INFO "CPU: Trace cache: %dK uops", trace); + else if (l1i) + printk(KERN_INFO "CPU: L1 I cache: %dK", l1i); if (l1d) - printk(", L1 D cache: %dK\n", l1d); + printk(KERN_CONT ", L1 D cache: %dK\n", l1d); else - printk("\n"); + printk(KERN_CONT "\n"); if (l2) printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); @@ -522,6 +523,18 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) int index_msb, i; struct cpuinfo_x86 *c = &cpu_data(cpu); + if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { + struct cpuinfo_x86 *d; + for_each_online_cpu(i) { + if (!per_cpu(cpuid4_info, i)) + continue; + d = &cpu_data(i); + this_leaf = CPUID4_INFO_IDX(i, index); + cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), + d->llc_shared_map); + } + return; + } this_leaf = CPUID4_INFO_IDX(cpu, index); num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; @@ -558,8 +571,13 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) } } #else -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) {} -static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) {} +static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ +} + +static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) +{ +} #endif static void __cpuinit free_cache_attributes(unsigned int cpu) @@ -645,7 +663,7 @@ static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); static ssize_t show_##file_name \ (struct _cpuid4_info *this_leaf, char *buf) \ { \ - return sprintf (buf, "%lu\n", (unsigned long)this_leaf->object + val); \ + return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } show_one_plus(level, eax.split.level, 0); @@ -656,7 +674,7 @@ show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) { - return sprintf (buf, "%luK\n", this_leaf->size / 1024); + return sprintf(buf, "%luK\n", this_leaf->size / 1024); } static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, @@ -669,7 +687,7 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, const struct cpumask *mask; mask = to_cpumask(this_leaf->shared_cpu_map); - n = type? + n = type ? cpulist_scnprintf(buf, len-2, mask) : cpumask_scnprintf(buf, len-2, mask); buf[n++] = '\n'; @@ -800,7 +818,7 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); -static struct attribute * default_attrs[] = { +static struct attribute *default_attrs[] = { &type.attr, &level.attr, &coherency_line_size.attr, @@ -815,7 +833,7 @@ static struct attribute * default_attrs[] = { NULL }; -static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct _cache_attr *fattr = to_attr(attr); struct _index_kobject *this_leaf = to_object(kobj); @@ -828,8 +846,8 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) return ret; } -static ssize_t store(struct kobject * kobj, struct attribute * attr, - const char * buf, size_t count) +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) { struct _cache_attr *fattr = to_attr(attr); struct _index_kobject *this_leaf = to_object(kobj); @@ -883,7 +901,7 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) goto err_out; per_cpu(index_kobject, cpu) = kzalloc( - sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); + sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); if (unlikely(per_cpu(index_kobject, cpu) == NULL)) goto err_out; @@ -917,7 +935,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } for (i = 0; i < num_cache_leaves; i++) { - this_object = INDEX_KOBJECT_PTR(cpu,i); + this_object = INDEX_KOBJECT_PTR(cpu, i); this_object->cpu = cpu; this_object->index = i; retval = kobject_init_and_add(&(this_object->kobj), @@ -925,9 +943,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) per_cpu(cache_kobject, cpu), "index%1lu", i); if (unlikely(retval)) { - for (j = 0; j < i; j++) { - kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj)); - } + for (j = 0; j < i; j++) + kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); return retval; @@ -952,7 +969,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); for (i = 0; i < num_cache_leaves; i++) - kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); + kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); } @@ -977,8 +994,7 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = -{ +static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { .notifier_call = cacheinfo_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 325559d..2f5aab2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -165,6 +165,11 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +void __weak decode_mce(struct mce *m) +{ + return; +} + static void print_mce(struct mce *m) { printk(KERN_EMERG @@ -176,28 +181,33 @@ static void print_mce(struct mce *m) m->cs, m->ip); if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); - printk("\n"); + printk(KERN_CONT "\n"); } printk(KERN_EMERG "TSC %llx ", m->tsc); if (m->addr) - printk("ADDR %llx ", m->addr); + printk(KERN_CONT "ADDR %llx ", m->addr); if (m->misc) - printk("MISC %llx ", m->misc); - printk("\n"); + printk(KERN_CONT "MISC %llx ", m->misc); + printk(KERN_CONT "\n"); printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + + decode_mce(m); } static void print_mce_head(void) { - printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); + printk(KERN_EMERG "\nHARDWARE ERROR\n"); } static void print_mce_tail(void) { printk(KERN_EMERG "This is not a software problem!\n" - KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); +#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD)) + "Run through mcelog --ascii to decode and contact your hardware vendor\n" +#endif + ); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -1085,7 +1095,7 @@ void mce_log_therm_throt_event(__u64 status) */ static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); static void mcheck_timer(unsigned long data) @@ -1104,14 +1114,14 @@ static void mcheck_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(next_interval); + n = &__get_cpu_var(mce_next_interval); if (mce_notify_irq()) *n = max(*n/2, HZ/100); else *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); t->expires = jiffies + *n; - add_timer(t); + add_timer_on(t, smp_processor_id()); } static void mce_do_trigger(struct work_struct *work) @@ -1235,8 +1245,13 @@ static void mce_init(void) } /* Add per CPU specific workarounds here */ -static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); + return -EOPNOTSUPP; + } + /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { if (c->x86 == 15 && banks > 4) { @@ -1283,14 +1298,19 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) monarch_timeout < 0) monarch_timeout = USEC_PER_SEC; - /* There are also broken BIOSes on some Pentium M systems. */ - if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0) + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) mce_bootlog = 0; } if (monarch_timeout < 0) monarch_timeout = 0; if (mce_bootlog != 0) mce_panic_timeout = 30; + + return 0; } static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) @@ -1324,7 +1344,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) static void mce_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); + int *n = &__get_cpu_var(mce_next_interval); if (mce_ignore_ce) return; @@ -1334,7 +1354,7 @@ static void mce_init_timer(void) return; setup_timer(t, mcheck_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); - add_timer(t); + add_timer_on(t, smp_processor_id()); } /* Handle unconfigured int18 (should never happen) */ @@ -1362,11 +1382,10 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) if (!mce_available(c)) return; - if (mce_cap_init() < 0) { + if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { mce_disabled = 1; return; } - mce_cpu_quirks(c); machine_check_vector = do_machine_check; @@ -1720,17 +1739,15 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, const char *buf, size_t siz) { char *p; - int len; strncpy(mce_helper, buf, sizeof(mce_helper)); mce_helper[sizeof(mce_helper)-1] = 0; - len = strlen(mce_helper); p = strchr(mce_helper, '\n'); - if (*p) + if (p) *p = 0; - return len; + return strlen(mce_helper) + !!p; } static ssize_t set_ignore_ce(struct sys_device *s, @@ -1944,7 +1961,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); + __get_cpu_var(mce_next_interval)); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ddae216..8cd5224 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -69,7 +69,7 @@ struct threshold_bank { struct threshold_block *blocks; cpumask_var_t cpus; }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); +static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); #ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { @@ -489,12 +489,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) int i, err = 0; struct threshold_bank *b = NULL; char name[32]; + struct cpuinfo_x86 *c = &cpu_data(cpu); + sprintf(name, "threshold_bank%i", bank); #ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(cpu_core_mask(cpu)); + i = cpumask_first(c->llc_shared_map); /* first core not up yet */ if (cpu_data(i).cpu_core_id) @@ -514,7 +516,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out; - cpumask_copy(b->cpus, cpu_core_mask(cpu)); + cpumask_copy(b->cpus, c->llc_shared_map); per_cpu(threshold_banks, cpu)[bank] = b; goto out; @@ -539,7 +541,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) #ifndef CONFIG_SMP cpumask_setall(b->cpus); #else - cpumask_copy(b->cpus, cpu_core_mask(cpu)); + cpumask_copy(b->cpus, c->llc_shared_map); #endif per_cpu(threshold_banks, cpu)[bank] = b; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 15f2bc0..63a56d1 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -36,6 +36,7 @@ static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); +static DEFINE_PER_CPU(bool, thermal_throttle_active); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -96,27 +97,33 @@ static int therm_throt_process(int curr) { unsigned int cpu = smp_processor_id(); __u64 tmp_jiffs = get_jiffies_64(); + bool was_throttled = __get_cpu_var(thermal_throttle_active); + bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; - if (curr) + if (is_throttled) __get_cpu_var(thermal_throttle_count)++; - if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) + if (!(was_throttled ^ is_throttled) && + time_before64(tmp_jiffs, __get_cpu_var(next_check))) return 0; __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; /* if we just entered the thermal event */ - if (curr) { + if (is_throttled) { printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled (total events = %lu)\n", cpu, - __get_cpu_var(thermal_throttle_count)); + "cpu clock throttled (total events = %lu)\n", + cpu, __get_cpu_var(thermal_throttle_count)); add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + return 1; + } + if (was_throttled) { + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); + return 1; } - return 1; + return 0; } #ifdef CONFIG_SYSFS diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index ee2331b..33af141 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -7,15 +7,15 @@ static void amd_get_mtrr(unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type * type) + unsigned long *size, mtrr_type *type) { unsigned long low, high; rdmsr(MSR_K6_UWCCR, low, high); - /* Upper dword is region 1, lower is region 0 */ + /* Upper dword is region 1, lower is region 0 */ if (reg == 1) low = high; - /* The base masks off on the right alignment */ + /* The base masks off on the right alignment */ *base = (low & 0xFFFE0000) >> PAGE_SHIFT; *type = 0; if (low & 1) @@ -27,74 +27,81 @@ amd_get_mtrr(unsigned int reg, unsigned long *base, return; } /* - * This needs a little explaining. The size is stored as an - * inverted mask of bits of 128K granularity 15 bits long offset - * 2 bits + * This needs a little explaining. The size is stored as an + * inverted mask of bits of 128K granularity 15 bits long offset + * 2 bits. * - * So to get a size we do invert the mask and add 1 to the lowest - * mask bit (4 as its 2 bits in). This gives us a size we then shift - * to turn into 128K blocks + * So to get a size we do invert the mask and add 1 to the lowest + * mask bit (4 as its 2 bits in). This gives us a size we then shift + * to turn into 128K blocks. * - * eg 111 1111 1111 1100 is 512K + * eg 111 1111 1111 1100 is 512K * - * invert 000 0000 0000 0011 - * +1 000 0000 0000 0100 - * *128K ... + * invert 000 0000 0000 0011 + * +1 000 0000 0000 0100 + * *128K ... */ low = (~low) & 0x1FFFC; *size = (low + 4) << (15 - PAGE_SHIFT); - return; } -static void amd_set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) -/* [SUMMARY] Set variable MTRR register on the local CPU. - <reg> The register to set. - <base> The base address of the region. - <size> The size of the region. If this is 0 the region is disabled. - <type> The type of the region. - [RETURNS] Nothing. -*/ +/** + * amd_set_mtrr - Set variable MTRR register on the local CPU. + * + * @reg The register to set. + * @base The base address of the region. + * @size The size of the region. If this is 0 the region is disabled. + * @type The type of the region. + * + * Returns nothing. + */ +static void +amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { u32 regs[2]; /* - * Low is MTRR0 , High MTRR 1 + * Low is MTRR0, High MTRR 1 */ rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); /* - * Blank to disable + * Blank to disable */ - if (size == 0) + if (size == 0) { regs[reg] = 0; - else - /* Set the register to the base, the type (off by one) and an - inverted bitmask of the size The size is the only odd - bit. We are fed say 512K We invert this and we get 111 1111 - 1111 1011 but if you subtract one and invert you get the - desired 111 1111 1111 1100 mask - - But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ + } else { + /* + * Set the register to the base, the type (off by one) and an + * inverted bitmask of the size The size is the only odd + * bit. We are fed say 512K We invert this and we get 111 1111 + * 1111 1011 but if you subtract one and invert you get the + * desired 111 1111 1111 1100 mask + * + * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! + */ regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) | (base << PAGE_SHIFT) | (type + 1); + } /* - * The writeback rule is quite specific. See the manual. Its - * disable local interrupts, write back the cache, set the mtrr + * The writeback rule is quite specific. See the manual. Its + * disable local interrupts, write back the cache, set the mtrr */ wbinvd(); wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); } -static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +static int +amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) { - /* Apply the K6 block alignment and size rules - In order - o Uncached or gathering only - o 128K or bigger block - o Power of 2 block - o base suitably aligned to the power - */ + /* + * Apply the K6 block alignment and size rules + * In order + * o Uncached or gathering only + * o 128K or bigger block + * o Power of 2 block + * o base suitably aligned to the power + */ if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) || (size & ~(size - 1)) - size || (base & (size - 1))) return -EINVAL; @@ -115,5 +122,3 @@ int __init amd_init_mtrr(void) set_mtrr_ops(&amd_mtrr_ops); return 0; } - -//arch_initcall(amd_mtrr_init); diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index cb9aa3a..de89f14 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -1,7 +1,9 @@ #include <linux/init.h> #include <linux/mm.h> + #include <asm/mtrr.h> #include <asm/msr.h> + #include "mtrr.h" static struct { @@ -12,25 +14,25 @@ static struct { static u8 centaur_mcr_reserved; static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ -/* - * Report boot time MCR setups +/** + * centaur_get_free_region - Get a free MTRR. + * + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. */ - static int centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) -/* [SUMMARY] Get a free MTRR. - <base> The starting (base) address of the region. - <size> The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ { - int i, max; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; max = num_var_ranges; if (replace_reg >= 0 && replace_reg < max) return replace_reg; + for (i = 0; i < max; ++i) { if (centaur_mcr_reserved & (1 << i)) continue; @@ -38,11 +40,14 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) if (lsize == 0) return i; } + return -ENOSPC; } -void -mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) +/* + * Report boot time MCR setups + */ +void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { centaur_mcr[mcr].low = lo; centaur_mcr[mcr].high = hi; @@ -54,33 +59,35 @@ centaur_get_mcr(unsigned int reg, unsigned long *base, { *base = centaur_mcr[reg].high >> PAGE_SHIFT; *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; - *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ + *type = MTRR_TYPE_WRCOMB; /* write-combining */ + if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) *type = MTRR_TYPE_UNCACHABLE; if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) *type = MTRR_TYPE_WRBACK; if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) *type = MTRR_TYPE_WRBACK; - } -static void centaur_set_mcr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) +static void +centaur_set_mcr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) { unsigned long low, high; if (size == 0) { - /* Disable */ + /* Disable */ high = low = 0; } else { high = base << PAGE_SHIFT; - if (centaur_mcr_type == 0) - low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */ - else { + if (centaur_mcr_type == 0) { + /* Only support write-combining... */ + low = -size << PAGE_SHIFT | 0x1f; + } else { if (type == MTRR_TYPE_UNCACHABLE) - low = -size << PAGE_SHIFT | 0x02; /* NC */ + low = -size << PAGE_SHIFT | 0x02; /* NC */ else - low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */ + low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */ } } centaur_mcr[reg].high = high; @@ -88,118 +95,16 @@ static void centaur_set_mcr(unsigned int reg, unsigned long base, wrmsr(MSR_IDT_MCR0 + reg, low, high); } -#if 0 -/* - * Initialise the later (saner) Winchip MCR variant. In this version - * the BIOS can pass us the registers it has used (but not their values) - * and the control register is read/write - */ - -static void __init -centaur_mcr1_init(void) -{ - unsigned i; - u32 lo, hi; - - /* Unfortunately, MCR's are read-only, so there is no way to - * find out what the bios might have done. - */ - - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */ - lo &= ~0x1C0; /* clear key */ - lo |= 0x040; /* set key to 1 */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */ - } - - centaur_mcr_type = 1; - - /* - * Clear any unconfigured MCR's. - */ - - for (i = 0; i < 8; ++i) { - if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) { - if (!(lo & (1 << (9 + i)))) - wrmsr(MSR_IDT_MCR0 + i, 0, 0); - else - /* - * If the BIOS set up an MCR we cannot see it - * but we don't wish to obliterate it - */ - centaur_mcr_reserved |= (1 << i); - } - } - /* - * Throw the main write-combining switch... - * However if OOSTORE is enabled then people have already done far - * cleverer things and we should behave. - */ - - lo |= 15; /* Write combine enables */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -} - -/* - * Initialise the original winchip with read only MCR registers - * no used bitmask for the BIOS to pass on and write only control - */ - -static void __init -centaur_mcr0_init(void) -{ - unsigned i; - - /* Unfortunately, MCR's are read-only, so there is no way to - * find out what the bios might have done. - */ - - /* Clear any unconfigured MCR's. - * This way we are sure that the centaur_mcr array contains the actual - * values. The disadvantage is that any BIOS tweaks are thus undone. - * - */ - for (i = 0; i < 8; ++i) { - if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) - wrmsr(MSR_IDT_MCR0 + i, 0, 0); - } - - wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */ -} - -/* - * Initialise Winchip series MCR registers - */ - -static void __init -centaur_mcr_init(void) -{ - struct set_mtrr_context ctxt; - - set_mtrr_prepare_save(&ctxt); - set_mtrr_cache_disable(&ctxt); - - if (boot_cpu_data.x86_model == 4) - centaur_mcr0_init(); - else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9) - centaur_mcr1_init(); - - set_mtrr_done(&ctxt); -} -#endif - -static int centaur_validate_add_page(unsigned long base, - unsigned long size, unsigned int type) +static int +centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type) { /* - * FIXME: Winchip2 supports uncached + * FIXME: Winchip2 supports uncached */ - if (type != MTRR_TYPE_WRCOMB && + if (type != MTRR_TYPE_WRCOMB && (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { - printk(KERN_WARNING - "mtrr: only write-combining%s supported\n", - centaur_mcr_type ? " and uncacheable are" - : " is"); + pr_warning("mtrr: only write-combining%s supported\n", + centaur_mcr_type ? " and uncacheable are" : " is"); return -EINVAL; } return 0; @@ -207,7 +112,6 @@ static int centaur_validate_add_page(unsigned long base, static struct mtrr_ops centaur_mtrr_ops = { .vendor = X86_VENDOR_CENTAUR, -// .init = centaur_mcr_init, .set = centaur_set_mcr, .get = centaur_get_mcr, .get_free_region = centaur_get_free_region, @@ -220,5 +124,3 @@ int __init centaur_init_mtrr(void) set_mtrr_ops(¢aur_mtrr_ops); return 0; } - -//arch_initcall(centaur_init_mtrr); diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 1d584a1..315738c 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -1,51 +1,75 @@ -/* MTRR (Memory Type Range Register) cleanup - - Copyright (C) 2009 Yinghai Lu - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with this library; if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - +/* + * MTRR (Memory Type Range Register) cleanup + * + * Copyright (C) 2009 Yinghai Lu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ #include <linux/module.h> #include <linux/init.h> #include <linux/pci.h> #include <linux/smp.h> #include <linux/cpu.h> -#include <linux/mutex.h> #include <linux/sort.h> +#include <linux/mutex.h> +#include <linux/uaccess.h> +#include <linux/kvm_para.h> +#include <asm/processor.h> #include <asm/e820.h> #include <asm/mtrr.h> -#include <asm/uaccess.h> -#include <asm/processor.h> #include <asm/msr.h> -#include <asm/kvm_para.h> -#include "mtrr.h" -/* should be related to MTRR_VAR_RANGES nums */ -#define RANGE_NUM 256 +#include "mtrr.h" struct res_range { - unsigned long start; - unsigned long end; + unsigned long start; + unsigned long end; +}; + +struct var_mtrr_range_state { + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; +}; + +struct var_mtrr_state { + unsigned long range_startk; + unsigned long range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; }; +/* Should be related to MTRR_VAR_RANGES nums */ +#define RANGE_NUM 256 + +static struct res_range __initdata range[RANGE_NUM]; +static int __initdata nr_range; + +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; + +static int __initdata debug_print; +#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) + + static int __init -add_range(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) +add_range(struct res_range *range, int nr_range, + unsigned long start, unsigned long end) { - /* out of slots */ + /* Out of slots: */ if (nr_range >= RANGE_NUM) return nr_range; @@ -58,12 +82,12 @@ add_range(struct res_range *range, int nr_range, unsigned long start, } static int __init -add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) +add_range_with_merge(struct res_range *range, int nr_range, + unsigned long start, unsigned long end) { int i; - /* try to merge it with old one */ + /* Try to merge it with old one: */ for (i = 0; i < nr_range; i++) { unsigned long final_start, final_end; unsigned long common_start, common_end; @@ -84,7 +108,7 @@ add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, return nr_range; } - /* need to add that */ + /* Need to add it: */ return add_range(range, nr_range, start, end); } @@ -117,7 +141,7 @@ subtract_range(struct res_range *range, unsigned long start, unsigned long end) } if (start > range[j].start && end < range[j].end) { - /* find the new spare */ + /* Find the new spare: */ for (i = 0; i < RANGE_NUM; i++) { if (range[i].end == 0) break; @@ -146,14 +170,8 @@ static int __init cmp_range(const void *x1, const void *x2) return start1 - start2; } -struct var_mtrr_range_state { - unsigned long base_pfn; - unsigned long size_pfn; - mtrr_type type; -}; - -static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; -static int __initdata debug_print; +#define BIOS_BUG_MSG KERN_WARNING \ + "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, @@ -180,7 +198,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, range[i].start, range[i].end + 1); } - /* take out UC ranges */ + /* Take out UC ranges: */ for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; if (type != MTRR_TYPE_UNCACHABLE && @@ -193,9 +211,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && (mtrr_state.enabled & 1)) { /* Var MTRR contains UC entry below 1M? Skip it: */ - printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d " - "contains strange UC entry under 1M, check " - "with your system vendor!\n", i); + printk(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) continue; size -= (1<<(20-PAGE_SHIFT)) - base; @@ -237,17 +253,13 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, return nr_range; } -static struct res_range __initdata range[RANGE_NUM]; -static int __initdata nr_range; - #ifdef CONFIG_MTRR_SANITIZER static unsigned long __init sum_ranges(struct res_range *range, int nr_range) { - unsigned long sum; + unsigned long sum = 0; int i; - sum = 0; for (i = 0; i < nr_range; i++) sum += range[i].end + 1 - range[i].start; @@ -278,17 +290,9 @@ static int __init mtrr_cleanup_debug_setup(char *str) } early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); -struct var_mtrr_state { - unsigned long range_startk; - unsigned long range_sizek; - unsigned long chunk_sizek; - unsigned long gran_sizek; - unsigned int reg; -}; - static void __init set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type, unsigned int address_bits) + unsigned char type, unsigned int address_bits) { u32 base_lo, base_hi, mask_lo, mask_hi; u64 base, mask; @@ -301,7 +305,7 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, mask = (1ULL << address_bits) - 1; mask &= ~((((u64)sizek) << 10) - 1); - base = ((u64)basek) << 10; + base = ((u64)basek) << 10; base |= type; mask |= 0x800; @@ -317,15 +321,14 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, static void __init save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type) + unsigned char type) { range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); range_state[reg].type = type; } -static void __init -set_var_mtrr_all(unsigned int address_bits) +static void __init set_var_mtrr_all(unsigned int address_bits) { unsigned long basek, sizek; unsigned char type; @@ -342,11 +345,11 @@ set_var_mtrr_all(unsigned int address_bits) static unsigned long to_size_factor(unsigned long sizek, char *factorp) { - char factor; unsigned long base = sizek; + char factor; if (base & ((1<<10) - 1)) { - /* not MB alignment */ + /* Not MB-aligned: */ factor = 'K'; } else if (base & ((1<<20) - 1)) { factor = 'M'; @@ -372,11 +375,12 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, unsigned long max_align, align; unsigned long sizek; - /* Compute the maximum size I can make a range */ + /* Compute the maximum size with which we can make a range: */ if (range_startk) max_align = ffs(range_startk) - 1; else max_align = 32; + align = fls(range_sizek) - 1; if (align > max_align) align = max_align; @@ -386,11 +390,10 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; - start_base = to_size_factor(range_startk, - &start_factor), - size_base = to_size_factor(sizek, &size_factor), + start_base = to_size_factor(range_startk, &start_factor); + size_base = to_size_factor(sizek, &size_factor); - printk(KERN_DEBUG "Setting variable MTRR %d, " + Dprintk("Setting variable MTRR %d, " "base: %ld%cB, range: %ld%cB, type %s\n", reg, start_base, start_factor, size_base, size_factor, @@ -425,10 +428,11 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, chunk_sizek = state->chunk_sizek; gran_sizek = state->gran_sizek; - /* align with gran size, prevent small block used up MTRRs */ + /* Align with gran size, prevent small block used up MTRRs: */ range_basek = ALIGN(state->range_startk, gran_sizek); if ((range_basek > basek) && basek) return second_sizek; + state->range_sizek -= (range_basek - state->range_startk); range_sizek = ALIGN(state->range_sizek, gran_sizek); @@ -439,22 +443,21 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, } state->range_sizek = range_sizek; - /* try to append some small hole */ + /* Try to append some small hole: */ range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); - /* no increase */ + /* No increase: */ if (range0_sizek == state->range_sizek) { - if (debug_print) - printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + state->range_sizek)<<10); + Dprintk("rangeX: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + state->range_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, state->range_sizek, MTRR_TYPE_WRBACK); return 0; } - /* only cut back, when it is not the last */ + /* Only cut back when it is not the last: */ if (sizek) { while (range0_basek + range0_sizek > (basek + sizek)) { if (range0_sizek >= chunk_sizek) @@ -470,16 +473,16 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, second_try: range_basek = range0_basek + range0_sizek; - /* one hole in the middle */ + /* One hole in the middle: */ if (range_basek > basek && range_basek <= (basek + sizek)) second_sizek = range_basek - basek; if (range0_sizek > state->range_sizek) { - /* one hole in middle or at end */ + /* One hole in middle or at the end: */ hole_sizek = range0_sizek - state->range_sizek - second_sizek; - /* hole size should be less than half of range0 size */ + /* Hole size should be less than half of range0 size: */ if (hole_sizek >= (range0_sizek >> 1) && range0_sizek >= chunk_sizek) { range0_sizek -= chunk_sizek; @@ -491,32 +494,30 @@ second_try: } if (range0_sizek) { - if (debug_print) - printk(KERN_DEBUG "range0: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + range0_sizek)<<10); + Dprintk("range0: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + range0_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, range0_sizek, MTRR_TYPE_WRBACK); } if (range0_sizek < state->range_sizek) { - /* need to handle left over */ + /* Need to handle left over range: */ range_sizek = state->range_sizek - range0_sizek; - if (debug_print) - printk(KERN_DEBUG "range: %016lx - %016lx\n", - range_basek<<10, - (range_basek + range_sizek)<<10); + Dprintk("range: %016lx - %016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, MTRR_TYPE_WRBACK); } if (hole_sizek) { hole_basek = range_basek - hole_sizek - second_sizek; - if (debug_print) - printk(KERN_DEBUG "hole: %016lx - %016lx\n", - hole_basek<<10, - (hole_basek + hole_sizek)<<10); + Dprintk("hole: %016lx - %016lx\n", + hole_basek<<10, + (hole_basek + hole_sizek)<<10); state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, MTRR_TYPE_UNCACHABLE); } @@ -537,23 +538,23 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, basek = base_pfn << (PAGE_SHIFT - 10); sizek = size_pfn << (PAGE_SHIFT - 10); - /* See if I can merge with the last range */ + /* See if I can merge with the last range: */ if ((basek <= 1024) || (state->range_startk + state->range_sizek == basek)) { unsigned long endk = basek + sizek; state->range_sizek = endk - state->range_startk; return; } - /* Write the range mtrrs */ + /* Write the range mtrrs: */ if (state->range_sizek != 0) second_sizek = range_to_mtrr_with_hole(state, basek, sizek); - /* Allocate an msr */ + /* Allocate an msr: */ state->range_startk = basek + second_sizek; state->range_sizek = sizek - second_sizek; } -/* mininum size of mtrr block that can take hole */ +/* Mininum size of mtrr block that can take hole: */ static u64 mtrr_chunk_size __initdata = (256ULL<<20); static int __init parse_mtrr_chunk_size_opt(char *p) @@ -565,7 +566,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p) } early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); -/* granity of mtrr of block */ +/* Granularity of mtrr of block: */ static u64 mtrr_gran_size __initdata; static int __init parse_mtrr_gran_size_opt(char *p) @@ -577,7 +578,7 @@ static int __init parse_mtrr_gran_size_opt(char *p) } early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); -static int nr_mtrr_spare_reg __initdata = +static unsigned long nr_mtrr_spare_reg __initdata = CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; static int __init parse_mtrr_spare_reg(char *arg) @@ -586,7 +587,6 @@ static int __init parse_mtrr_spare_reg(char *arg) nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); return 0; } - early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); static int __init @@ -594,8 +594,8 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, u64 chunk_size, u64 gran_size) { struct var_mtrr_state var_state; - int i; int num_reg; + int i; var_state.range_startk = 0; var_state.range_sizek = 0; @@ -605,17 +605,18 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, memset(range_state, 0, sizeof(range_state)); - /* Write the range etc */ - for (i = 0; i < nr_range; i++) + /* Write the range: */ + for (i = 0; i < nr_range; i++) { set_var_mtrr_range(&var_state, range[i].start, range[i].end - range[i].start + 1); + } - /* Write the last range */ + /* Write the last range: */ if (var_state.range_sizek != 0) range_to_mtrr_with_hole(&var_state, 0, 0); num_reg = var_state.reg; - /* Clear out the extra MTRR's */ + /* Clear out the extra MTRR's: */ while (var_state.reg < num_var_ranges) { save_var_mtrr(var_state.reg, 0, 0, 0); var_state.reg++; @@ -625,11 +626,11 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, } struct mtrr_cleanup_result { - unsigned long gran_sizek; - unsigned long chunk_sizek; - unsigned long lose_cover_sizek; - unsigned int num_reg; - int bad; + unsigned long gran_sizek; + unsigned long chunk_sizek; + unsigned long lose_cover_sizek; + unsigned int num_reg; + int bad; }; /* @@ -645,10 +646,10 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM]; static void __init print_out_mtrr_range_state(void) { - int i; char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; mtrr_type type; + int i; for (i = 0; i < num_var_ranges; i++) { @@ -676,10 +677,10 @@ static int __init mtrr_need_cleanup(void) int i; mtrr_type type; unsigned long size; - /* extra one for all 0 */ + /* Extra one for all 0: */ int num[MTRR_NUM_TYPES + 1]; - /* check entries number */ + /* Check entries number: */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; @@ -693,88 +694,86 @@ static int __init mtrr_need_cleanup(void) num[type]++; } - /* check if we got UC entries */ + /* Check if we got UC entries: */ if (!num[MTRR_TYPE_UNCACHABLE]) return 0; - /* check if we only had WB and UC */ + /* Check if we only had WB and UC */ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != - num_var_ranges - num[MTRR_NUM_TYPES]) + num_var_ranges - num[MTRR_NUM_TYPES]) return 0; return 1; } static unsigned long __initdata range_sums; -static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, - unsigned long extra_remove_base, - unsigned long extra_remove_size, - int i) + +static void __init +mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long x_remove_base, + unsigned long x_remove_size, int i) { - int num_reg; static struct res_range range_new[RANGE_NUM]; - static int nr_range_new; unsigned long range_sums_new; + static int nr_range_new; + int num_reg; - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, - chunk_size, gran_size); + /* Convert ranges to var ranges state: */ + num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); - /* we got new setting in range_state, check it */ + /* We got new setting in range_state, check it: */ memset(range_new, 0, sizeof(range_new)); nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, extra_remove_size); + x_remove_base, x_remove_size); range_sums_new = sum_ranges(range_new, nr_range_new); result[i].chunk_sizek = chunk_size >> 10; result[i].gran_sizek = gran_size >> 10; result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; + result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT; result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; + } else { + result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; + } - /* double check it */ + /* Double check it: */ if (!result[i].bad && !result[i].lose_cover_sizek) { - if (nr_range_new != nr_range || - memcmp(range, range_new, sizeof(range))) - result[i].bad = 1; + if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; } - if (!result[i].bad && (range_sums - range_sums_new < - min_loss_pfn[num_reg])) { - min_loss_pfn[num_reg] = - range_sums - range_sums_new; - } + if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg])) + min_loss_pfn[num_reg] = range_sums - range_sums_new; } static void __init mtrr_print_out_one_result(int i) { - char gran_factor, chunk_factor, lose_factor; unsigned long gran_base, chunk_base, lose_base; + char gran_factor, chunk_factor, lose_factor; gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad ? "*BAD*" : " ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad ? "-" : "", - lose_base, lose_factor); + + pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? "*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? "-" : "", + lose_base, lose_factor); } static int __init mtrr_search_optimal_index(void) { - int i; int num_reg_good; int index_good; + int i; if (nr_mtrr_spare_reg >= num_var_ranges) nr_mtrr_spare_reg = num_var_ranges - 1; + num_reg_good = -1; for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { if (!min_loss_pfn[i]) @@ -796,24 +795,24 @@ static int __init mtrr_search_optimal_index(void) return index_good; } - int __init mtrr_cleanup(unsigned address_bits) { - unsigned long extra_remove_base, extra_remove_size; + unsigned long x_remove_base, x_remove_size; unsigned long base, size, def, dummy; - mtrr_type type; u64 chunk_size, gran_size; + mtrr_type type; int index_good; int i; if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) return 0; + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; - /* get it and store it aside */ + /* Get it and store it aside: */ memset(range_state, 0, sizeof(range_state)); for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); @@ -822,29 +821,28 @@ int __init mtrr_cleanup(unsigned address_bits) range_state[i].type = type; } - /* check if we need handle it and can handle it */ + /* Check if we need handle it and can handle it: */ if (!mtrr_need_cleanup()) return 0; - /* print original var MTRRs at first, for debugging: */ + /* Print original var MTRRs at first, for debugging: */ printk(KERN_DEBUG "original variable MTRRs\n"); print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); - extra_remove_size = 0; - extra_remove_base = 1 << (32 - PAGE_SHIFT); + x_remove_size = 0; + x_remove_base = 1 << (32 - PAGE_SHIFT); if (mtrr_tom2) - extra_remove_size = - (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; - nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, - extra_remove_size); + x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base; + + nr_range = x86_get_mtrr_mem_range(range, 0, x_remove_base, x_remove_size); /* - * [0, 1M) should always be coverred by var mtrr with WB - * and fixed mtrrs should take effective before var mtrr for it + * [0, 1M) should always be covered by var mtrr with WB + * and fixed mtrrs should take effect before var mtrr for it: */ nr_range = add_range_with_merge(range, nr_range, 0, (1ULL<<(20 - PAGE_SHIFT)) - 1); - /* sort the ranges */ + /* Sort the ranges: */ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); range_sums = sum_ranges(range, nr_range); @@ -854,7 +852,7 @@ int __init mtrr_cleanup(unsigned address_bits) if (mtrr_chunk_size && mtrr_gran_size) { i = 0; mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, - extra_remove_base, extra_remove_size, i); + x_remove_base, x_remove_size, i); mtrr_print_out_one_result(i); @@ -880,7 +878,7 @@ int __init mtrr_cleanup(unsigned address_bits) continue; mtrr_calc_range_state(chunk_size, gran_size, - extra_remove_base, extra_remove_size, i); + x_remove_base, x_remove_size, i); if (debug_print) { mtrr_print_out_one_result(i); printk(KERN_INFO "\n"); @@ -890,7 +888,7 @@ int __init mtrr_cleanup(unsigned address_bits) } } - /* try to find the optimal index */ + /* Try to find the optimal index: */ index_good = mtrr_search_optimal_index(); if (index_good != -1) { @@ -898,7 +896,7 @@ int __init mtrr_cleanup(unsigned address_bits) i = index_good; mtrr_print_out_one_result(i); - /* convert ranges to var ranges state */ + /* Convert ranges to var ranges state: */ chunk_size = result[i].chunk_sizek; chunk_size <<= 10; gran_size = result[i].gran_sizek; @@ -941,8 +939,8 @@ early_param("disable_mtrr_trim", disable_mtrr_trim_setup); * Note this won't check if the MTRRs < 4GB where the magic bit doesn't * apply to are wrong, but so far we don't know of any such case in the wild. */ -#define Tom2Enabled (1U << 21) -#define Tom2ForceMemTypeWB (1U << 22) +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) int __init amd_special_default_mtrr(void) { @@ -952,7 +950,7 @@ int __init amd_special_default_mtrr(void) return 0; if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) return 0; - /* In case some hypervisor doesn't pass SYSCFG through */ + /* In case some hypervisor doesn't pass SYSCFG through: */ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) return 0; /* @@ -965,19 +963,21 @@ int __init amd_special_default_mtrr(void) return 0; } -static u64 __init real_trim_memory(unsigned long start_pfn, - unsigned long limit_pfn) +static u64 __init +real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) { u64 trim_start, trim_size; + trim_start = start_pfn; trim_start <<= PAGE_SHIFT; + trim_size = limit_pfn; trim_size <<= PAGE_SHIFT; trim_size -= trim_start; - return e820_update_range(trim_start, trim_size, E820_RAM, - E820_RESERVED); + return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED); } + /** * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs * @end_pfn: ending page frame number @@ -985,7 +985,7 @@ static u64 __init real_trim_memory(unsigned long start_pfn, * Some buggy BIOSes don't setup the MTRRs properly for systems with certain * memory configurations. This routine checks that the highest MTRR matches * the end of memory, to make sure the MTRRs having a write back type cover - * all of the memory the kernel is intending to use. If not, it'll trim any + * all of the memory the kernel is intending to use. If not, it'll trim any * memory off the end by adjusting end_pfn, removing it from the kernel's * allocation pools, warning the user with an obnoxious message. */ @@ -994,21 +994,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; u64 total_trim_size; - /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; + /* * Make sure we only trim uncachable memory on machines that * support the Intel MTRR architecture: */ if (!is_cpu(INTEL) || disable_mtrr_trim) return 0; + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; - /* get it and store it aside */ + /* Get it and store it aside: */ memset(range_state, 0, sizeof(range_state)); for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); @@ -1017,7 +1018,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) range_state[i].type = type; } - /* Find highest cached pfn */ + /* Find highest cached pfn: */ for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; if (type != MTRR_TYPE_WRBACK) @@ -1028,13 +1029,13 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) highest_pfn = base + size; } - /* kvm/qemu doesn't have mtrr set right, don't trim them all */ + /* kvm/qemu doesn't have mtrr set right, don't trim them all: */ if (!highest_pfn) { printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); return 0; } - /* check entries number */ + /* Check entries number: */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; @@ -1046,11 +1047,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) num[type]++; } - /* no entry for WB? */ + /* No entry for WB? */ if (!num[MTRR_TYPE_WRBACK]) return 0; - /* check if we only had WB and UC */ + /* Check if we only had WB and UC: */ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != num_var_ranges - num[MTRR_NUM_TYPES]) return 0; @@ -1066,31 +1067,31 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); + /* Check the head: */ total_trim_size = 0; - /* check the head */ if (range[0].start) total_trim_size += real_trim_memory(0, range[0].start); - /* check the holes */ + + /* Check the holes: */ for (i = 0; i < nr_range - 1; i++) { if (range[i].end + 1 < range[i+1].start) total_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start); } - /* check the top */ + + /* Check the top: */ i = nr_range - 1; if (range[i].end + 1 < end_pfn) total_trim_size += real_trim_memory(range[i].end + 1, end_pfn); if (total_trim_size) { - printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %lluMB of RAM.\n", - total_trim_size >> 20); + pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20); if (!changed_by_mtrr_cleanup) WARN_ON(1); - printk(KERN_INFO "update e820 for mtrr\n"); + pr_info("update e820 for mtrr\n"); update_e820(); return 1; @@ -1098,4 +1099,3 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) return 0; } - diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index ff14c32..228d982 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -1,38 +1,40 @@ #include <linux/init.h> +#include <linux/io.h> #include <linux/mm.h> -#include <asm/mtrr.h> -#include <asm/msr.h> -#include <asm/io.h> + #include <asm/processor-cyrix.h> #include <asm/processor-flags.h> +#include <asm/mtrr.h> +#include <asm/msr.h> + #include "mtrr.h" static void cyrix_get_arr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type * type) { - unsigned long flags; unsigned char arr, ccr3, rcr, shift; + unsigned long flags; arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ - /* Save flags and disable interrupts */ local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - ((unsigned char *) base)[3] = getCx86(arr); - ((unsigned char *) base)[2] = getCx86(arr + 1); - ((unsigned char *) base)[1] = getCx86(arr + 2); + ((unsigned char *)base)[3] = getCx86(arr); + ((unsigned char *)base)[2] = getCx86(arr + 1); + ((unsigned char *)base)[1] = getCx86(arr + 2); rcr = getCx86(CX86_RCR_BASE + reg); - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ - /* Enable interrupts if it was enabled previously */ local_irq_restore(flags); + shift = ((unsigned char *) base)[1] & 0x0f; *base >>= PAGE_SHIFT; - /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 + /* + * Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 * Note: shift==0xf means 4G, this is unsupported. */ if (shift) @@ -76,17 +78,20 @@ cyrix_get_arr(unsigned int reg, unsigned long *base, } } +/* + * cyrix_get_free_region - get a free ARR. + * + * @base: the starting (base) address of the region. + * @size: the size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. +*/ static int cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) -/* [SUMMARY] Get a free ARR. - <base> The starting (base) address of the region. - <size> The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ { - int i; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i; switch (replace_reg) { case 7: @@ -107,14 +112,17 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) cyrix_get_arr(7, &lbase, &lsize, <ype); if (lsize == 0) return 7; - /* Else try ARR0-ARR6 first */ + /* Else try ARR0-ARR6 first */ } else { for (i = 0; i < 7; i++) { cyrix_get_arr(i, &lbase, &lsize, <ype); if (lsize == 0) return i; } - /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */ + /* + * ARR0-ARR6 isn't free + * try ARR7 but its size must be at least 256K + */ cyrix_get_arr(i, &lbase, &lsize, <ype); if ((lsize == 0) && (size >= 0x40)) return i; @@ -122,21 +130,22 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) return -ENOSPC; } -static u32 cr4 = 0; -static u32 ccr3; +static u32 cr4, ccr3; static void prepare_set(void) { u32 cr0; /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if ( cpu_has_pge ) { + if (cpu_has_pge) { cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); } - /* Disable and flush caches. Note that wbinvd flushes the TLBs as - a side-effect */ + /* + * Disable and flush caches. + * Note that wbinvd flushes the TLBs as a side-effect + */ cr0 = read_cr0() | X86_CR0_CD; wbinvd(); write_cr0(cr0); @@ -147,22 +156,21 @@ static void prepare_set(void) /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); - } static void post_set(void) { - /* Flush caches and TLBs */ + /* Flush caches and TLBs */ wbinvd(); /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, ccr3); - - /* Enable caches */ + + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ - if ( cpu_has_pge ) + /* Restore value of CR4 */ + if (cpu_has_pge) write_cr4(cr4); } @@ -178,7 +186,8 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, size >>= 6; size &= 0x7fff; /* make sure arr_size <= 14 */ - for (arr_size = 0; size; arr_size++, size >>= 1) ; + for (arr_size = 0; size; arr_size++, size >>= 1) + ; if (reg < 7) { switch (type) { @@ -215,18 +224,18 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, prepare_set(); base <<= PAGE_SHIFT; - setCx86(arr, ((unsigned char *) &base)[3]); - setCx86(arr + 1, ((unsigned char *) &base)[2]); - setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size); + setCx86(arr + 0, ((unsigned char *)&base)[3]); + setCx86(arr + 1, ((unsigned char *)&base)[2]); + setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size); setCx86(CX86_RCR_BASE + reg, arr_type); post_set(); } typedef struct { - unsigned long base; - unsigned long size; - mtrr_type type; + unsigned long base; + unsigned long size; + mtrr_type type; } arr_state_t; static arr_state_t arr_state[8] = { @@ -247,16 +256,17 @@ static void cyrix_set_all(void) setCx86(CX86_CCR0 + i, ccr_state[i]); for (; i < 7; i++) setCx86(CX86_CCR4 + i, ccr_state[i]); - for (i = 0; i < 8; i++) - cyrix_set_arr(i, arr_state[i].base, + + for (i = 0; i < 8; i++) { + cyrix_set_arr(i, arr_state[i].base, arr_state[i].size, arr_state[i].type); + } post_set(); } static struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, -// .init = cyrix_arr_init, .set_all = cyrix_set_all, .set = cyrix_set_arr, .get = cyrix_get_arr, @@ -270,5 +280,3 @@ int __init cyrix_init_mtrr(void) set_mtrr_ops(&cyrix_mtrr_ops); return 0; } - -//arch_initcall(cyrix_init_mtrr); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0543f69..55da0c5 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -1,28 +1,34 @@ -/* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong - because MTRRs can span upto 40 bits (36bits on most modern x86) */ +/* + * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong + * because MTRRs can span upto 40 bits (36bits on most modern x86) + */ +#define DEBUG + +#include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/io.h> #include <linux/mm.h> -#include <linux/module.h> -#include <asm/io.h> -#include <asm/mtrr.h> -#include <asm/msr.h> -#include <asm/system.h> -#include <asm/cpufeature.h> + #include <asm/processor-flags.h> +#include <asm/cpufeature.h> #include <asm/tlbflush.h> +#include <asm/system.h> +#include <asm/mtrr.h> +#include <asm/msr.h> #include <asm/pat.h> + #include "mtrr.h" struct fixed_range_block { - int base_msr; /* start address of an MTRR block */ - int ranges; /* number of MTRRs in this block */ + int base_msr; /* start address of an MTRR block */ + int ranges; /* number of MTRRs in this block */ }; static struct fixed_range_block fixed_range_blocks[] = { - { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ - { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ - { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; @@ -30,10 +36,10 @@ static unsigned long smp_changes_mask; static int mtrr_state_set; u64 mtrr_tom2; -struct mtrr_state_type mtrr_state = {}; +struct mtrr_state_type mtrr_state; EXPORT_SYMBOL_GPL(mtrr_state); -/** +/* * BIOS is expected to clear MtrrFixDramModEn bit, see for example * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD * Opteron Processors" (26094 Rev. 3.30 February 2006), section @@ -104,9 +110,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - if (!(mtrr_state.enabled & 2)) { + if (!(mtrr_state.enabled & 2)) return mtrr_state.def_type; - } prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { @@ -125,9 +130,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) if (start_state != end_state) return 0xFE; - if ((start & mask) != (base & mask)) { + if ((start & mask) != (base & mask)) continue; - } curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; if (prev_match == 0xFF) { @@ -148,9 +152,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) curr_match = MTRR_TYPE_WRTHROUGH; } - if (prev_match != curr_match) { + if (prev_match != curr_match) return MTRR_TYPE_UNCACHABLE; - } } if (mtrr_tom2) { @@ -164,7 +167,7 @@ u8 mtrr_type_lookup(u64 start, u64 end) return mtrr_state.def_type; } -/* Get the MSR pair relating to a var range */ +/* Get the MSR pair relating to a var range */ static void get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) { @@ -172,7 +175,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); } -/* fill the MSR pair relating to a var range */ +/* Fill the MSR pair relating to a var range */ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) { @@ -186,10 +189,9 @@ void fill_mtrr_var_range(unsigned int index, vr[index].mask_hi = mask_hi; } -static void -get_fixed_ranges(mtrr_type * frs) +static void get_fixed_ranges(mtrr_type *frs) { - unsigned int *p = (unsigned int *) frs; + unsigned int *p = (unsigned int *)frs; int i; k8_check_syscfg_dram_mod_en(); @@ -217,22 +219,22 @@ static void __init print_fixed_last(void) if (!last_fixed_end) return; - printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start, - last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); + pr_debug(" %05X-%05X %s\n", last_fixed_start, + last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); last_fixed_end = 0; } static void __init update_fixed_last(unsigned base, unsigned end, - mtrr_type type) + mtrr_type type) { last_fixed_start = base; last_fixed_end = end; last_fixed_type = type; } -static void __init print_fixed(unsigned base, unsigned step, - const mtrr_type *types) +static void __init +print_fixed(unsigned base, unsigned step, const mtrr_type *types) { unsigned i; @@ -259,54 +261,55 @@ static void __init print_mtrr_state(void) unsigned int i; int high_width; - printk(KERN_DEBUG "MTRR default type: %s\n", - mtrr_attrib_to_str(mtrr_state.def_type)); + pr_debug("MTRR default type: %s\n", + mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { - printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); + pr_debug("MTRR fixed ranges %sabled:\n", + mtrr_state.enabled & 1 ? "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) - print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); + print_fixed(0x80000 + i * 0x20000, 0x04000, + mtrr_state.fixed_ranges + (i + 1) * 8); for (i = 0; i < 8; ++i) - print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); + print_fixed(0xC0000 + i * 0x08000, 0x01000, + mtrr_state.fixed_ranges + (i + 3) * 8); /* tail */ print_fixed_last(); } - printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? "en" : "dis"); + pr_debug("MTRR variable ranges %sabled:\n", + mtrr_state.enabled & 2 ? "en" : "dis"); if (size_or_mask & 0xffffffffUL) high_width = ffs(size_or_mask & 0xffffffffUL) - 1; else high_width = ffs(size_or_mask>>32) + 32 - 1; high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; + for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) - printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", - i, - high_width, - mtrr_state.var_ranges[i].base_hi, - mtrr_state.var_ranges[i].base_lo >> 12, - high_width, - mtrr_state.var_ranges[i].mask_hi, - mtrr_state.var_ranges[i].mask_lo >> 12, - mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); + pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); else - printk(KERN_DEBUG " %u disabled\n", i); - } - if (mtrr_tom2) { - printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n", - mtrr_tom2, mtrr_tom2>>20); + pr_debug(" %u disabled\n", i); } + if (mtrr_tom2) + pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } -/* Grab all of the MTRR state for this CPU into *state */ +/* Grab all of the MTRR state for this CPU into *state */ void __init get_mtrr_state(void) { - unsigned int i; struct mtrr_var_range *vrs; - unsigned lo, dummy; unsigned long flags; + unsigned lo, dummy; + unsigned int i; vrs = mtrr_state.var_ranges; @@ -324,6 +327,7 @@ void __init get_mtrr_state(void) if (amd_special_default_mtrr()) { unsigned low, high; + /* TOP_MEM2 */ rdmsr(MSR_K8_TOP_MEM2, low, high); mtrr_tom2 = high; @@ -344,10 +348,9 @@ void __init get_mtrr_state(void) post_set(); local_irq_restore(flags); - } -/* Some BIOS's are fucked and don't set all MTRRs the same! */ +/* Some BIOS's are messed up and don't set all MTRRs the same! */ void __init mtrr_state_warn(void) { unsigned long mask = smp_changes_mask; @@ -355,28 +358,33 @@ void __init mtrr_state_warn(void) if (!mask) return; if (mask & MTRR_CHANGE_MASK_FIXED) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_VARIABLE) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n"); + pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_DEFTYPE) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); printk(KERN_INFO "mtrr: corrected configuration.\n"); } -/* Doesn't attempt to pass an error out to MTRR users - because it's quite complicated in some cases and probably not - worth it because the best error handling is to ignore it. */ +/* + * Doesn't attempt to pass an error out to MTRR users + * because it's quite complicated in some cases and probably not + * worth it because the best error handling is to ignore it. + */ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) { - if (wrmsr_safe(msr, a, b) < 0) + if (wrmsr_safe(msr, a, b) < 0) { printk(KERN_ERR "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", smp_processor_id(), msr, a, b); + } } /** - * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have + * set_fixed_range - checks & updates a fixed-range MTRR if it + * differs from the value it should have * @msr: MSR address of the MTTR which should be checked and updated * @changed: pointer which indicates whether the MTRR needed to be changed * @msrwords: pointer to the MSR values which the MSR should have @@ -401,20 +409,23 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) * * Returns: The index of the region on success, else negative on error. */ -int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) +int +generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) { - int i, max; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; max = num_var_ranges; if (replace_reg >= 0 && replace_reg < max) return replace_reg; + for (i = 0; i < max; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); if (lsize == 0) return i; } + return -ENOSPC; } @@ -434,7 +445,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { - /* Invalid (i.e. free) range */ + /* Invalid (i.e. free) range */ *base = 0; *size = 0; *type = 0; @@ -471,27 +482,31 @@ out_put_cpu: } /** - * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set + * set_fixed_ranges - checks & updates the fixed-range MTRRs if they + * differ from the saved set * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() */ -static int set_fixed_ranges(mtrr_type * frs) +static int set_fixed_ranges(mtrr_type *frs) { - unsigned long long *saved = (unsigned long long *) frs; + unsigned long long *saved = (unsigned long long *)frs; bool changed = false; - int block=-1, range; + int block = -1, range; k8_check_syscfg_dram_mod_en(); - while (fixed_range_blocks[++block].ranges) - for (range=0; range < fixed_range_blocks[block].ranges; range++) - set_fixed_range(fixed_range_blocks[block].base_msr + range, - &changed, (unsigned int *) saved++); + while (fixed_range_blocks[++block].ranges) { + for (range = 0; range < fixed_range_blocks[block].ranges; range++) + set_fixed_range(fixed_range_blocks[block].base_msr + range, + &changed, (unsigned int *)saved++); + } return changed; } -/* Set the MSR pair relating to a var range. Returns TRUE if - changes are made */ +/* + * Set the MSR pair relating to a var range. + * Returns true if changes are made. + */ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) { unsigned int lo, hi; @@ -501,6 +516,7 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { + mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); changed = true; } @@ -526,21 +542,26 @@ static u32 deftype_lo, deftype_hi; */ static unsigned long set_mtrr_state(void) { - unsigned int i; unsigned long change_mask = 0; + unsigned int i; - for (i = 0; i < num_var_ranges; i++) + for (i = 0; i < num_var_ranges; i++) { if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) change_mask |= MTRR_CHANGE_MASK_VARIABLE; + } if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) change_mask |= MTRR_CHANGE_MASK_FIXED; - /* Set_mtrr_restore restores the old value of MTRRdefType, - so to set it we fiddle with the saved value */ + /* + * Set_mtrr_restore restores the old value of MTRRdefType, + * so to set it we fiddle with the saved value: + */ if ((deftype_lo & 0xff) != mtrr_state.def_type || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { - deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10); + + deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | + (mtrr_state.enabled << 10); change_mask |= MTRR_CHANGE_MASK_DEFTYPE; } @@ -548,33 +569,36 @@ static unsigned long set_mtrr_state(void) } -static unsigned long cr4 = 0; +static unsigned long cr4; static DEFINE_SPINLOCK(set_atomicity_lock); /* - * Since we are disabling the cache don't allow any interrupts - they - * would run extremely slow and would only increase the pain. The caller must - * ensure that local interrupts are disabled and are reenabled after post_set() - * has been called. + * Since we are disabling the cache don't allow any interrupts, + * they would run extremely slow and would only increase the pain. + * + * The caller must ensure that local interrupts are disabled and + * are reenabled after post_set() has been called. */ - static void prepare_set(void) __acquires(set_atomicity_lock) { unsigned long cr0; - /* Note that this is not ideal, since the cache is only flushed/disabled - for this CPU while the MTRRs are changed, but changing this requires - more invasive changes to the way the kernel boots */ + /* + * Note that this is not ideal + * since the cache is only flushed/disabled for this CPU while the + * MTRRs are changed, but changing this requires more invasive + * changes to the way the kernel boots + */ spin_lock(&set_atomicity_lock); - /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ + /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); wbinvd(); - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if ( cpu_has_pge ) { + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if (cpu_has_pge) { cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); } @@ -582,26 +606,26 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ __flush_tlb(); - /* Save MTRR state */ + /* Save MTRR state */ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); - /* Disable MTRRs, and set the default type to uncached */ + /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) { - /* Flush TLBs (no need to flush caches - they are disabled) */ + /* Flush TLBs (no need to flush caches - they are disabled) */ __flush_tlb(); /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); - - /* Enable caches */ + + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ - if ( cpu_has_pge ) + /* Restore value of CR4 */ + if (cpu_has_pge) write_cr4(cr4); spin_unlock(&set_atomicity_lock); } @@ -623,24 +647,27 @@ static void generic_set_all(void) post_set(); local_irq_restore(flags); - /* Use the atomic bitops to update the global mask */ + /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof mask * 8; ++count) { if (mask & 0x01) set_bit(count, &smp_changes_mask); mask >>= 1; } - + } +/** + * generic_set_mtrr - set variable MTRR register on the local CPU. + * + * @reg: The register to set. + * @base: The base address of the region. + * @size: The size of the region. If this is 0 the region is disabled. + * @type: The type of the region. + * + * Returns nothing. + */ static void generic_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) -/* [SUMMARY] Set variable MTRR register on the local CPU. - <reg> The register to set. - <base> The base address of the region. - <size> The size of the region. If this is 0 the region is disabled. - <type> The type of the region. - [RETURNS] Nothing. -*/ { unsigned long flags; struct mtrr_var_range *vr; @@ -651,8 +678,10 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, prepare_set(); if (size == 0) { - /* The invalid bit is kept in the mask, so we simply clear the - relevant mask register to disable a range. */ + /* + * The invalid bit is kept in the mask, so we simply + * clear the relevant mask register to disable a range. + */ mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); memset(vr, 0, sizeof(struct mtrr_var_range)); } else { @@ -669,46 +698,50 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, local_irq_restore(flags); } -int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +int generic_validate_add_page(unsigned long base, unsigned long size, + unsigned int type) { unsigned long lbase, last; - /* For Intel PPro stepping <= 7, must be 4 MiB aligned - and not touch 0x70000000->0x7003FFFF */ + /* + * For Intel PPro stepping <= 7 + * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF + */ if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { - printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); + pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { - printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); return -EINVAL; } } - /* Check upper bits of base and last are equal and lower bits are 0 - for base and 1 for last */ + /* + * Check upper bits of base and last are equal and lower bits are 0 + * for base and 1 for last + */ last = base + size - 1; for (lbase = base; !(lbase & 1) && (last & 1); - lbase = lbase >> 1, last = last >> 1) ; + lbase = lbase >> 1, last = last >> 1) + ; if (lbase != last) { - printk(KERN_WARNING "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", - base, size); + pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); return -EINVAL; } return 0; } - static int generic_have_wrcomb(void) { unsigned long config, dummy; rdmsr(MSR_MTRRcap, config, dummy); - return (config & (1 << 10)); + return config & (1 << 10); } int positive_have_wrcomb(void) @@ -716,14 +749,15 @@ int positive_have_wrcomb(void) return 1; } -/* generic structure... +/* + * Generic structure... */ struct mtrr_ops generic_mtrr_ops = { - .use_intel_if = 1, - .set_all = generic_set_all, - .get = generic_get_mtrr, - .get_free_region = generic_get_free_region, - .set = generic_set_mtrr, - .validate_add_page = generic_validate_add_page, - .have_wrcomb = generic_have_wrcomb, + .use_intel_if = 1, + .set_all = generic_set_all, + .get = generic_get_mtrr, + .get_free_region = generic_get_free_region, + .set = generic_set_mtrr, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = generic_have_wrcomb, }; diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index fb73a52..08b6ea4 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -1,27 +1,28 @@ -#include <linux/init.h> -#include <linux/proc_fs.h> #include <linux/capability.h> -#include <linux/ctype.h> -#include <linux/module.h> #include <linux/seq_file.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/init.h> #define LINE_SIZE 80 #include <asm/mtrr.h> + #include "mtrr.h" #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) static const char *const mtrr_strings[MTRR_NUM_TYPES] = { - "uncachable", /* 0 */ - "write-combining", /* 1 */ - "?", /* 2 */ - "?", /* 3 */ - "write-through", /* 4 */ - "write-protect", /* 5 */ - "write-back", /* 6 */ + "uncachable", /* 0 */ + "write-combining", /* 1 */ + "?", /* 2 */ + "?", /* 3 */ + "write-through", /* 4 */ + "write-protect", /* 5 */ + "write-back", /* 6 */ }; const char *mtrr_attrib_to_str(int x) @@ -35,8 +36,8 @@ static int mtrr_file_add(unsigned long base, unsigned long size, unsigned int type, bool increment, struct file *file, int page) { + unsigned int *fcount = FILE_FCOUNT(file); int reg, max; - unsigned int *fcount = FILE_FCOUNT(file); max = num_var_ranges; if (fcount == NULL) { @@ -61,8 +62,8 @@ static int mtrr_file_del(unsigned long base, unsigned long size, struct file *file, int page) { - int reg; unsigned int *fcount = FILE_FCOUNT(file); + int reg; if (!page) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) @@ -81,13 +82,14 @@ mtrr_file_del(unsigned long base, unsigned long size, return reg; } -/* RED-PEN: seq_file can seek now. this is ignored. */ +/* + * seq_file can seek but we ignore it. + * + * Format of control line: + * "base=%Lx size=%Lx type=%s" or "disable=%d" + */ static ssize_t mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) -/* Format of control line: - "base=%Lx size=%Lx type=%s" OR: - "disable=%d" -*/ { int i, err; unsigned long reg; @@ -100,15 +102,18 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return -EPERM; if (!len) return -EINVAL; + memset(line, 0, LINE_SIZE); if (len > LINE_SIZE) len = LINE_SIZE; if (copy_from_user(line, buf, len - 1)) return -EFAULT; + linelen = strlen(line); ptr = line + linelen - 1; if (linelen && *ptr == '\n') *ptr = '\0'; + if (!strncmp(line, "disable=", 8)) { reg = simple_strtoul(line + 8, &ptr, 0); err = mtrr_del_page(reg, 0, 0); @@ -116,28 +121,35 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return err; return len; } + if (strncmp(line, "base=", 5)) return -EINVAL; + base = simple_strtoull(line + 5, &ptr, 0); - for (; isspace(*ptr); ++ptr) ; + for (; isspace(*ptr); ++ptr) + ; + if (strncmp(ptr, "size=", 5)) return -EINVAL; + size = simple_strtoull(ptr + 5, &ptr, 0); if ((base & 0xfff) || (size & 0xfff)) return -EINVAL; - for (; isspace(*ptr); ++ptr) ; + for (; isspace(*ptr); ++ptr) + ; + if (strncmp(ptr, "type=", 5)) return -EINVAL; ptr += 5; - for (; isspace(*ptr); ++ptr) ; + for (; isspace(*ptr); ++ptr) + ; + for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) continue; base >>= PAGE_SHIFT; size >>= PAGE_SHIFT; - err = - mtrr_add_page((unsigned long) base, (unsigned long) size, i, - true); + err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true); if (err < 0) return err; return len; @@ -181,7 +193,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) case MTRRIOC32_SET_PAGE_ENTRY: case MTRRIOC32_DEL_PAGE_ENTRY: case MTRRIOC32_KILL_PAGE_ENTRY: { - struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)__arg; + struct mtrr_sentry32 __user *s32; + + s32 = (struct mtrr_sentry32 __user *)__arg; err = get_user(sentry.base, &s32->base); err |= get_user(sentry.size, &s32->size); err |= get_user(sentry.type, &s32->type); @@ -191,7 +205,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) } case MTRRIOC32_GET_ENTRY: case MTRRIOC32_GET_PAGE_ENTRY: { - struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; err = get_user(gentry.regnum, &g32->regnum); err |= get_user(gentry.base, &g32->base); err |= get_user(gentry.size, &g32->size); @@ -314,7 +330,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) if (err) return err; - switch(cmd) { + switch (cmd) { case MTRRIOC_GET_ENTRY: case MTRRIOC_GET_PAGE_ENTRY: if (copy_to_user(arg, &gentry, sizeof gentry)) @@ -323,7 +339,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #ifdef CONFIG_COMPAT case MTRRIOC32_GET_ENTRY: case MTRRIOC32_GET_PAGE_ENTRY: { - struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; err = put_user(gentry.base, &g32->base); err |= put_user(gentry.size, &g32->size); err |= put_user(gentry.regnum, &g32->regnum); @@ -335,11 +353,10 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) return err; } -static int -mtrr_close(struct inode *ino, struct file *file) +static int mtrr_close(struct inode *ino, struct file *file) { - int i, max; unsigned int *fcount = FILE_FCOUNT(file); + int i, max; if (fcount != NULL) { max = num_var_ranges; @@ -359,22 +376,22 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset); static int mtrr_open(struct inode *inode, struct file *file) { - if (!mtrr_if) + if (!mtrr_if) return -EIO; - if (!mtrr_if->get) - return -ENXIO; + if (!mtrr_if->get) + return -ENXIO; return single_open(file, mtrr_seq_show, NULL); } static const struct file_operations mtrr_fops = { - .owner = THIS_MODULE, - .open = mtrr_open, - .read = seq_read, - .llseek = seq_lseek, - .write = mtrr_write, - .unlocked_ioctl = mtrr_ioctl, - .compat_ioctl = mtrr_ioctl, - .release = mtrr_close, + .owner = THIS_MODULE, + .open = mtrr_open, + .read = seq_read, + .llseek = seq_lseek, + .write = mtrr_write, + .unlocked_ioctl = mtrr_ioctl, + .compat_ioctl = mtrr_ioctl, + .release = mtrr_close, }; static int mtrr_seq_show(struct seq_file *seq, void *offset) @@ -388,23 +405,24 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) max = num_var_ranges; for (i = 0; i < max; i++) { mtrr_if->get(i, &base, &size, &type); - if (size == 0) + if (size == 0) { mtrr_usage_table[i] = 0; - else { - if (size < (0x100000 >> PAGE_SHIFT)) { - /* less than 1MB */ - factor = 'K'; - size <<= PAGE_SHIFT - 10; - } else { - factor = 'M'; - size >>= 20 - PAGE_SHIFT; - } - /* RED-PEN: base can be > 32bit */ - len += seq_printf(seq, - "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", - i, base, base >> (20 - PAGE_SHIFT), size, factor, - mtrr_usage_table[i], mtrr_attrib_to_str(type)); + continue; } + if (size < (0x100000 >> PAGE_SHIFT)) { + /* less than 1MB */ + factor = 'K'; + size <<= PAGE_SHIFT - 10; + } else { + factor = 'M'; + size >>= 20 - PAGE_SHIFT; + } + /* Base can be > 32bit */ + len += seq_printf(seq, "reg%02i: base=0x%06lx000 " + "(%5luMB), size=%5lu%cB, count=%d: %s\n", + i, base, base >> (20 - PAGE_SHIFT), size, + factor, mtrr_usage_table[i], + mtrr_attrib_to_str(type)); } return 0; } @@ -422,6 +440,5 @@ static int __init mtrr_if_init(void) proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); return 0; } - arch_initcall(mtrr_if_init); #endif /* CONFIG_PROC_FS */ diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 8fc248b..84e83de 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -25,43 +25,49 @@ Operating System Writer's Guide" (Intel document number 242692), section 11.11.7 - This was cleaned and made readable by Patrick Mochel <mochel@osdl.org> - on 6-7 March 2002. - Source: Intel Architecture Software Developers Manual, Volume 3: + This was cleaned and made readable by Patrick Mochel <mochel@osdl.org> + on 6-7 March 2002. + Source: Intel Architecture Software Developers Manual, Volume 3: System Programming Guide; Section 9.11. (1997 edition - PPro). */ +#define DEBUG + +#include <linux/types.h> /* FIXME: kvm_para.h needs this */ + +#include <linux/kvm_para.h> +#include <linux/uaccess.h> #include <linux/module.h> +#include <linux/mutex.h> #include <linux/init.h> +#include <linux/sort.h> +#include <linux/cpu.h> #include <linux/pci.h> #include <linux/smp.h> -#include <linux/cpu.h> -#include <linux/mutex.h> -#include <linux/sort.h> +#include <asm/processor.h> #include <asm/e820.h> #include <asm/mtrr.h> -#include <asm/uaccess.h> -#include <asm/processor.h> #include <asm/msr.h> -#include <asm/kvm_para.h> + #include "mtrr.h" -u32 num_var_ranges = 0; +u32 num_var_ranges; unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; +static bool mtrr_aps_delayed_init; -static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; +static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; -struct mtrr_ops * mtrr_if = NULL; +struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(struct mtrr_ops * ops) +void set_mtrr_ops(struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; @@ -72,30 +78,36 @@ static int have_wrcomb(void) { struct pci_dev *dev; u8 rev; - - if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) { - /* ServerWorks LE chipsets < rev 6 have problems with write-combining - Don't allow it and leave room for other chipsets to be tagged */ + + dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); + if (dev != NULL) { + /* + * ServerWorks LE chipsets < rev 6 have problems with + * write-combining. Don't allow it and leave room for other + * chipsets to be tagged + */ if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); if (rev <= 5) { - printk(KERN_INFO "mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); + pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); pci_dev_put(dev); return 0; } } - /* Intel 450NX errata # 23. Non ascending cacheline evictions to - write combining memory may resulting in data corruption */ + /* + * Intel 450NX errata # 23. Non ascending cacheline evictions to + * write combining memory may resulting in data corruption + */ if (dev->vendor == PCI_VENDOR_ID_INTEL && dev->device == PCI_DEVICE_ID_INTEL_82451NX) { - printk(KERN_INFO "mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); + pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); pci_dev_put(dev); return 0; } pci_dev_put(dev); - } - return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0); + } + return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; } /* This function returns the number of variable MTRRs */ @@ -103,12 +115,13 @@ static void __init set_num_var_ranges(void) { unsigned long config = 0, dummy; - if (use_intel()) { + if (use_intel()) rdmsr(MSR_MTRRcap, config, dummy); - } else if (is_cpu(AMD)) + else if (is_cpu(AMD)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) config = 8; + num_var_ranges = config & 0xff; } @@ -130,10 +143,12 @@ struct set_mtrr_data { mtrr_type smp_type; }; +/** + * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * + * Returns nothing. + */ static void ipi_handler(void *info) -/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. - [RETURNS] Nothing. -*/ { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; @@ -142,18 +157,22 @@ static void ipi_handler(void *info) local_irq_save(flags); atomic_dec(&data->count); - while(!atomic_read(&data->gate)) + while (!atomic_read(&data->gate)) cpu_relax(); /* The master has cleared me to execute */ - if (data->smp_reg != ~0U) - mtrr_if->set(data->smp_reg, data->smp_base, + if (data->smp_reg != ~0U) { + mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - else + } else if (mtrr_aps_delayed_init) { + /* + * Initialize the MTRRs inaddition to the synchronisation. + */ mtrr_if->set_all(); + } atomic_dec(&data->count); - while(atomic_read(&data->gate)) + while (atomic_read(&data->gate)) cpu_relax(); atomic_dec(&data->count); @@ -161,7 +180,8 @@ static void ipi_handler(void *info) #endif } -static inline int types_compatible(mtrr_type type1, mtrr_type type2) { +static inline int types_compatible(mtrr_type type1, mtrr_type type2) +{ return type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE || (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || @@ -176,10 +196,10 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) { * @type: mtrr type * * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: - * + * * 1. Send IPI to do the following: * 2. Disable Interrupts - * 3. Wait for all procs to do so + * 3. Wait for all procs to do so * 4. Enter no-fill cache mode * 5. Flush caches * 6. Clear PGE bit @@ -189,26 +209,27 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) { * 10. Enable all range registers * 11. Flush all TLBs and caches again * 12. Enter normal cache mode and reenable caching - * 13. Set PGE + * 13. Set PGE * 14. Wait for buddies to catch up * 15. Enable interrupts. - * + * * What does that mean for us? Well, first we set data.count to the number * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait * until it hits 0 and proceed. We set the data.gate flag and reset data.count. - * Meanwhile, they are waiting for that flag to be set. Once it's set, each - * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it - * differently, so we call mtrr_if->set() callback and let them take care of it. - * When they're done, they again decrement data->count and wait for data.gate to - * be reset. - * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag. + * Meanwhile, they are waiting for that flag to be set. Once it's set, each + * CPU goes through the transition of updating MTRRs. + * The CPU vendors may each do it differently, + * so we call mtrr_if->set() callback and let them take care of it. + * When they're done, they again decrement data->count and wait for data.gate + * to be reset. + * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag * Everyone then enables interrupts and we all continue on. * * Note that the mechanism is the same for UP systems, too; all the SMP stuff * becomes nops. */ -static void set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) +static void +set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { struct set_mtrr_data data; unsigned long flags; @@ -218,121 +239,124 @@ static void set_mtrr(unsigned int reg, unsigned long base, data.smp_size = size; data.smp_type = type; atomic_set(&data.count, num_booting_cpus() - 1); - /* make sure data.count is visible before unleashing other CPUs */ + + /* Make sure data.count is visible before unleashing other CPUs */ smp_wmb(); - atomic_set(&data.gate,0); + atomic_set(&data.gate, 0); - /* Start the ball rolling on other CPUs */ + /* Start the ball rolling on other CPUs */ if (smp_call_function(ipi_handler, &data, 0) != 0) panic("mtrr: timed out waiting for other CPUs\n"); local_irq_save(flags); - while(atomic_read(&data.count)) + while (atomic_read(&data.count)) cpu_relax(); - /* ok, reset count and toggle gate */ + /* Ok, reset count and toggle gate */ atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate,1); + atomic_set(&data.gate, 1); - /* do our MTRR business */ + /* Do our MTRR business */ - /* HACK! + /* + * HACK! * We use this same function to initialize the mtrrs on boot. * The state of the boot cpu's mtrrs has been saved, and we want - * to replicate across all the APs. + * to replicate across all the APs. * If we're doing that @reg is set to something special... */ - if (reg != ~0U) - mtrr_if->set(reg,base,size,type); + if (reg != ~0U) + mtrr_if->set(reg, base, size, type); + else if (!mtrr_aps_delayed_init) + mtrr_if->set_all(); - /* wait for the others */ - while(atomic_read(&data.count)) + /* Wait for the others */ + while (atomic_read(&data.count)) cpu_relax(); atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate,0); + atomic_set(&data.gate, 0); /* * Wait here for everyone to have seen the gate change * So we're the last ones to touch 'data' */ - while(atomic_read(&data.count)) + while (atomic_read(&data.count)) cpu_relax(); local_irq_restore(flags); } /** - * mtrr_add_page - Add a memory type region - * @base: Physical base address of region in pages (in units of 4 kB!) - * @size: Physical size of region in pages (4 kB) - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region + * mtrr_add_page - Add a memory type region + * @base: Physical base address of region in pages (in units of 4 kB!) + * @size: Physical size of region in pages (4 kB) + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. * - * The available types are + * The available types are * - * %MTRR_TYPE_UNCACHABLE - No caching + * %MTRR_TYPE_UNCACHABLE - No caching * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. */ - -int mtrr_add_page(unsigned long base, unsigned long size, +int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, bool increment) { + unsigned long lbase, lsize; int i, replace, error; mtrr_type ltype; - unsigned long lbase, lsize; if (!mtrr_if) return -ENXIO; - - if ((error = mtrr_if->validate_add_page(base,size,type))) + + error = mtrr_if->validate_add_page(base, size, type); + if (error) return error; if (type >= MTRR_NUM_TYPES) { - printk(KERN_WARNING "mtrr: type: %u invalid\n", type); + pr_warning("mtrr: type: %u invalid\n", type); return -EINVAL; } - /* If the type is WC, check that this processor supports it */ + /* If the type is WC, check that this processor supports it */ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { - printk(KERN_WARNING - "mtrr: your processor doesn't support write-combining\n"); + pr_warning("mtrr: your processor doesn't support write-combining\n"); return -ENOSYS; } if (!size) { - printk(KERN_WARNING "mtrr: zero sized request\n"); + pr_warning("mtrr: zero sized request\n"); return -EINVAL; } if (base & size_or_mask || size & size_or_mask) { - printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); + pr_warning("mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } @@ -341,36 +365,40 @@ int mtrr_add_page(unsigned long base, unsigned long size, /* No CPU hotplug when we change MTRR entries */ get_online_cpus(); - /* Search for existing MTRR */ + + /* Search for existing MTRR */ mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); - if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase) + if (!lsize || base > lbase + lsize - 1 || + base + size - 1 < lbase) continue; - /* At this point we know there is some kind of overlap/enclosure */ + /* + * At this point we know there is some kind of + * overlap/enclosure + */ if (base < lbase || base + size - 1 > lbase + lsize - 1) { - if (base <= lbase && base + size - 1 >= lbase + lsize - 1) { + if (base <= lbase && + base + size - 1 >= lbase + lsize - 1) { /* New region encloses an existing region */ if (type == ltype) { replace = replace == -1 ? i : -2; continue; - } - else if (types_compatible(type, ltype)) + } else if (types_compatible(type, ltype)) continue; } - printk(KERN_WARNING - "mtrr: 0x%lx000,0x%lx000 overlaps existing" - " 0x%lx000,0x%lx000\n", base, size, lbase, - lsize); + pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing" + " 0x%lx000,0x%lx000\n", base, size, lbase, + lsize); goto out; } - /* New region is enclosed by an existing region */ + /* New region is enclosed by an existing region */ if (ltype != type) { if (types_compatible(type, ltype)) continue; - printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", - base, size, mtrr_attrib_to_str(ltype), - mtrr_attrib_to_str(type)); + pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", + base, size, mtrr_attrib_to_str(ltype), + mtrr_attrib_to_str(type)); goto out; } if (increment) @@ -378,7 +406,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = i; goto out; } - /* Search for an empty MTRR */ + /* Search for an empty MTRR */ i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { set_mtrr(i, base, size, type); @@ -393,8 +421,9 @@ int mtrr_add_page(unsigned long base, unsigned long size, mtrr_usage_table[replace] = 0; } } - } else - printk(KERN_INFO "mtrr: no more MTRRs available\n"); + } else { + pr_info("mtrr: no more MTRRs available\n"); + } error = i; out: mutex_unlock(&mtrr_mutex); @@ -405,10 +434,8 @@ int mtrr_add_page(unsigned long base, unsigned long size, static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - printk(KERN_WARNING - "mtrr: size and base must be multiples of 4 kiB\n"); - printk(KERN_DEBUG - "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); + pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); dump_stack(); return -1; } @@ -416,66 +443,64 @@ static int mtrr_check(unsigned long base, unsigned long size) } /** - * mtrr_add - Add a memory type region - * @base: Physical base address of region - * @size: Physical size of region - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region + * mtrr_add - Add a memory type region + * @base: Physical base address of region + * @size: Physical size of region + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. * - * The available types are + * The available types are * - * %MTRR_TYPE_UNCACHABLE - No caching + * %MTRR_TYPE_UNCACHABLE - No caching * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. */ - -int -mtrr_add(unsigned long base, unsigned long size, unsigned int type, - bool increment) +int mtrr_add(unsigned long base, unsigned long size, unsigned int type, + bool increment) { if (mtrr_check(base, size)) return -EINVAL; return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, increment); } +EXPORT_SYMBOL(mtrr_add); /** - * mtrr_del_page - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region + * mtrr_del_page - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. + * If register is supplied then base and size are ignored. This is + * how drivers should call it. * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. */ - int mtrr_del_page(int reg, unsigned long base, unsigned long size) { int i, max; @@ -500,22 +525,22 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) } } if (reg < 0) { - printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, - size); + pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n", + base, size); goto out; } } if (reg >= max) { - printk(KERN_WARNING "mtrr: register: %d too big\n", reg); + pr_warning("mtrr: register: %d too big\n", reg); goto out; } mtrr_if->get(reg, &lbase, &lsize, <ype); if (lsize < 1) { - printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); + pr_warning("mtrr: MTRR %d not used\n", reg); goto out; } if (mtrr_usage_table[reg] < 1) { - printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); + pr_warning("mtrr: reg: %d has count=0\n", reg); goto out; } if (--mtrr_usage_table[reg] < 1) @@ -526,33 +551,31 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) put_online_cpus(); return error; } + /** - * mtrr_del - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region + * mtrr_del - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. + * If register is supplied then base and size are ignored. This is + * how drivers should call it. * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. */ - -int -mtrr_del(int reg, unsigned long base, unsigned long size) +int mtrr_del(int reg, unsigned long base, unsigned long size) { if (mtrr_check(base, size)) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); } - -EXPORT_SYMBOL(mtrr_add); EXPORT_SYMBOL(mtrr_del); -/* HACK ALERT! +/* + * HACK ALERT! * These should be called implicitly, but we can't yet until all the initcall * stuff is done... */ @@ -576,29 +599,28 @@ struct mtrr_value { static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; -static int mtrr_save(struct sys_device * sysdev, pm_message_t state) +static int mtrr_save(struct sys_device *sysdev, pm_message_t state) { int i; for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, - &mtrr_value[i].lbase, - &mtrr_value[i].lsize, - &mtrr_value[i].ltype); + mtrr_if->get(i, &mtrr_value[i].lbase, + &mtrr_value[i].lsize, + &mtrr_value[i].ltype); } return 0; } -static int mtrr_restore(struct sys_device * sysdev) +static int mtrr_restore(struct sys_device *sysdev) { int i; for (i = 0; i < num_var_ranges; i++) { - if (mtrr_value[i].lsize) - set_mtrr(i, - mtrr_value[i].lbase, - mtrr_value[i].lsize, - mtrr_value[i].ltype); + if (mtrr_value[i].lsize) { + set_mtrr(i, mtrr_value[i].lbase, + mtrr_value[i].lsize, + mtrr_value[i].ltype); + } } return 0; } @@ -615,26 +637,29 @@ int __initdata changed_by_mtrr_cleanup; /** * mtrr_bp_init - initialize mtrrs on the boot CPU * - * This needs to be called early; before any of the other CPUs are + * This needs to be called early; before any of the other CPUs are * initialized (i.e. before smp_init()). - * + * */ void __init mtrr_bp_init(void) { u32 phys_addr; + init_ifs(); phys_addr = 32; if (cpu_has_mtrr) { mtrr_if = &generic_mtrr_ops; - size_or_mask = 0xff000000; /* 36 bits */ + size_or_mask = 0xff000000; /* 36 bits */ size_and_mask = 0x00f00000; phys_addr = 36; - /* This is an AMD specific MSR, but we assume(hope?) that - Intel will implement it to when they extend the address - bus of the Xeon. */ + /* + * This is an AMD specific MSR, but we assume(hope?) that + * Intel will implement it to when they extend the address + * bus of the Xeon. + */ if (cpuid_eax(0x80000000) >= 0x80000008) { phys_addr = cpuid_eax(0x80000008) & 0xff; /* CPUID workaround for Intel 0F33/0F34 CPU */ @@ -649,9 +674,11 @@ void __init mtrr_bp_init(void) size_and_mask = ~size_or_mask & 0xfffff00000ULL; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && boot_cpu_data.x86 == 6) { - /* VIA C* family have Intel style MTRRs, but - don't support PAE */ - size_or_mask = 0xfff00000; /* 32 bits */ + /* + * VIA C* family have Intel style MTRRs, + * but don't support PAE + */ + size_or_mask = 0xfff00000; /* 32 bits */ size_and_mask = 0; phys_addr = 32; } @@ -694,30 +721,28 @@ void __init mtrr_bp_init(void) changed_by_mtrr_cleanup = 1; mtrr_if->set_all(); } - } } } void mtrr_ap_init(void) { - unsigned long flags; - - if (!mtrr_if || !use_intel()) + if (!use_intel() || mtrr_aps_delayed_init) return; /* - * Ideally we should hold mtrr_mutex here to avoid mtrr entries changed, - * but this routine will be called in cpu boot time, holding the lock - * breaks it. This routine is called in two cases: 1.very earily time - * of software resume, when there absolutely isn't mtrr entry changes; - * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to - * prevent mtrr entry changes + * Ideally we should hold mtrr_mutex here to avoid mtrr entries + * changed, but this routine will be called in cpu boot time, + * holding the lock breaks it. + * + * This routine is called in two cases: + * + * 1. very earily time of software resume, when there absolutely + * isn't mtrr entry changes; + * + * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug + * lock to prevent mtrr entry changes */ - local_irq_save(flags); - - mtrr_if->set_all(); - - local_irq_restore(flags); + set_mtrr(~0U, 0, 0, 0); } /** @@ -728,23 +753,55 @@ void mtrr_save_state(void) smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } +void set_mtrr_aps_delayed_init(void) +{ + if (!use_intel()) + return; + + mtrr_aps_delayed_init = true; +} + +/* + * MTRR initialization for all AP's + */ +void mtrr_aps_init(void) +{ + if (!use_intel()) + return; + + set_mtrr(~0U, 0, 0, 0); + mtrr_aps_delayed_init = false; +} + +void mtrr_bp_restore(void) +{ + if (!use_intel()) + return; + + mtrr_if->set_all(); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) return 0; + if (use_intel()) { if (!changed_by_mtrr_cleanup) mtrr_state_warn(); - } else { - /* The CPUs haven't MTRR and seem to not support SMP. They have - * specific drivers, we use a tricky method to support - * suspend/resume for them. - * TBD: is there any system with such CPU which supports - * suspend/resume? if no, we should remove the code. - */ - sysdev_driver_register(&cpu_sysdev_class, - &mtrr_sysdev_driver); + return 0; } + + /* + * The CPU has no MTRR and seems to not support SMP. They have + * specific drivers, we use a tricky method to support + * suspend/resume for them. + * + * TBD: is there any system with such CPU which supports + * suspend/resume? If no, we should remove the code. + */ + sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); + return 0; } subsys_initcall(mtrr_init_finialize); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 7538b76..a501dee 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -1,5 +1,5 @@ /* - * local mtrr defines. + * local MTRR defines. */ #include <linux/types.h> @@ -14,13 +14,12 @@ extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; struct mtrr_ops { u32 vendor; u32 use_intel_if; -// void (*init)(void); void (*set)(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); void (*set_all)(void); void (*get)(unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type * type); + unsigned long *size, mtrr_type *type); int (*get_free_region)(unsigned long base, unsigned long size, int replace_reg); int (*validate_add_page)(unsigned long base, unsigned long size, @@ -39,11 +38,11 @@ extern int positive_have_wrcomb(void); /* library functions for processor-specific routines */ struct set_mtrr_context { - unsigned long flags; - unsigned long cr4val; - u32 deftype_lo; - u32 deftype_hi; - u32 ccr3; + unsigned long flags; + unsigned long cr4val; + u32 deftype_lo; + u32 deftype_hi; + u32 ccr3; }; void set_mtrr_done(struct set_mtrr_context *ctxt); @@ -54,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); void get_mtrr_state(void); -extern void set_mtrr_ops(struct mtrr_ops * ops); +extern void set_mtrr_ops(struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; -extern struct mtrr_ops * mtrr_if; +extern struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 1f5fb15..dfc80b4 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -1,24 +1,25 @@ -#include <linux/mm.h> #include <linux/init.h> -#include <asm/io.h> -#include <asm/mtrr.h> -#include <asm/msr.h> +#include <linux/io.h> +#include <linux/mm.h> + #include <asm/processor-cyrix.h> #include <asm/processor-flags.h> -#include "mtrr.h" +#include <asm/mtrr.h> +#include <asm/msr.h> +#include "mtrr.h" -/* Put the processor into a state where MTRRs can be safely set */ +/* Put the processor into a state where MTRRs can be safely set */ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) { unsigned int cr0; - /* Disable interrupts locally */ + /* Disable interrupts locally */ local_irq_save(ctxt->flags); if (use_intel() || is_cpu(CYRIX)) { - /* Save value of CR4 and clear Page Global Enable (bit 7) */ + /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { ctxt->cr4val = read_cr4(); write_cr4(ctxt->cr4val & ~X86_CR4_PGE); @@ -33,50 +34,61 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) write_cr0(cr0); wbinvd(); - if (use_intel()) - /* Save MTRR state */ + if (use_intel()) { + /* Save MTRR state */ rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - else - /* Cyrix ARRs - everything else were excluded at the top */ + } else { + /* + * Cyrix ARRs - + * everything else were excluded at the top + */ ctxt->ccr3 = getCx86(CX86_CCR3); + } } } void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) { - if (use_intel()) - /* Disable MTRRs, and set the default type to uncached */ + if (use_intel()) { + /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); - else if (is_cpu(CYRIX)) - /* Cyrix ARRs - everything else were excluded at the top */ - setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); + } else { + if (is_cpu(CYRIX)) { + /* Cyrix ARRs - everything else were excluded at the top */ + setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); + } + } } -/* Restore the processor after a set_mtrr_prepare */ +/* Restore the processor after a set_mtrr_prepare */ void set_mtrr_done(struct set_mtrr_context *ctxt) { if (use_intel() || is_cpu(CYRIX)) { - /* Flush caches and TLBs */ + /* Flush caches and TLBs */ wbinvd(); - /* Restore MTRRdefType */ - if (use_intel()) + /* Restore MTRRdefType */ + if (use_intel()) { /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - else - /* Cyrix ARRs - everything else was excluded at the top */ + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, + ctxt->deftype_hi); + } else { + /* + * Cyrix ARRs - + * everything else was excluded at the top + */ setCx86(CX86_CCR3, ctxt->ccr3); + } - /* Enable caches */ + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ + /* Restore value of CR4 */ if (cpu_has_pge) write_cr4(ctxt->cr4val); } - /* Re-enable interrupts locally (if enabled previously) */ + /* Re-enable interrupts locally (if enabled previously) */ local_irq_restore(ctxt->flags); } - diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 275bc14..2732e2c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -6,6 +6,7 @@ * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * * For licencing details see kernel-base/COPYING */ @@ -19,6 +20,8 @@ #include <linux/kdebug.h> #include <linux/sched.h> #include <linux/uaccess.h> +#include <linux/highmem.h> +#include <linux/cpu.h> #include <asm/apic.h> #include <asm/stacktrace.h> @@ -26,12 +29,52 @@ static u64 perf_counter_mask __read_mostly; +/* The maximal number of PEBS counters: */ +#define MAX_PEBS_COUNTERS 4 + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +/* The size of a per-cpu BTS buffer in bytes: */ +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024) + +/* The BTS overflow threshold in bytes from the end of the buffer: */ +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64) + + +/* + * Bits in the debugctlmsr controlling branch tracing. + */ +#define X86_DEBUGCTL_TR (1 << 6) +#define X86_DEBUGCTL_BTS (1 << 7) +#define X86_DEBUGCTL_BTINT (1 << 8) +#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) +#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; +}; + struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; int enabled; + struct debug_store *ds; }; /* @@ -54,8 +97,11 @@ struct x86_pmu { int num_counters_fixed; int counter_bits; u64 counter_mask; + int apic; u64 max_period; u64 intel_ctrl; + void (*enable_bts)(u64 config); + void (*disable_bts)(void); }; static struct x86_pmu x86_pmu __read_mostly; @@ -65,6 +111,52 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { }; /* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int event) +{ + return p6_perfmon_event_map[event]; +} + +/* + * Counter setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_COUNTER 0x0000002EULL + +static u64 p6_pmu_raw_event(u64 event) +{ +#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL +#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL +#define P6_EVNTSEL_INV_MASK 0x00800000ULL +#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL + +#define P6_EVNTSEL_MASK \ + (P6_EVNTSEL_EVENT_MASK | \ + P6_EVNTSEL_UNIT_MASK | \ + P6_EVNTSEL_EDGE_MASK | \ + P6_EVNTSEL_INV_MASK | \ + P6_EVNTSEL_COUNTER_MASK) + + return event & P6_EVNTSEL_MASK; +} + + +/* * Intel PerfMon v3. Used on Core2 and later. */ static const u64 intel_perfmon_event_map[] = @@ -389,23 +481,23 @@ static u64 intel_pmu_raw_event(u64 event) return event & CORE_EVNTSEL_MASK; } -static const u64 amd_0f_hw_cache_event_ids +static const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { [ C(L1D) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ }, }, [ C(L1I ) ] = { @@ -418,17 +510,17 @@ static const u64 amd_0f_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ [ C(RESULT_MISS) ] = 0, }, }, [ C(LL ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { @@ -438,8 +530,8 @@ static const u64 amd_0f_hw_cache_event_ids }, [ C(DTLB) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0, @@ -529,6 +621,9 @@ x86_perf_counter_update(struct perf_counter *counter, u64 prev_raw_count, new_raw_count; s64 delta; + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* * Careful: an NMI might modify the previous counter value. * @@ -566,6 +661,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; if (nmi_watchdog == NMI_LOCAL_APIC) @@ -580,9 +676,11 @@ static bool reserve_pmc_hardware(void) if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } +#endif return true; +#ifdef CONFIG_X86_LOCAL_APIC eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); @@ -597,10 +695,12 @@ perfctr_fail: enable_lapic_nmi_watchdog(); return false; +#endif } static void release_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; for (i = 0; i < x86_pmu.num_counters; i++) { @@ -610,12 +710,113 @@ static void release_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); +#endif +} + +static inline bool bts_available(void) +{ + return x86_pmu.enable_bts != NULL; +} + +static inline void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(unsigned long)ds), + (u32)((u64)(unsigned long)ds >> 32)); +} + +static inline void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_counters, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static void release_bts_hardware(void) +{ + int cpu; + + if (!bts_available()) + return; + + get_online_cpus(); + + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + + if (!ds) + continue; + + per_cpu(cpu_hw_counters, cpu).ds = NULL; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + kfree(ds); + } + + put_online_cpus(); +} + +static int reserve_bts_hardware(void) +{ + int cpu, err = 0; + + if (!bts_available()) + return 0; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + struct debug_store *ds; + void *buffer; + + err = -ENOMEM; + buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + break; + + ds = kzalloc(sizeof(*ds), GFP_KERNEL); + if (unlikely(!ds)) { + kfree(buffer); + break; + } + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = + ds->bts_buffer_base + BTS_BUFFER_SIZE; + ds->bts_interrupt_threshold = + ds->bts_absolute_maximum - BTS_OVFL_TH; + + per_cpu(cpu_hw_counters, cpu).ds = ds; + err = 0; + } + + if (err) + release_bts_hardware(); + else { + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); + + return err; } static void hw_perf_counter_destroy(struct perf_counter *counter) { if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { release_pmc_hardware(); + release_bts_hardware(); mutex_unlock(&pmc_reserve_mutex); } } @@ -658,6 +859,42 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) return 0; } +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + /* * Setup the hardware configuration for a given attr_type */ @@ -665,6 +902,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_attr *attr = &counter->attr; struct hw_perf_counter *hwc = &counter->hw; + u64 config; int err; if (!x86_pmu_initialized()) @@ -673,9 +911,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter) err = 0; if (!atomic_inc_not_zero(&active_counters)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) - err = -EBUSY; - else + if (atomic_read(&active_counters) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + err = reserve_bts_hardware(); + } + if (!err) atomic_inc(&active_counters); mutex_unlock(&pmc_reserve_mutex); } @@ -700,6 +942,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); + } else { + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * counters (user-space has to fall back and + * sample via a hrtimer based software counter): + */ + if (!x86_pmu.apic) + return -EOPNOTSUPP; } counter->destroy = hw_perf_counter_destroy; @@ -717,17 +968,68 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (attr->config >= x86_pmu.max_events) return -EINVAL; + /* * The generic map: */ - hwc->config |= x86_pmu.event_map(attr->config); + config = x86_pmu.event_map(attr->config); + + if (config == 0) + return -ENOENT; + + if (config == -1LL) + return -EINVAL; + + /* + * Branch tracing: + */ + if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && + (hwc->sample_period == 1)) { + /* BTS is not supported by this architecture. */ + if (!bts_available()) + return -EOPNOTSUPP; + + /* BTS is currently only allowed for user-mode. */ + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + return -EOPNOTSUPP; + } + + hwc->config |= config; return 0; } +static void p6_pmu_disable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + u64 val; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + static void intel_pmu_disable_all(void) { + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); } static void amd_pmu_disable_all(void) @@ -766,9 +1068,44 @@ void hw_perf_disable(void) return x86_pmu.disable_all(); } +static void p6_pmu_enable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long val; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + static void intel_pmu_enable_all(void) { + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_counter *counter = + cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!counter)) + return; + + intel_pmu_enable_bts(counter->hw.config); + } } static void amd_pmu_enable_all(void) @@ -783,13 +1120,13 @@ static void amd_pmu_enable_all(void) barrier(); for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_counter *counter = cpuc->counters[idx]; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) - continue; + + val = counter->hw.config; val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } @@ -818,16 +1155,13 @@ static inline void intel_pmu_ack_status(u64 ack) static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - int err; - err = checking_wrmsrl(hwc->config_base + idx, + (void)checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - int err; - err = checking_wrmsrl(hwc->config_base + idx, - hwc->config); + (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); } static inline void @@ -835,18 +1169,34 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; - int err; mask = 0xfULL << (idx * 4); rdmsrl(hwc->config_base, ctrl_val); ctrl_val &= ~mask; - err = checking_wrmsrl(hwc->config_base, ctrl_val); + (void)checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static inline void +p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + u64 val = P6_NOP_COUNTER; + + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); } static inline void intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc, idx); return; @@ -861,7 +1211,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) x86_pmu_disable_counter(hwc, idx); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -875,6 +1225,9 @@ x86_perf_counter_set_period(struct perf_counter *counter, s64 period = hwc->sample_period; int err, ret = 0; + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* * If we are way outside a reasoable range then just skip forward: */ @@ -900,7 +1253,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (left > x86_pmu.max_period) left = x86_pmu.max_period; - per_cpu(prev_left[idx], smp_processor_id()) = left; + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* * The hw counter starts counting from this counter offset, @@ -911,6 +1264,8 @@ x86_perf_counter_set_period(struct perf_counter *counter, err = checking_wrmsrl(hwc->counter_base + idx, (u64)(-left) & x86_pmu.counter_mask); + perf_counter_update_userpage(counter); + return ret; } @@ -940,8 +1295,29 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) err = checking_wrmsrl(hwc->config_base, ctrl_val); } +static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + u64 val; + + val = hwc->config; + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + + static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_counters).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_enable_fixed(hwc, idx); return; @@ -956,8 +1332,6 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) if (cpuc->enabled) x86_pmu_enable_counter(hwc, idx); - else - x86_pmu_disable_counter(hwc, idx); } static int @@ -965,17 +1339,15 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; - if (!x86_pmu.num_counters_fixed) - return -1; + event = hwc->config & ARCH_PERFMON_EVENT_MASK; - /* - * Quirk, IA32_FIXED_CTRs do not work on current Atom processors: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86_model == 28) - return -1; + if (unlikely((event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (hwc->sample_period == 1))) + return X86_PMC_IDX_FIXED_BTS; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; + if (!x86_pmu.num_counters_fixed) + return -1; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; @@ -997,7 +1369,15 @@ static int x86_pmu_enable(struct perf_counter *counter) int idx; idx = fixed_mode_idx(counter, hwc); - if (idx >= 0) { + if (idx == X86_PMC_IDX_FIXED_BTS) { + /* BTS is already occupied. */ + if (test_and_set_bit(idx, cpuc->used_mask)) + return -EAGAIN; + + hwc->config_base = 0; + hwc->counter_base = 0; + hwc->idx = idx; + } else if (idx >= 0) { /* * Try to get the fixed counter, if that is already taken * then try to get a generic counter: @@ -1040,6 +1420,8 @@ try_generic: x86_perf_counter_set_period(counter, hwc, idx); x86_pmu.enable(hwc, idx); + perf_counter_update_userpage(counter); + return 0; } @@ -1088,7 +1470,7 @@ void perf_counter_print_debug(void) rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); - prev_left = per_cpu(prev_left[idx], cpu); + prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); @@ -1106,6 +1488,44 @@ void perf_counter_print_debug(void) local_irq_restore(flags); } +static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, + struct perf_sample_data *data) +{ + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + unsigned long orig_ip = data->regs->ip; + struct bts_record *at, *top; + + if (!counter) + return; + + if (!ds) + return; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + ds->bts_index = ds->bts_buffer_base; + + for (; at < top; at++) { + data->regs->ip = at->from; + data->addr = at->to; + + perf_counter_output(counter, 1, data); + } + + data->regs->ip = orig_ip; + data->addr = 0; + + /* There's new data available. */ + counter->pending_kill = POLL_IN; +} + static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -1130,8 +1550,19 @@ static void x86_pmu_disable(struct perf_counter *counter) * that we are disabling: */ x86_perf_counter_update(counter, hwc, idx); + + /* Drain the remaining BTS records. */ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + struct perf_sample_data data; + struct pt_regs regs; + + data.regs = ®s; + intel_pmu_drain_bts_buffer(cpuc, &data); + } cpuc->counters[idx] = NULL; clear_bit(idx, cpuc->used_mask); + + perf_counter_update_userpage(counter); } /* @@ -1155,6 +1586,7 @@ static int intel_pmu_save_and_restart(struct perf_counter *counter) static void intel_pmu_reset(void) { + struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; unsigned long flags; int idx; @@ -1172,10 +1604,55 @@ static void intel_pmu_reset(void) for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); } + if (ds) + ds->bts_index = ds->bts_buffer_base; local_irq_restore(flags); } +static int p6_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_counters *cpuc; + struct perf_counter *counter; + struct hw_perf_counter *hwc; + int idx, handled = 0; + u64 val; + + data.regs = regs; + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_counters); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + counter = cpuc->counters[idx]; + hwc = &counter->hw; + + val = x86_perf_counter_update(counter, hwc, idx); + if (val & (1ULL << (x86_pmu.counter_bits - 1))) + continue; + + /* + * counter overflow + */ + handled = 1; + data.period = counter->hw.last_period; + + if (!x86_perf_counter_set_period(counter, hwc, idx)) + continue; + + if (perf_counter_overflow(counter, 1, &data)) + p6_pmu_disable_counter(hwc, idx); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} /* * This handler is triggered by the local APIC, so the APIC IRQ handling @@ -1185,16 +1662,16 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_counters *cpuc; - int bit, cpu, loops; + int bit, loops; u64 ack, status; data.regs = regs; data.addr = 0; - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + cpuc = &__get_cpu_var(cpu_hw_counters); perf_disable(); + intel_pmu_drain_bts_buffer(cpuc, &data); status = intel_pmu_get_status(); if (!status) { perf_enable(); @@ -1223,6 +1700,8 @@ again: if (!intel_pmu_save_and_restart(counter)) continue; + data.period = counter->hw.last_period; + if (perf_counter_overflow(counter, 1, &data)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -1247,14 +1726,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; - int cpu, idx, handled = 0; + int idx, handled = 0; u64 val; data.regs = regs; data.addr = 0; - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + cpuc = &__get_cpu_var(cpu_hw_counters); for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) @@ -1297,18 +1775,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) void set_perf_counter_pending(void) { +#ifdef CONFIG_X86_LOCAL_APIC apic->send_IPI_self(LOCAL_PENDING_VECTOR); +#endif } void perf_counters_lapic_init(void) { - if (!x86_pmu_initialized()) +#ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif } static int __kprobes @@ -1332,7 +1814,9 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; +#ifdef CONFIG_X86_LOCAL_APIC apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif /* * Can't rely on the handled return value to say it was our NMI, two * counters could trigger 'simultaneously' raising two back-to-back NMIs. @@ -1351,6 +1835,33 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; +static struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = p6_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_counter, + .disable = p6_pmu_disable_counter, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .raw_event = p6_pmu_raw_event, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_counters = 2, + /* + * Counters have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a counter for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .counter_bits = 32, + .counter_mask = (1ULL << 32) - 1, +}; + static struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -1363,12 +1874,15 @@ static struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of * the generic counter period: */ .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, }; static struct x86_pmu amd_pmu = { @@ -1386,10 +1900,43 @@ static struct x86_pmu amd_pmu = { .num_counters = 4, .counter_bits = 48, .counter_mask = (1ULL << 48) - 1, + .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, }; +static int p6_pmu_init(void) +{ + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + break; + case 9: + case 13: + /* Pentium M */ + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + if (!cpu_has_apic) { + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); + x86_pmu.apic = 0; + } + + return 0; +} + static int intel_pmu_init(void) { union cpuid10_edx edx; @@ -1398,8 +1945,14 @@ static int intel_pmu_init(void) unsigned int ebx; int version; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + /* check for P6 processor family */ + if (boot_cpu_data.x86 == 6) { + return p6_pmu_init(); + } else { return -ENODEV; + } + } /* * Check whether the Architectural PerfMon supports @@ -1425,8 +1978,6 @@ static int intel_pmu_init(void) */ x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); - /* * Install the hw-cache-events table: */ @@ -1459,18 +2010,16 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + x86_pmu = amd_pmu; - switch (boot_cpu_data.x86) { - case 0x0f: - case 0x10: - case 0x11: - memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); - pr_cont("AMD Family 0f/10/11 events, "); - break; - } return 0; } @@ -1498,21 +2047,22 @@ void __init init_hw_perf_counters(void) pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", x86_pmu.num_counters, X86_PMC_MAX_GENERIC); + x86_pmu.num_counters = X86_PMC_MAX_GENERIC; } perf_counter_mask = (1 << x86_pmu.num_counters) - 1; perf_max_counters = x86_pmu.num_counters; if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; } perf_counter_mask |= ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_counter_mask; perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); @@ -1554,14 +2104,15 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) */ static inline -void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) +void callchain_store(struct perf_callchain_entry *entry, u64 ip) { - if (entry->nr < MAX_STACK_DEPTH) + if (entry->nr < PERF_MAX_STACK_DEPTH) entry->ip[entry->nr++] = ip; } -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); +static DEFINE_PER_CPU(int, in_nmi_frame); static void @@ -1577,14 +2128,19 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - /* Don't bother with IRQ stacks for now */ - return -1; + per_cpu(in_nmi_frame, smp_processor_id()) = + x86_is_stack_id(NMI_STACK, name); + + return 0; } static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; + if (per_cpu(in_nmi_frame, smp_processor_id())) + return; + if (reliable) callchain_store(entry, addr); } @@ -1596,47 +2152,59 @@ static const struct stacktrace_ops backtrace_ops = { .address = backtrace_address, }; +#include "../dumpstack.h" + static void perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { - unsigned long bp; - char *stack; - int nr = entry->nr; + callchain_store(entry, PERF_CONTEXT_KERNEL); + callchain_store(entry, regs->ip); - callchain_store(entry, instruction_pointer(regs)); + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); +} - stack = ((char *)regs + sizeof(struct pt_regs)); -#ifdef CONFIG_FRAME_POINTER - bp = frame_pointer(regs); -#else - bp = 0; -#endif +/* + * best effort, GUP based copy_from_user() that assumes IRQ or NMI context + */ +static unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long offset, addr = (unsigned long)from; + int type = in_nmi() ? KM_NMI : KM_IRQ0; + unsigned long size, len = 0; + struct page *page; + void *map; + int ret; - dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; - entry->kernel = entry->nr - nr; -} + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); + map = kmap_atomic(page, type); + memcpy(to, map+offset, size); + kunmap_atomic(map, type); + put_page(page); -struct stack_frame { - const void __user *next_fp; - unsigned long return_address; -}; + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) { - int ret; + unsigned long bytes; - if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) - return 0; + bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); - ret = 1; - pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) - ret = 0; - pagefault_enable(); - - return ret; + return bytes == sizeof(*frame); } static void @@ -1644,28 +2212,28 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) { struct stack_frame frame; const void __user *fp; - int nr = entry->nr; - regs = (struct pt_regs *)current->thread.sp0 - 1; - fp = (void __user *)regs->bp; + if (!user_mode(regs)) + regs = task_pt_regs(current); + fp = (void __user *)regs->bp; + + callchain_store(entry, PERF_CONTEXT_USER); callchain_store(entry, regs->ip); - while (entry->nr < MAX_STACK_DEPTH) { - frame.next_fp = NULL; + while (entry->nr < PERF_MAX_STACK_DEPTH) { + frame.next_frame = NULL; frame.return_address = 0; if (!copy_stack_frame(fp, &frame)) break; - if ((unsigned long)fp < user_stack_pointer(regs)) + if ((unsigned long)fp < regs->sp) break; callchain_store(entry, frame.return_address); - fp = frame.next_fp; + fp = frame.next_frame; } - - entry->user = entry->nr - nr; } static void @@ -1696,16 +2264,18 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_callchain_entry *entry; if (in_nmi()) - entry = &__get_cpu_var(nmi_entry); + entry = &__get_cpu_var(pmc_nmi_entry); else - entry = &__get_cpu_var(irq_entry); + entry = &__get_cpu_var(pmc_irq_entry); entry->nr = 0; - entry->hv = 0; - entry->kernel = 0; - entry->user = 0; perf_do_callchain(regs, entry); return entry; } + +void hw_perf_counter_setup_online(int cpu) +{ + init_debug_store_on_cpu(cpu); +} diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 5c481f6..392bea4 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -68,16 +68,16 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) /* returns the bit offset of the performance counter register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return (msr - MSR_K7_PERFCTR0); + return msr - MSR_K7_PERFCTR0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_PERFCTR0); + return msr - MSR_ARCH_PERFMON_PERFCTR0; switch (boot_cpu_data.x86) { case 6: - return (msr - MSR_P6_PERFCTR0); + return msr - MSR_P6_PERFCTR0; case 15: - return (msr - MSR_P4_BPU_PERFCTR0); + return msr - MSR_P4_BPU_PERFCTR0; } } return 0; @@ -92,16 +92,16 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) /* returns the bit offset of the event selection register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return (msr - MSR_K7_EVNTSEL0); + return msr - MSR_K7_EVNTSEL0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_EVENTSEL0); + return msr - MSR_ARCH_PERFMON_EVENTSEL0; switch (boot_cpu_data.x86) { case 6: - return (msr - MSR_P6_EVNTSEL0); + return msr - MSR_P6_EVNTSEL0; case 15: - return (msr - MSR_P4_BSU_ESCR0); + return msr - MSR_P4_BSU_ESCR0; } } return 0; @@ -113,7 +113,7 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) { BUG_ON(counter > NMI_MAX_COUNTER_BITS); - return (!test_bit(counter, perfctr_nmi_owner)); + return !test_bit(counter, perfctr_nmi_owner); } /* checks the an msr for availability */ @@ -124,7 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr) counter = nmi_perfctr_msr_to_bit(msr); BUG_ON(counter > NMI_MAX_COUNTER_BITS); - return (!test_bit(counter, perfctr_nmi_owner)); + return !test_bit(counter, perfctr_nmi_owner); } EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); @@ -237,7 +237,7 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz) */ counter_val = (u64)cpu_khz * 1000; do_div(counter_val, retval); - if (counter_val > 0x7fffffffULL) { + if (counter_val > 0x7fffffffULL) { u64 count = (u64)cpu_khz * 1000; do_div(count, 0x7fffffffUL); retval = count + 1; @@ -251,7 +251,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr, u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); - if(descr) + if (descr) pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsrl(perfctr_msr, 0 - count); } @@ -262,7 +262,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr, u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); - if(descr) + if (descr) pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsr(perfctr_msr, (u32)(-count), 0); } @@ -296,7 +296,7 @@ static int setup_k7_watchdog(unsigned nmi_hz) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); + write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz); /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; @@ -387,7 +387,7 @@ static int setup_p6_watchdog(unsigned nmi_hz) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); + write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz); /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; @@ -415,7 +415,7 @@ static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) apic_write(APIC_LVTPC, APIC_DM_NMI); /* P6/ARCH_PERFMON has 32 bit counter write */ - write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); + write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); } static const struct wd_ops p6_wd_ops = { @@ -490,9 +490,9 @@ static int setup_p4_watchdog(unsigned nmi_hz) if (smp_num_siblings == 2) { unsigned int ebx, apicid; - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; } else #endif ht_num = 0; @@ -544,7 +544,7 @@ static int setup_p4_watchdog(unsigned nmi_hz) } evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS + | P4_ESCR_OS | P4_ESCR_USR; cccr_val |= P4_CCCR_THRESHOLD(15) @@ -612,7 +612,7 @@ static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { unsigned dummy; /* - * P4 quirks: + * P4 quirks: * - An overflown perfctr will assert its interrupt * until the OVF flag in its CCCR is cleared. * - LVTPC is masked on interrupt and must be @@ -662,7 +662,8 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) * NOTE: Corresponding bit = 0 in ebx indicates event present. */ cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + if ((eax.split.mask_length < + (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) return 0; @@ -803,8 +804,3 @@ int __kprobes lapic_wd_event(unsigned nmi_hz) wd_ops->rearm(wd, nmi_hz); return 1; } - -int lapic_watchdog_ok(void) -{ - return wd_ops != NULL; -} diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index d5e3039..62ac8cb 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -116,11 +116,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); #endif seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); -#ifdef CONFIG_X86_64 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); -#endif seq_printf(m, "power management:"); for (i = 0; i < 32; i++) { @@ -128,7 +126,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (i < ARRAY_SIZE(x86_power_flags) && x86_power_flags[i]) seq_printf(m, "%s%s", - x86_power_flags[i][0]?" ":"", + x86_power_flags[i][0] ? " " : "", x86_power_flags[i]); else seq_printf(m, " [%d]", i); diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c new file mode 100644 index 0000000..a640ae5 --- /dev/null +++ b/arch/x86/kernel/cpu/sched.c @@ -0,0 +1,55 @@ +#include <linux/sched.h> +#include <linux/math64.h> +#include <linux/percpu.h> +#include <linux/irqflags.h> + +#include <asm/cpufeature.h> +#include <asm/processor.h> + +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched); + +static unsigned long scale_aperfmperf(void) +{ + struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched); + unsigned long ratio, flags; + + local_irq_save(flags); + get_aperfmperf(&val); + local_irq_restore(flags); + + ratio = calc_aperfmperf_ratio(old, &val); + *old = val; + + return ratio; +} + +unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + /* + * do aperf/mperf on the cpu level because it includes things + * like turbo mode, which are relevant to full cores. + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return scale_aperfmperf(); + + /* + * maybe have something cpufreq here + */ + + return default_scale_freq_power(sd, cpu); +} + +unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + /* + * aperf/mperf already includes the smt gain + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return SCHED_LOAD_SCALE; + + return default_scale_smt_power(sd, cpu); +} + +#endif diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 284c399..bc24f51 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -49,17 +49,17 @@ static inline int __vmware_platform(void) static unsigned long __vmware_get_tsc_khz(void) { - uint64_t tsc_hz; - uint32_t eax, ebx, ecx, edx; + uint64_t tsc_hz; + uint32_t eax, ebx, ecx, edx; - VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (ebx == UINT_MAX) - return 0; - tsc_hz = eax | (((uint64_t)ebx) << 32); - do_div(tsc_hz, 1000); - BUG_ON(tsc_hz >> 32); - return tsc_hz; + if (ebx == UINT_MAX) + return 0; + tsc_hz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_hz, 1000); + BUG_ON(tsc_hz >> 32); + return tsc_hz; } /* diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2ac1f0c..b07af88 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -182,6 +182,11 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = .notifier_call = cpuid_class_cpu_callback, }; +static char *cpuid_nodename(struct device *dev) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); +} + static int __init cpuid_init(void) { int i, err = 0; @@ -198,6 +203,7 @@ static int __init cpuid_init(void) err = PTR_ERR(cpuid_class); goto out_chrdev; } + cpuid_class->nodename = cpuid_nodename; for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index b4f14c6..37250fe 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -27,9 +27,7 @@ static void doublefault_fn(void) if (ptr_ok(gdt)) { gdt += GDT_ENTRY_TSS << 3; - tss = *(u16 *)(gdt+2); - tss += *(u8 *)(gdt+4) << 16; - tss += *(u8 *)(gdt+7) << 24; + tss = get_desc_base((struct desc_struct *)gdt); printk(KERN_EMERG "double fault, tss at %08lx\n", tss); if (ptr_ok(tss)) { diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 48bfe13..ef42a03 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -509,15 +509,15 @@ enum bts_field { bts_escape = ((unsigned long)-1 & ~bts_qual_mask) }; -static inline unsigned long bts_get(const char *base, enum bts_field field) +static inline unsigned long bts_get(const char *base, unsigned long field) { base += (ds_cfg.sizeof_ptr_field * field); return *(unsigned long *)base; } -static inline void bts_set(char *base, enum bts_field field, unsigned long val) +static inline void bts_set(char *base, unsigned long field, unsigned long val) { - base += (ds_cfg.sizeof_ptr_field * field);; + base += (ds_cfg.sizeof_ptr_field * field); (*(unsigned long *)base) = val; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 95ea5fa..2d8a371 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -15,13 +15,13 @@ #include <linux/bug.h> #include <linux/nmi.h> #include <linux/sysfs.h> -#include <linux/ftrace.h> #include <asm/stacktrace.h> #include "dumpstack.h" int panic_on_unrecovered_nmi; +int panic_on_io_nmi; unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index d593cd1..bca5fba 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -19,6 +19,12 @@ #include "dumpstack.h" +/* Just a stub for now */ +int x86_is_stack_id(int id, char *name) +{ + return 0; +} + void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index d35db59..54b0a32 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -19,10 +19,8 @@ #include "dumpstack.h" -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) -{ - static char ids[][8] = { + +static char x86_stack_ids[][8] = { [DEBUG_STACK - 1] = "#DB", [NMI_STACK - 1] = "NMI", [DOUBLEFAULT_STACK - 1] = "#DF", @@ -33,6 +31,15 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" #endif }; + +int x86_is_stack_id(int id, char *name) +{ + return x86_stack_ids[id - 1] == name; +} + +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, + unsigned *usedp, char **idp) +{ unsigned k; /* @@ -61,7 +68,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, if (*usedp & (1U << k)) break; *usedp |= 1U << k; - *idp = ids[k]; + *idp = x86_stack_ids[k]; return (unsigned long *)end; } /* @@ -81,12 +88,13 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, do { ++j; end -= EXCEPTION_STKSZ; - ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); + x86_stack_ids[j][4] = '1' + + (j - N_EXCEPTION_STACKS); } while (stack < end - EXCEPTION_STKSZ); if (*usedp & (1U << j)) break; *usedp |= 1U << j; - *idp = ids[j]; + *idp = x86_stack_ids[j]; return (unsigned long *)end; } #endif diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7271fa3..147005a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -115,7 +115,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, { int x = e820x->nr_map; - if (x == ARRAY_SIZE(e820x->map)) { + if (x >= ARRAY_SIZE(e820x->map)) { printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); return; } @@ -627,10 +627,9 @@ __init void e820_setup_gap(void) #ifdef CONFIG_X86_64 if (!found) { gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " - "address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource " - "registers may break!\n"); + printk(KERN_ERR + "PCI: Warning: Cannot find a gap in the 32bit address range\n" + "PCI: Unassigned devices with 32bit resource registers may break!\n"); } #endif @@ -1383,6 +1382,8 @@ static unsigned long ram_alignment(resource_size_t pos) return 32*1024*1024; } +#define MAX_RESOURCE_SIZE ((resource_size_t)-1) + void __init e820_reserve_resources_late(void) { int i; @@ -1400,17 +1401,19 @@ void __init e820_reserve_resources_late(void) * avoid stolen RAM: */ for (i = 0; i < e820.nr_map; i++) { - struct e820entry *entry = &e820_saved.map[i]; - resource_size_t start, end; + struct e820entry *entry = &e820.map[i]; + u64 start, end; if (entry->type != E820_RAM) continue; start = entry->addr + entry->size; - end = round_up(start, ram_alignment(start)); - if (start == end) + end = round_up(start, ram_alignment(start)) - 1; + if (end > MAX_RESOURCE_SIZE) + end = MAX_RESOURCE_SIZE; + if (start >= end) continue; - reserve_region_with_split(&iomem_resource, start, - end - 1, "RAM buffer"); + reserve_region_with_split(&iomem_resource, start, end, + "RAM buffer"); } } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 96f7ac0..fe26ba3 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -354,7 +354,7 @@ void __init efi_init(void) */ c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); if (c16) { - for (i = 0; i < sizeof(vendor) && *c16; ++i) + for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) vendor[i] = *c16++; vendor[i] = '\0'; } else @@ -512,7 +512,7 @@ void __init efi_enter_virtual_mode(void) && end_pfn <= max_pfn_mapped)) va = __va(md->phys_addr); else - va = efi_ioremap(md->phys_addr, size); + va = efi_ioremap(md->phys_addr, size, md->type); md->virt_addr = (u64) (unsigned long) va; diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 22c3b78..ac0621a 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void) early_runtime_code_mapping_set_exec(0); } -void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) +void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, + u32 type) { unsigned long last_map_pfn; + if (type == EFI_MEMORY_MAPPED_IO) + return ioremap(phys_addr, size); + last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) return NULL; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c929add..c097e7d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -48,7 +48,6 @@ #include <asm/segment.h> #include <asm/smp.h> #include <asm/page_types.h> -#include <asm/desc.h> #include <asm/percpu.h> #include <asm/dwarf2.h> #include <asm/processor-flags.h> @@ -84,7 +83,7 @@ #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else #define preempt_stop(clobbers) -#define resume_kernel restore_nocheck +#define resume_kernel restore_all #endif .macro TRACE_IRQS_IRET @@ -372,7 +371,7 @@ END(ret_from_exception) ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? - jnz restore_nocheck + jnz restore_all need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl @@ -540,6 +539,8 @@ syscall_exit: jne syscall_exit_work restore_all: + TRACE_IRQS_IRET +restore_all_notrace: movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS # Warning: PT_OLDSS(%esp) contains the wrong/random values if we # are returning to the kernel. @@ -551,8 +552,6 @@ restore_all: CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS restore_nocheck: - TRACE_IRQS_IRET -restore_nocheck_notrace: RESTORE_REGS 4 # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 irq_return: @@ -588,22 +587,34 @@ ldt_ss: jne restore_nocheck #endif - /* If returning to userspace with 16bit stack, - * try to fix the higher word of ESP, as the CPU - * won't restore it. - * This is an "official" bug of all the x86-compatible - * CPUs, which we can try to work around to make - * dosemu and wine happy. */ - movl PT_OLDESP(%esp), %eax - movl %esp, %edx - call patch_espfix_desc +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + PER_CPU(gdt_page, %ebx) + shr $16, %edx + mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ + mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ pushl $__ESPFIX_SS CFI_ADJUST_CFA_OFFSET 4 - pushl %eax + push %eax /* new kernel esp */ CFI_ADJUST_CFA_OFFSET 4 + /* Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the iret */ DISABLE_INTERRUPTS(CLBR_EAX) - TRACE_IRQS_OFF - lss (%esp), %esp + lss (%esp), %esp /* switch to espfix segment */ CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck CFI_ENDPROC @@ -716,15 +727,24 @@ PTREGSCALL(vm86) PTREGSCALL(vm86old) .macro FIXUP_ESPFIX_STACK - /* since we are on a wrong stack, we cant make it a C code :( */ +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. + */ + /* fixup the stack */ PER_CPU(gdt_page, %ebx) - GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) - addl %esp, %eax + mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ + mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS CFI_ADJUST_CFA_OFFSET 4 pushl %eax CFI_ADJUST_CFA_OFFSET 4 - lss (%esp), %esp + lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 .endm .macro UNWIND_ESPFIX_STACK @@ -1154,6 +1174,7 @@ ENTRY(ftrace_graph_caller) pushl %edx movl 0xc(%esp), %edx lea 0x4(%ebp), %eax + movl (%ebp), %ecx subl $MCOUNT_INSN_SIZE, %edx call prepare_ftrace_return popl %edx @@ -1168,6 +1189,7 @@ return_to_handler: pushl %eax pushl %ecx pushl %edx + movl %ebp, %eax call ftrace_return_to_handler movl %eax, 0xc(%esp) popl %edx @@ -1329,7 +1351,7 @@ nmi_stack_correct: xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - jmp restore_nocheck_notrace + jmp restore_all_notrace CFI_ENDPROC nmi_stack_fixup: diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index de74f0a..d59fe32 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -135,6 +135,7 @@ ENTRY(ftrace_graph_caller) leaq 8(%rbp), %rdi movq 0x38(%rsp), %rsi + movq (%rbp), %rdx subq $MCOUNT_INSN_SIZE, %rsi call prepare_ftrace_return @@ -145,18 +146,19 @@ ENTRY(ftrace_graph_caller) END(ftrace_graph_caller) GLOBAL(return_to_handler) - subq $80, %rsp + subq $24, %rsp /* Save the return values */ movq %rax, (%rsp) movq %rdx, 8(%rsp) + movq %rbp, %rdi call ftrace_return_to_handler - movq %rax, 72(%rsp) + movq %rax, 16(%rsp) movq 8(%rsp), %rdx movq (%rsp), %rax - addq $72, %rsp + addq $16, %rsp retq #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index b79c553..9dbb527 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -408,7 +408,8 @@ int ftrace_disable_ftrace_graph_caller(void) * Hook the return address and push it in the stack of return addrs * in current thread info. */ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, + unsigned long frame_pointer) { unsigned long old; int faulted; @@ -416,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) unsigned long return_hooker = (unsigned long) &return_to_handler; - /* Nmi's are currently unsupported */ - if (unlikely(in_nmi())) - return; - if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; @@ -453,7 +450,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { + if (ftrace_push_return_trace(old, self_addr, &trace.depth, + frame_pointer) == -EBUSY) { *parent = old; return; } @@ -496,37 +494,56 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) struct syscall_metadata *syscall_nr_to_meta(int nr) { - if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) return NULL; return syscalls_metadata[nr]; } -void arch_init_ftrace_syscalls(void) +int syscall_name_to_nr(char *name) +{ + int i; + + if (!syscalls_metadata) + return -1; + + for (i = 0; i < NR_syscalls; i++) { + if (syscalls_metadata[i]) { + if (!strcmp(syscalls_metadata[i]->name, name)) + return i; + } + } + return -1; +} + +void set_syscall_enter_id(int num, int id) +{ + syscalls_metadata[num]->enter_id = id; +} + +void set_syscall_exit_id(int num, int id) +{ + syscalls_metadata[num]->exit_id = id; +} + +static int __init arch_init_ftrace_syscalls(void) { int i; struct syscall_metadata *meta; unsigned long **psys_syscall_table = &sys_call_table; - static atomic_t refs; - - if (atomic_inc_return(&refs) != 1) - goto end; syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - FTRACE_SYSCALL_MAX, GFP_KERNEL); + NR_syscalls, GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); - return; + return -ENOMEM; } - for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { + for (i = 0; i < NR_syscalls; i++) { meta = find_syscall_meta(psys_syscall_table[i]); syscalls_metadata[i] = meta; } - return; - - /* Paranoid: avoid overflow */ -end: - atomic_dec(&refs); + return 0; } +arch_initcall(arch_init_ftrace_syscalls); #endif diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index dc5ed4b..7ffec6b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -13,7 +13,6 @@ #include <asm/segment.h> #include <asm/page_types.h> #include <asm/pgtable_types.h> -#include <asm/desc.h> #include <asm/cache.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> @@ -262,9 +261,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); * which will be freed later */ -#ifndef CONFIG_HOTPLUG_CPU -.section .init.text,"ax",@progbits -#endif +__CPUINIT #ifdef CONFIG_SMP ENTRY(startup_32_smp) @@ -442,7 +439,6 @@ is386: movl $2,%ecx # set MP jne 1f movl $per_cpu__gdt_page,%eax movl $per_cpu__stack_canary,%ecx - subl $20, %ecx movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) shrl $16, %ecx movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) @@ -603,7 +599,7 @@ ignore_int: #endif iret -.section .cpuinit.data,"wa" + __REFDATA .align 4 ENTRY(initial_code) .long i386_start_kernel diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 54b29bb..fa54f78 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -12,7 +12,6 @@ #include <linux/linkage.h> #include <linux/threads.h> #include <linux/init.h> -#include <asm/desc.h> #include <asm/segment.h> #include <asm/pgtable.h> #include <asm/page.h> diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index c2e0bb0..5cf36c0 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -7,6 +7,7 @@ #include <linux/spinlock.h> #include <linux/jiffies.h> #include <linux/module.h> +#include <linux/timex.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/io.h> diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index df3bf26..270ff83 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -12,7 +12,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); /* * Initial thread structure. diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 3b09634..7d35d0f 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -218,7 +218,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) void fixup_irqs(void) { unsigned int irq; - static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { @@ -236,8 +235,8 @@ void fixup_irqs(void) } if (desc->chip->set_affinity) desc->chip->set_affinity(irq, affinity); - else if (desc->action && !(warned++)) - printk("Cannot set affinity for irq %i\n", irq); + else if (desc->action) + printk_once("Cannot set affinity for irq %i\n", irq); } #if 0 diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 8a194ad..ccf8ab5 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -187,7 +187,7 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_THERMAL_VECTOR alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif -#ifdef CONFIG_X86_THRESHOLD +#ifdef CONFIG_X86_MCE_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a78ecad..63b0ec8 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -34,7 +34,6 @@ struct kvm_para_state { u8 mmu_queue[MMU_QUEUE_SIZE]; int mmu_queue_len; - enum paravirt_lazy_mode mode; }; static DEFINE_PER_CPU(struct kvm_para_state, para_state); @@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len) { struct kvm_para_state *state = kvm_para_state(); - if (state->mode != PARAVIRT_LAZY_MMU) { + if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { kvm_mmu_op(buffer, len); return; } @@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn) static void kvm_enter_lazy_mmu(void) { - struct kvm_para_state *state = kvm_para_state(); - paravirt_enter_lazy_mmu(); - state->mode = paravirt_get_lazy_mode(); } static void kvm_leave_lazy_mmu(void) @@ -197,10 +193,9 @@ static void kvm_leave_lazy_mmu(void) mmu_queue_flush(state); paravirt_leave_lazy_mmu(); - state->mode = paravirt_get_lazy_mode(); } -static void paravirt_ops_setup(void) +static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; pv_info.paravirt_enabled = 1; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 223af43..e5efcdc 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -50,8 +50,8 @@ static unsigned long kvm_get_wallclock(void) struct timespec ts; int low, high; - low = (int)__pa(&wall_clock); - high = ((u64)__pa(&wall_clock) >> 32); + low = (int)__pa_symbol(&wall_clock); + high = ((u64)__pa_symbol(&wall_clock) >> 32); native_write_msr(MSR_KVM_WALL_CLOCK, low, high); vcpu_time = &get_cpu_var(hv_clock); diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 846510b..2a62d84 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -347,7 +347,7 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id) static struct irqaction mfgptirq = { .handler = mfgpt_tick, - .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, .name = "mfgpt-timer" }; diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9c44615..9371448 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -236,6 +236,7 @@ static const struct file_operations microcode_fops = { static struct miscdevice microcode_dev = { .minor = MICROCODE_MINOR, .name = "microcode", + .devnode = "cpu/microcode", .fops = µcode_fops, }; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 651c93b..fcd513b 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -482,11 +482,11 @@ static void __init construct_ioapic_table(int mpc_default_type) MP_bus_info(&bus); } - ioapic.type = MP_IOAPIC; - ioapic.apicid = 2; - ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; - ioapic.flags = MPC_APIC_USABLE; - ioapic.apicaddr = 0xFEC00000; + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; MP_ioapic_info(&ioapic); /* diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 3cf3413..7dd9500 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -1,6 +1,7 @@ /* ----------------------------------------------------------------------- * * * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved + * Copyright 2009 Intel Corporation; author: H. Peter Anvin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -80,11 +81,8 @@ static ssize_t msr_read(struct file *file, char __user *buf, for (; count; count -= 8) { err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); - if (err) { - if (err == -EFAULT) /* Fix idiotic error code */ - err = -EIO; + if (err) break; - } if (copy_to_user(tmp, &data, 8)) { err = -EFAULT; break; @@ -115,11 +113,8 @@ static ssize_t msr_write(struct file *file, const char __user *buf, break; } err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); - if (err) { - if (err == -EFAULT) /* Fix idiotic error code */ - err = -EIO; + if (err) break; - } tmp += 2; bytes += 8; } @@ -127,6 +122,54 @@ static ssize_t msr_write(struct file *file, const char __user *buf, return bytes ? bytes : err; } +static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) +{ + u32 __user *uregs = (u32 __user *)arg; + u32 regs[8]; + int cpu = iminor(file->f_path.dentry->d_inode); + int err; + + switch (ioc) { + case X86_IOC_RDMSR_REGS: + if (!(file->f_mode & FMODE_READ)) { + err = -EBADF; + break; + } + if (copy_from_user(®s, uregs, sizeof regs)) { + err = -EFAULT; + break; + } + err = rdmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, ®s, sizeof regs)) + err = -EFAULT; + break; + + case X86_IOC_WRMSR_REGS: + if (!(file->f_mode & FMODE_WRITE)) { + err = -EBADF; + break; + } + if (copy_from_user(®s, uregs, sizeof regs)) { + err = -EFAULT; + break; + } + err = wrmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, ®s, sizeof regs)) + err = -EFAULT; + break; + + default: + err = -ENOTTY; + break; + } + + return err; +} + static int msr_open(struct inode *inode, struct file *file) { unsigned int cpu = iminor(file->f_path.dentry->d_inode); @@ -157,6 +200,8 @@ static const struct file_operations msr_fops = { .read = msr_read, .write = msr_write, .open = msr_open, + .unlocked_ioctl = msr_ioctl, + .compat_ioctl = msr_ioctl, }; static int __cpuinit msr_device_create(int cpu) @@ -196,6 +241,11 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; +static char *msr_nodename(struct device *dev) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); +} + static int __init msr_init(void) { int i, err = 0; @@ -212,6 +262,7 @@ static int __init msr_init(void) err = PTR_ERR(msr_class); goto out_chrdev; } + msr_class->nodename = msr_nodename; for_each_online_cpu(i) { err = msr_device_create(i); if (err != 0) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 70ec9b9..f5b0b4a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -362,8 +362,9 @@ struct pv_cpu_ops pv_cpu_ops = { #endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .read_msr_amd = native_read_msr_amd_safe, + .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = native_write_msr_safe, + .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .read_tscp = native_read_tscp, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 328592f..64b838e 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -3,6 +3,7 @@ #include <linux/dmar.h> #include <linux/bootmem.h> #include <linux/pci.h> +#include <linux/kmemleak.h> #include <asm/proto.h> #include <asm/dma.h> @@ -32,6 +33,15 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; +/* + * This variable becomes 1 if iommu=pt is passed on the kernel command line. + * If this variable is 1, IOMMU implementations do no DMA ranslation for + * devices and allow every device to access to whole physical memory. This is + * useful if a user want to use an IOMMU only for KVM device assignment to + * guests and not for driver dma translation. + */ +int iommu_pass_through __read_mostly; + dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -86,6 +96,11 @@ void __init dma32_reserve_bootmem(void) size = roundup(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(dma32_bootmem_ptr); if (dma32_bootmem_ptr) dma32_bootmem_size = size; else @@ -145,7 +160,7 @@ again: return NULL; addr = page_to_phys(page); - if (!is_buffer_dma_capable(dma_mask, addr, size)) { + if (addr + size > dma_mask) { __free_pages(page, get_order(size)); if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { @@ -210,6 +225,8 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "soft", 4)) swiotlb = 1; #endif + if (!strncmp(p, "pt", 2)) + iommu_pass_through = 1; gart_parse_options(p); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index cfd9f90..98a827e 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -190,14 +190,13 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { - return force_iommu || - !is_buffer_dma_capable(*dev->dma_mask, addr, size); + return force_iommu || !dma_capable(dev, addr, size); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - return !is_buffer_dma_capable(*dev->dma_mask, addr, size); + return !dma_capable(dev, addr, size); } /* Map a single continuous physical area into the IOMMU. @@ -675,7 +674,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" - KERN_WARNING "falling back to iommu=soft.\n"); + "falling back to iommu=soft.\n"); return -1; } diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 71d412a..a3933d4 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -14,7 +14,7 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { - if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { + if (hwdev && !dma_capable(hwdev, bus, size)) { if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) printk(KERN_ERR "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", @@ -79,12 +79,29 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +static void nommu_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + + +static void nommu_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + struct dma_map_ops nommu_dma_ops = { - .alloc_coherent = dma_generic_alloc_coherent, - .free_coherent = nommu_free_coherent, - .map_sg = nommu_map_sg, - .map_page = nommu_map_page, - .is_phys = 1, + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = nommu_free_coherent, + .map_sg = nommu_map_sg, + .map_page = nommu_map_page, + .sync_single_for_device = nommu_sync_single_for_device, + .sync_sg_for_device = nommu_sync_sg_for_device, + .is_phys = 1, }; void __init no_iommu_init(void) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index a1712f2..e8a3501 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -13,31 +13,6 @@ int swiotlb __read_mostly; -void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs) -{ - return alloc_bootmem_low_pages(size); -} - -void *swiotlb_alloc(unsigned order, unsigned long nslabs) -{ - return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); -} - -dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) -{ - return paddr; -} - -phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) -{ - return baddr; -} - -int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) -{ - return 0; -} - static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { @@ -71,7 +46,8 @@ void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) + if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || + iommu_pass_through) swiotlb = 1; #endif if (swiotlb_force) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3bb2be1..071166a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -63,7 +63,7 @@ void arch_task_cache_init(void) task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), - SLAB_PANIC, NULL); + SLAB_PANIC | SLAB_NOTRACK, NULL); } /* @@ -519,16 +519,12 @@ static void c1e_idle(void) if (!cpumask_test_cpu(cpu, c1e_mask)) { cpumask_set_cpu(cpu, c1e_mask); /* - * Force broadcast so ACPI can not interfere. Needs - * to run with interrupts enabled as it uses - * smp_function_call. + * Force broadcast so ACPI can not interfere. */ - local_irq_enable(); clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu); printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", cpu); - local_irq_disable(); } clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 59f4524..4cf7956 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -61,9 +61,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - /* * Return saved PC of a blocked thread. */ @@ -350,14 +347,21 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + bool preload_fpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - __unlazy_fpu(prev_p); + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + __unlazy_fpu(prev_p); /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -398,6 +402,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); + /* If we're going to preload the fpu context, make sure clts + is run while we're batching the cpu state updates. */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -407,15 +416,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() - */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); /* * Restore %gs if needed (which is common) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ebefb54..ad535b6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -55,9 +55,6 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); @@ -386,9 +383,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; + bool preload_fpu; + + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -419,6 +424,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); + /* Must be after DS reload */ + unlazy_fpu(prev_p); + + /* Make sure cpu is ready for new context */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -459,9 +471,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; - /* Must be after DS reload */ - unlazy_fpu(prev_p); - /* * Switch the PDA and FPU contexts. */ @@ -480,15 +489,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() + /* + * Preload the FPU context, now that we've determined that the + * task is likely to be using it. */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); return prev_p; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 09ecbde..8d7d5c9 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -35,10 +35,11 @@ #include <asm/proto.h> #include <asm/ds.h> -#include <trace/syscall.h> - #include "tls.h" +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + enum x86_regset { REGSET_GENERAL, REGSET_FP, @@ -1497,8 +1498,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) tracehook_report_syscall_entry(regs)) ret = -1L; - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_enter(regs); + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->orig_ax); if (unlikely(current->audit_context)) { if (IS_IA32) @@ -1523,8 +1524,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_exit(regs); + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->ax); if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 4f9c55f..03801f2 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -60,7 +60,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) "adc %5,%%edx ; " : "=A" (product), "=r" (tmp1), "=r" (tmp2) : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif __x86_64__ +#elif defined(__x86_64__) __asm__ ( "mul %%rdx ; shrd $32,%%rdx,%%rax" : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index af71d06..6c3b2c6 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) pci_read_config_dword(nb_ht, 0x60, &val); set_dev_node(&dev->dev, val & 7); - pci_dev_put(dev); + pci_dev_put(nb_ht); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d2d1ce8..27349f9 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -3,6 +3,8 @@ #include <linux/init.h> #include <linux/pm.h> #include <linux/efi.h> +#include <linux/dmi.h> +#include <linux/tboot.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> @@ -17,7 +19,6 @@ #include <asm/cpu.h> #ifdef CONFIG_X86_32 -# include <linux/dmi.h> # include <linux/ctype.h> # include <linux/mc146818rtc.h> #else @@ -249,6 +250,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), }, }, + { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */ + .callback = set_bios_reboot, + .ident = "CompuLab SBC-FITPC2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"), + DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), + }, + }, { } }; @@ -396,6 +405,46 @@ EXPORT_SYMBOL(machine_real_restart); #endif /* CONFIG_X86_32 */ +/* + * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot + */ +static int __init set_pci_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_CF9) { + reboot_type = BOOT_CF9; + printk(KERN_INFO "%s series board detected. " + "Selecting PCI-method for reboots.\n", d->ident); + } + return 0; +} + +static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { + { /* Handle problems with rebooting on Apple MacBook5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook5", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), + }, + }, + { /* Handle problems with rebooting on Apple MacBookPro5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBookPro5", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), + }, + }, + { } +}; + +static int __init pci_reboot_init(void) +{ + dmi_check_system(pci_reboot_dmi_table); + return 0; +} +core_initcall(pci_reboot_init); + static inline void kb_wait(void) { int i; @@ -460,6 +509,8 @@ static void native_machine_emergency_restart(void) if (reboot_emergency) emergency_vmx_disable_all(); + tboot_shutdown(TB_SHUTDOWN_REBOOT); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -586,6 +637,8 @@ static void native_machine_halt(void) /* stop other cpus and apics */ machine_shutdown(); + tboot_shutdown(TB_SHUTDOWN_HALT); + /* stop this cpu */ stop_this_cpu(NULL); } @@ -597,6 +650,8 @@ static void native_machine_power_off(void) machine_shutdown(); pm_power_off(); } + /* a fallback in case there is no PM info available */ + tboot_shutdown(TB_SHUTDOWN_HALT); } struct machine_ops machine_ops = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index be5ae80..19f15c4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -66,6 +66,7 @@ #include <linux/percpu.h> #include <linux/crash_dump.h> +#include <linux/tboot.h> #include <video/edid.h> @@ -289,6 +290,20 @@ void * __init extend_brk(size_t size, size_t align) return ret; } +#ifdef CONFIG_X86_64 +static void __init init_gbpages(void) +{ + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +} +#else +static inline void init_gbpages(void) +{ +} +#endif + static void __init reserve_brk(void) { if (_brk_end > _brk_start) @@ -658,6 +673,19 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), }, }, + { + /* + * AMI BIOS with low memory corruption was found on Intel DG45ID board. + * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will + * match only DMI_BOARD_NAME and see if there is more bad products + * with this vendor. + */ + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), + }, + }, #endif {} }; @@ -684,6 +712,21 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + +#ifdef CONFIG_X86_64 + /* + * Must call this twice: Once just to detect whether hardware doesn't + * support NX (so that the early EHCI debug console setup can safely + * call set_fixmap(), and then again after parsing early parameters to + * honor the respective command line option. + */ + check_efer(); +#endif + + parse_early_param(); + /* VMI may relocate the fixmap; do this before touching ioremap area */ vmi_init(); @@ -766,11 +809,6 @@ void __init setup_arch(char **cmdline_p) #endif #endif - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - - parse_early_param(); - #ifdef CONFIG_X86_64 check_efer(); #endif @@ -871,6 +909,8 @@ void __init setup_arch(char **cmdline_p) reserve_brk(); + init_gbpages(); + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); max_pfn_mapped = max_low_pfn_mapped; @@ -948,6 +988,8 @@ void __init setup_arch(char **cmdline_p) paravirt_pagetable_setup_done(swapper_pg_dir); paravirt_post_allocator_init(); + tboot_probe(); + #ifdef CONFIG_X86_64 map_vsyscall(); #endif diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9c3f082..d559af9 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset); #define PERCPU_FIRST_CHUNK_RESERVE 0 #endif +#ifdef CONFIG_X86_32 /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void) #endif return false; } +#endif /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu @@ -124,215 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } /* - * Remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. + * Helpers for first chunk memory allocation */ -#ifdef CONFIG_NEED_MULTIPLE_NODES -static size_t pcpur_size __initdata; -static void **pcpur_ptrs __initdata; - -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpur_size) - return NULL; - - return virt_to_page(pcpur_ptrs[cpu] + off); + return pcpu_alloc_bootmem(cpu, size, align); } -static ssize_t __init setup_pcpu_remap(size_t static_size) -{ - static struct vm_struct vm; - size_t ptrs_size, dyn_size; - unsigned int cpu; - ssize_t ret; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, on non-NUMA, embedding is better. - * - * NOTE: disabled for now. - */ - if (true || !cpu_has_pse || !pcpu_need_numa()) - return -EINVAL; - - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - if (pcpur_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); - - for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpur_ptrs[cpu]) - goto enomem; - - /* - * Only use pcpur_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PMD_SIZE - pcpur_size); - - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); - } - - /* allocate address and map */ - vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd; - - pmd = populate_extra_pmd((unsigned long)vm.addr - + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), - PAGE_KERNEL_LARGE)); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, vm.addr, NULL); - goto out_free_ar; - -enomem: - for_each_possible_cpu(cpu) - if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpur_ptrs), ptrs_size); - return ret; -} -#else -static ssize_t __init setup_pcpu_remap(size_t static_size) +static void __init pcpu_fc_free(void *ptr, size_t size) { - return -EINVAL; + free_bootmem(__pa(ptr), size); } -#endif -/* - * Embedding allocator - * - * The first chunk is sized to just contain the static area plus - * module and dynamic reserves and embedded into linear physical - * mapping so that it can use PMD mapping without additional TLB - * pressure. - */ -static ssize_t __init setup_pcpu_embed(size_t static_size) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, embedding allocation doesn't play well with - * NUMA. - */ - if (!cpu_has_pse || pcpu_need_numa()) - return -EINVAL; - - return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); -} - -/* - * 4k page allocator - * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. - */ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; +#ifdef CONFIG_NEED_MULTIPLE_NODES + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; +#else + return LOCAL_DISTANCE; +#endif } -static void __init pcpu4k_populate_pte(unsigned long addr) +static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_4k(size_t static_size) -{ - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) - goto enomem; - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, - -1, NULL, pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; -} - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -346,42 +168,51 @@ static inline void setup_percpu_segment(int cpu) #endif } -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. - * Always point %gs to its beginning - */ void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; unsigned int cpu; unsigned long delta; - size_t pcpu_unit_size; - ssize_t ret; + int rc; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* - * Allocate percpu area. If PSE is supported, try to make use - * of large page mappings. Please read comments on top of - * each allocator for details. + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. */ - ret = setup_pcpu_remap(static_size); - if (ret < 0) - ret = setup_pcpu_embed(static_size); - if (ret < 0) - ret = setup_pcpu_4k(static_size); - if (ret < 0) - panic("cannot allocate static percpu area (%zu bytes, err=%zd)", - static_size, ret); - - pcpu_unit_size = ret; +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + rc = -EINVAL; + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE; + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); + if (rc < 0) + pr_warning("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); + } + if (rc < 0) + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cc26ad4c..6a44a76 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -869,6 +869,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda6..a25eeec 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -47,6 +47,7 @@ #include <linux/bootmem.h> #include <linux/err.h> #include <linux/nmi.h> +#include <linux/tboot.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -434,7 +435,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu) * For perf, we return last level cache shared map. * And for power savings, we return cpu_core_map */ - if (sched_mc_power_savings || sched_smt_power_savings) + if ((sched_mc_power_savings || sched_smt_power_savings) && + !(cpu_has(c, X86_FEATURE_AMD_DCM))) return cpu_core_mask(cpu); else return c->llc_shared_map; @@ -1116,9 +1118,22 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (is_uv_system()) uv_system_init(); + + set_mtrr_aps_delayed_init(); out: preempt_enable(); } + +void arch_enable_nonboot_cpus_begin(void) +{ + set_mtrr_aps_delayed_init(); +} + +void arch_enable_nonboot_cpus_end(void) +{ + mtrr_aps_init(); +} + /* * Early setup to make printk work. */ @@ -1140,6 +1155,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif check_nmi_watchdog(); + mtrr_aps_init(); } static int __initdata setup_possible_cpus = -1; @@ -1317,6 +1333,7 @@ void play_dead_common(void) void native_play_dead(void) { play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); wbinvd_halt(); } diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 4aaf7e4..c3eb207 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace); +void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) +{ + dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index e8b9863..3149032 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -4,6 +4,7 @@ #include <linux/sched.h> #include <linux/mm.h> #include <linux/ptrace.h> +#include <asm/desc.h> unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) { @@ -23,7 +24,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re * and APM bios ones we just ignore here. */ if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { - u32 *desc; + struct desc_struct *desc; unsigned long base; seg &= ~7UL; @@ -33,12 +34,10 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re addr = -1L; /* bogus selector, access would fault */ else { desc = child->mm->context.ldt + seg; - base = ((desc[0] >> 16) | - ((desc[1] & 0xff) << 16) | - (desc[1] & 0xff000000)); + base = get_desc_base(desc); /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) + if (!desc->d) addr &= 0xffff; addr += base; } diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 6bc211a..45e00eb 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -18,9 +18,9 @@ #include <asm/ia32.h> #include <asm/syscalls.h> -asmlinkage long sys_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off) +SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, off) { long error; struct file *file; @@ -226,7 +226,7 @@ bottomup: } -asmlinkage long sys_uname(struct new_utsname __user *name) +SYSCALL_DEFINE1(uname, struct new_utsname __user *, name) { int err; down_read(&uts_sem); diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c new file mode 100644 index 0000000..86c9f91 --- /dev/null +++ b/arch/x86/kernel/tboot.c @@ -0,0 +1,447 @@ +/* + * tboot.c: main implementation of helper functions used by kernel for + * runtime support of Intel(R) Trusted Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <linux/dma_remapping.h> +#include <linux/init_task.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/dmar.h> +#include <linux/cpu.h> +#include <linux/pfn.h> +#include <linux/mm.h> +#include <linux/tboot.h> + +#include <asm/trampoline.h> +#include <asm/processor.h> +#include <asm/bootparam.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/fixmap.h> +#include <asm/proto.h> +#include <asm/setup.h> +#include <asm/e820.h> +#include <asm/io.h> + +#include "acpi/realmode/wakeup.h" + +/* Global pointer to shared data; NULL means no measured launch. */ +struct tboot *tboot __read_mostly; + +/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ +#define AP_WAIT_TIMEOUT 1 + +#undef pr_fmt +#define pr_fmt(fmt) "tboot: " fmt + +static u8 tboot_uuid[16] __initdata = TBOOT_UUID; + +void __init tboot_probe(void) +{ + /* Look for valid page-aligned address for shared page. */ + if (!boot_params.tboot_addr) + return; + /* + * also verify that it is mapped as we expect it before calling + * set_fixmap(), to reduce chance of garbage value causing crash + */ + if (!e820_any_mapped(boot_params.tboot_addr, + boot_params.tboot_addr, E820_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + return; + } + + /* only a natively booted kernel should be using TXT */ + if (paravirt_enabled()) { + pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); + return; + } + + /* Map and check for tboot UUID. */ + set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); + tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warning("tboot at 0x%llx is invalid\n", + boot_params.tboot_addr); + tboot = NULL; + return; + } + if (tboot->version < 5) { + pr_warning("tboot version is invalid: %u\n", tboot->version); + tboot = NULL; + return; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); +} + +static pgd_t *tboot_pg_dir; +static struct mm_struct tboot_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; + +static inline void switch_to_tboot_pt(void) +{ + write_cr3(virt_to_phys(tboot_pg_dir)); +} + +static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(&tboot_mm, vaddr); + pud = pud_alloc(&tboot_mm, pgd, vaddr); + if (!pud) + return -1; + pmd = pmd_alloc(&tboot_mm, pud, vaddr); + if (!pmd) + return -1; + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + if (!pte) + return -1; + set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); + pte_unmap(pte); + return 0; +} + +static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn, + unsigned long nr) +{ + /* Reuse the original kernel mapping */ + tboot_pg_dir = pgd_alloc(&tboot_mm); + if (!tboot_pg_dir) + return -1; + + for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) { + if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC)) + return -1; + } + + return 0; +} + +static void tboot_create_trampoline(void) +{ + u32 map_base, map_size; + + /* Create identity map for tboot shutdown code. */ + map_base = PFN_DOWN(tboot->tboot_base); + map_size = PFN_UP(tboot->tboot_size); + if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", + map_base, map_size); +} + +#ifdef CONFIG_ACPI_SLEEP + +static void add_mac_region(phys_addr_t start, unsigned long size) +{ + struct tboot_mac_region *mr; + phys_addr_t end = start + size; + + if (start && size) { + mr = &tboot->mac_regions[tboot->num_mac_regions++]; + mr->start = round_down(start, PAGE_SIZE); + mr->size = round_up(end, PAGE_SIZE) - mr->start; + } +} + +static int tboot_setup_sleep(void) +{ + tboot->num_mac_regions = 0; + + /* S3 resume code */ + add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); + +#ifdef CONFIG_X86_TRAMPOLINE + /* AP trampoline code */ + add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); +#endif + + /* kernel code + data + bss */ + add_mac_region(virt_to_phys(_text), _end - _text); + + tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + + return 0; +} + +#else /* no CONFIG_ACPI_SLEEP */ + +static int tboot_setup_sleep(void) +{ + /* S3 shutdown requested, but S3 not supported by the kernel... */ + BUG(); + return -1; +} + +#endif + +void tboot_shutdown(u32 shutdown_type) +{ + void (*shutdown)(void); + + if (!tboot_enabled()) + return; + + /* + * if we're being called before the 1:1 mapping is set up then just + * return and let the normal shutdown happen; this should only be + * due to very early panic() + */ + if (!tboot_pg_dir) + return; + + /* if this is S3 then set regions to MAC */ + if (shutdown_type == TB_SHUTDOWN_S3) + if (tboot_setup_sleep()) + return; + + tboot->shutdown_type = shutdown_type; + + switch_to_tboot_pt(); + + shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry; + shutdown(); + + /* should not reach here */ + while (1) + halt(); +} + +static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) +{ +#define TB_COPY_GAS(tbg, g) \ + tbg.space_id = g.space_id; \ + tbg.bit_width = g.bit_width; \ + tbg.bit_offset = g.bit_offset; \ + tbg.access_width = g.access_width; \ + tbg.address = g.address; + + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block); + + /* + * We need phys addr of waking vector, but can't use virt_to_phys() on + * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys + * addr. + */ + tboot->acpi_sinfo.wakeup_vector = fadt->facs + + offsetof(struct acpi_table_facs, firmware_waking_vector); +} + +void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) +{ + static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { + /* S0,1,2: */ -1, -1, -1, + /* S3: */ TB_SHUTDOWN_S3, + /* S4: */ TB_SHUTDOWN_S4, + /* S5: */ TB_SHUTDOWN_S5 }; + + if (!tboot_enabled()) + return; + + tboot_copy_fadt(&acpi_gbl_FADT); + tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; + tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; + /* we always use the 32b wakeup vector */ + tboot->acpi_sinfo.vector_width = 32; + + if (sleep_state >= ACPI_S_STATE_COUNT || + acpi_shutdown_map[sleep_state] == -1) { + pr_warning("unsupported sleep state 0x%x\n", sleep_state); + return; + } + + tboot_shutdown(acpi_shutdown_map[sleep_state]); +} + +static atomic_t ap_wfs_count; + +static int tboot_wait_for_aps(int num_aps) +{ + unsigned long timeout; + + timeout = AP_WAIT_TIMEOUT*HZ; + while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps && + timeout) { + mdelay(1); + timeout--; + } + + if (timeout) + pr_warning("tboot wait for APs timeout\n"); + + return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); +} + +static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_DYING: + atomic_inc(&ap_wfs_count); + if (num_online_cpus() == 1) + if (tboot_wait_for_aps(atomic_read(&ap_wfs_count))) + return NOTIFY_BAD; + break; + } + return NOTIFY_OK; +} + +static struct notifier_block tboot_cpu_notifier __cpuinitdata = +{ + .notifier_call = tboot_cpu_callback, +}; + +static __init int tboot_late_init(void) +{ + if (!tboot_enabled()) + return 0; + + tboot_create_trampoline(); + + atomic_set(&ap_wfs_count, 0); + register_hotcpu_notifier(&tboot_cpu_notifier); + return 0; +} + +late_initcall(tboot_late_init); + +/* + * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE) + */ + +#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000 +#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000 + +/* # pages for each config regs space - used by fixmap */ +#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \ + TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT) + +/* offsets from pub/priv config space */ +#define TXTCR_HEAP_BASE 0x0300 +#define TXTCR_HEAP_SIZE 0x0308 + +#define SHA1_SIZE 20 + +struct sha1_hash { + u8 hash[SHA1_SIZE]; +}; + +struct sinit_mle_data { + u32 version; /* currently 6 */ + struct sha1_hash bios_acm_id; + u32 edx_senter_flags; + u64 mseg_valid; + struct sha1_hash sinit_hash; + struct sha1_hash mle_hash; + struct sha1_hash stm_hash; + struct sha1_hash lcp_policy_hash; + u32 lcp_policy_control; + u32 rlp_wakeup_addr; + u32 reserved; + u32 num_mdrs; + u32 mdrs_off; + u32 num_vtd_dmars; + u32 vtd_dmars_off; +} __packed; + +struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl) +{ + void *heap_base, *heap_ptr, *config; + + if (!tboot_enabled()) + return dmar_tbl; + + /* + * ACPI tables may not be DMA protected by tboot, so use DMAR copy + * SINIT saved in SinitMleData in TXT heap (which is DMA protected) + */ + + /* map config space in order to get heap addr */ + config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES * + PAGE_SIZE); + if (!config) + return NULL; + + /* now map TXT heap */ + heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE), + *(u64 *)(config + TXTCR_HEAP_SIZE)); + iounmap(config); + if (!heap_base) + return NULL; + + /* walk heap to SinitMleData */ + /* skip BiosData */ + heap_ptr = heap_base + *(u64 *)heap_base; + /* skip OsMleData */ + heap_ptr += *(u64 *)heap_ptr; + /* skip OsSinitData */ + heap_ptr += *(u64 *)heap_ptr; + /* now points to SinitMleDataSize; set to SinitMleData */ + heap_ptr += sizeof(u64); + /* get addr of DMAR table */ + dmar_tbl = (struct acpi_table_header *)(heap_ptr + + ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off - + sizeof(u64)); + + /* don't unmap heap because dmar.c needs access to this */ + + return dmar_tbl; +} + +int tboot_force_iommu(void) +{ + if (!tboot_enabled()) + return 0; + + if (no_iommu || swiotlb || dmar_disabled) + pr_warning("Forcing Intel-IOMMU to enabled\n"); + + dmar_disabled = 0; +#ifdef CONFIG_SWIOTLB + swiotlb = 0; +#endif + no_iommu = 0; + + return 1; +} diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 124d40c..503c1f2 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -640,13 +640,13 @@ static int __init uv_ptc_init(void) if (!is_uv_system()) return 0; - proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); + proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL, + &proc_uv_ptc_operations); if (!proc_uv_ptc) { printk(KERN_ERR "unable to create %s proc entry\n", UV_PTC_BASENAME); return -EINVAL; } - proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; return 0; } @@ -711,7 +711,6 @@ uv_activation_descriptor_init(int node, int pnode) unsigned long pa; unsigned long m; unsigned long n; - unsigned long mmr_image; struct bau_desc *adp; struct bau_desc *ad2; @@ -727,12 +726,8 @@ uv_activation_descriptor_init(int node, int pnode) n = pa >> uv_nshift; m = pa & uv_mmask; - mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); - if (mmr_image) { - uv_write_global_mmr64(pnode, (unsigned long) - UVH_LB_BAU_SB_DESCRIPTOR_BASE, - (n << UV_DESC_BASE_PNODE_SHIFT | m)); - } + uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, + (n << UV_DESC_BASE_PNODE_SHIFT | m)); /* * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each @@ -749,6 +744,7 @@ uv_activation_descriptor_init(int node, int pnode) * note that base_dest_nodeid is actually a nasid. */ ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; + ad2->header.dest_subnodeid = 0x10; /* the LB */ ad2->header.command = UV_NET_ENDPOINT_INTD; ad2->header.int_both = 1; /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 71f9c74..8326492 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -45,6 +45,7 @@ #include <linux/edac.h> #endif +#include <asm/kmemcheck.h> #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> @@ -75,7 +76,7 @@ char ignore_fpu_irq; * F0 0F bug workaround.. We have a special link segment * for this. */ -gate_desc idt_table[256] +gate_desc idt_table[NR_VECTORS] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; #endif @@ -345,6 +346,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); + if (panic_on_io_nmi) + panic("NMI IOCK error: Not continuing"); + /* Re-enable the IOCK line, wait for a few seconds */ reason = (reason & 0xf) | 8; outb(reason, 0x61); @@ -533,6 +537,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) get_debugreg(condition, 6); + /* Catch kmemcheck conditions first of all! */ + if (condition & DR_STEP && kmemcheck_trap(regs)) + return; + /* * The processor cleared BTF, so don't mark that we need it set. */ @@ -778,33 +786,34 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) #endif } -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { - struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); - unsigned long base = (kesp - uesp) & -THREAD_SIZE; - unsigned long new_kesp = kesp - base; - unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; - __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; - - /* Set up base for espfix segment */ - desc &= 0x00f0ff0000000000ULL; - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)base) << 32) & 0xff00000000000000ULL) | - ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | - (lim_pages & 0xffff); - *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; - - return new_kesp; } -#endif -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } -asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) +/* + * __math_state_restore assumes that cr0.TS is already clear and the + * fpu state is all ready for use. Used during context switch. + */ +void __math_state_restore(void) { + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; + + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } + + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; } /* @@ -838,17 +847,8 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; + __math_state_restore(); } EXPORT_SYMBOL_GPL(math_state_restore); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3e1c057..71f4368 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -9,6 +9,7 @@ #include <linux/delay.h> #include <linux/clocksource.h> #include <linux/percpu.h> +#include <linux/timex.h> #include <asm/hpet.h> #include <asm/timer.h> @@ -274,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) * use the TSC value at the transitions to calculate a pretty * good value for the TSC frequencty. */ +static inline int pit_verify_msb(unsigned char val) +{ + /* Ignore LSB */ + inb(0x42); + return inb(0x42) == val; +} + static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; u64 tsc = 0; for (count = 0; count < 50000; count++) { - /* Ignore LSB */ - inb(0x42); - if (inb(0x42) != val) + if (!pit_verify_msb(val)) break; tsc = get_cycles(); } @@ -335,8 +341,7 @@ static unsigned long quick_pit_calibrate(void) * to do that is to just read back the 16-bit counter * once from the PIT. */ - inb(0x42); - inb(0x42); + pit_verify_msb(0); if (pit_expect_msb(0xff, &tsc, &d1)) { for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { @@ -347,8 +352,19 @@ static unsigned long quick_pit_calibrate(void) * Iterate until the error is less than 500 ppm */ delta -= tsc; - if (d1+d2 < delta >> 11) - goto success; + if (d1+d2 >= delta >> 11) + continue; + + /* + * Check the PIT one more time to verify that + * all TSC reads were stable wrt the PIT. + * + * This also guarantees serialization of the + * last cycle read ('d2') in pit_expect_msb. + */ + if (!pit_verify_msb(0xfe - i)) + break; + goto success; } } printk("Fast TSC calibration failed\n"); @@ -589,22 +605,26 @@ EXPORT_SYMBOL(recalibrate_cpu_khz); */ DEFINE_PER_CPU(unsigned long, cyc2ns); +DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - unsigned long long tsc_now, ns_now; + unsigned long long tsc_now, ns_now, *offset; unsigned long flags, *scale; local_irq_save(flags); sched_clock_idle_sleep_event(); scale = &per_cpu(cyc2ns, cpu); + offset = &per_cpu(cyc2ns_offset, cpu); rdtscll(tsc_now); ns_now = __cycles_2_ns(tsc_now); - if (cpu_khz) + if (cpu_khz) { *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); + } sched_clock_idle_wakeup_event(0); local_irq_restore(flags); @@ -631,17 +651,15 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - unsigned long *lpj, dummy; + unsigned long *lpj; if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) return 0; - lpj = &dummy; - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + lpj = &boot_cpu_data.loops_per_jiffy; #ifdef CONFIG_SMP + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) lpj = &cpu_data(freq->cpu).loops_per_jiffy; -#else - lpj = &boot_cpu_data.loops_per_jiffy; #endif if (!ref_freq) { diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b263423..95a7289 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, ap.ds = __USER_DS; ap.es = __USER_DS; ap.fs = __KERNEL_PERCPU; - ap.gs = 0; + ap.gs = __KERNEL_STACK_CANARY; ap.eflags = 0; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 367e878..0ccb57d 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -46,11 +46,10 @@ PHDRS { data PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_X86_64 user PT_LOAD FLAGS(7); /* RWE */ - data.init PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_SMP percpu PT_LOAD FLAGS(7); /* RWE */ #endif - data.init2 PT_LOAD FLAGS(7); /* RWE */ + init PT_LOAD FLAGS(7); /* RWE */ #endif note PT_NOTE FLAGS(0); /* ___ */ } @@ -103,72 +102,43 @@ SECTIONS __stop___ex_table = .; } :text = 0x9090 - RODATA + RO_DATA(PAGE_SIZE) /* Data */ - . = ALIGN(PAGE_SIZE); .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Start of data section */ _sdata = .; - DATA_DATA - CONSTRUCTORS -#ifdef CONFIG_X86_64 - /* End of data section */ - _edata = .; -#endif - } :data + /* init_task */ + INIT_TASK_DATA(THREAD_SIZE) #ifdef CONFIG_X86_32 - /* 32 bit has nosave before _edata */ - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } + /* 32 bit has nosave before _edata */ + NOSAVE_DATA #endif - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) + PAGE_ALIGNED_DATA(PAGE_SIZE) *(.data.idt) - } -#ifdef CONFIG_X86_32 - . = ALIGN(32); -#else - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); -#endif - .data.cacheline_aligned : - AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } + CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) - /* rarely changed data like cpu maps */ -#ifdef CONFIG_X86_32 - . = ALIGN(32); -#else - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); -#endif - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) + DATA_DATA + CONSTRUCTORS + + /* rarely changed data like cpu maps */ + READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) -#ifdef CONFIG_X86_32 /* End of data section */ _edata = .; -#endif - } + } :data #ifdef CONFIG_X86_64 #define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \ + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \ + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) @@ -234,35 +204,29 @@ SECTIONS #endif /* CONFIG_X86_64 */ - /* init_task */ - . = ALIGN(THREAD_SIZE); - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ } -#ifdef CONFIG_X86_64 - :data.init -#endif +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* - * smp_locks might be freed after init - * start/end must be page aligned + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - .init.text - should + * start another segment - init. */ - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - } + PERCPU_VADDR(0, :percpu) +#endif - /* Init code and data - will be freed after init */ - . = ALIGN(PAGE_SIZE); .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; /* paired with __init_end */ _sinittext = .; INIT_TEXT _einittext = .; } +#ifdef CONFIG_X86_64 + :init +#endif .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { INIT_DATA @@ -333,17 +297,7 @@ SECTIONS } #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) -#else +#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) PERCPU(PAGE_SIZE) #endif @@ -354,15 +308,22 @@ SECTIONS __init_end = .; } + /* + * smp_locks might be freed after init + * start/end must be page aligned + */ + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + } + #ifdef CONFIG_X86_64 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 - /* use another section data.init2, see PERCPU_VADDR() above */ + NOSAVE_DATA + } #endif /* BSS */ @@ -387,21 +348,18 @@ SECTIONS _end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.eh_frame) } } #ifdef CONFIG_X86_32 -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") +. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #else /* * Per-cpu symbols which need to be offset from __per_cpu_load @@ -414,12 +372,12 @@ INIT_PER_CPU(irq_stack_union); /* * Build-time check on the image size: */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") +. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +. = ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ @@ -427,7 +385,7 @@ ASSERT((per_cpu__irq_stack_union == 0), #ifdef CONFIG_KEXEC #include <asm/kexec.h> -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") +. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big"); #endif |