diff options
Diffstat (limited to 'arch/x86/kernel')
50 files changed, 1059 insertions, 2338 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cb648c8..f4d9600 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,7 +26,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-y += syscall_$(BITS).o +obj-y += syscall_$(BITS).o vsyscall_gtod.o obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-$(CONFIG_SYSFS) += ksysfs.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 1dac942..86281ff 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -53,10 +53,6 @@ EXPORT_SYMBOL(acpi_disabled); # include <asm/proto.h> #endif /* X86 */ -#define BAD_MADT_ENTRY(entry, end) ( \ - (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ - ((struct acpi_subtable_header *)entry)->length < sizeof(*entry)) - #define PREFIX "ACPI: " int acpi_noirq; /* skip ACPI IRQ initialization */ @@ -613,10 +609,10 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) int nid; nid = acpi_get_node(handle); - if (nid == -1 || !node_online(nid)) - return; - set_apicid_to_node(physid, nid); - numa_set_node(cpu, nid); + if (nid != -1) { + set_apicid_to_node(physid, nid); + numa_set_node(cpu, nid); + } #endif } @@ -907,10 +903,6 @@ static int __init acpi_parse_madt_lapic_entries(void) #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 -#ifdef CONFIG_X86_ES7000 -extern int es7000_plat; -#endif - void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { int ioapic; @@ -960,14 +952,6 @@ void __init mp_config_acpi_legacy_irqs(void) set_bit(MP_ISA_BUS, mp_bus_not_pci); pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); -#ifdef CONFIG_X86_ES7000 - /* - * Older generations of ES7000 have no legacy identity mappings - */ - if (es7000_plat == 1) - return; -#endif - /* * Use the default configuration for the IRQs 0-15. Unless * overridden by (MADT) interrupt source override entries. diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index dec8de4..f04dbb3 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -22,6 +22,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, {} }; EXPORT_SYMBOL(amd_nb_misc_ids); @@ -30,6 +31,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, {} }; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index fd972a3..9fa8aa0 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -18,7 +18,6 @@ #include <linux/pci_ids.h> #include <linux/pci.h> #include <linux/bitops.h> -#include <linux/ioport.h> #include <linux/suspend.h> #include <asm/e820.h> #include <asm/io.h> @@ -54,18 +53,6 @@ int fallback_aper_force __initdata; int fix_aperture __initdata = 1; -static struct resource gart_resource = { - .name = "GART", - .flags = IORESOURCE_MEM, -}; - -static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) -{ - gart_resource.start = aper_base; - gart_resource.end = aper_base + aper_size - 1; - insert_resource(&iomem_resource, &gart_resource); -} - /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ @@ -96,7 +83,6 @@ static u32 __init allocate_aperture(void) memblock_reserve(addr, aper_size); printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, addr); - insert_aperture_resource((u32)addr, aper_size); register_nosave_region(addr >> PAGE_SHIFT, (addr+aper_size) >> PAGE_SHIFT); @@ -444,12 +430,8 @@ int __init gart_iommu_hole_init(void) out: if (!fix && !fallback_aper_force) { - if (last_aper_base) { - unsigned long n = (32 * 1024 * 1024) << last_aper_order; - - insert_aperture_resource((u32)last_aper_base, n); + if (last_aper_base) return 1; - } return 0; } diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 0ae0323..dcb5b15 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -18,10 +18,7 @@ obj-y += apic_flat_64.o endif # APIC probe will depend on the listing order here -obj-$(CONFIG_X86_NUMAQ) += numaq_32.o -obj-$(CONFIG_X86_SUMMIT) += summit_32.o obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o -obj-$(CONFIG_X86_ES7000) += es7000_32.o # For 32bit, probe_32 need to be listed last obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7f26c9a..481ae38 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -133,6 +133,10 @@ static inline void imcr_apic_to_pic(void) * +1=force-enable */ static int force_enable_local_apic __initdata; + +/* Control whether x2APIC mode is enabled or not */ +static bool nox2apic __initdata; + /* * APIC command line parameters */ @@ -162,8 +166,7 @@ int x2apic_mode; /* x2apic enabled before OS handover */ int x2apic_preenabled; static int x2apic_disabled; -static int nox2apic; -static __init int setup_nox2apic(char *str) +static int __init setup_nox2apic(char *str) { if (x2apic_enabled()) { int apicid = native_apic_msr_read(APIC_ID); @@ -178,7 +181,7 @@ static __init int setup_nox2apic(char *str) } else setup_clear_cpu_cap(X86_FEATURE_X2APIC); - nox2apic = 1; + nox2apic = true; return 0; } @@ -283,8 +286,12 @@ u32 native_safe_apic_wait_icr_idle(void) void native_apic_icr_write(u32 low, u32 id) { + unsigned long flags; + + local_irq_save(flags); apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); apic_write(APIC_ICR, low); + local_irq_restore(flags); } u64 native_apic_icr_read(void) @@ -2129,7 +2136,6 @@ int generic_processor_info(int apicid, int version) * * - arch/x86/kernel/mpparse.c: MP_processor_info() * - arch/x86/mm/amdtopology.c: amd_numa_init() - * - arch/x86/platform/visws/visws_quirks.c: MP_processor_info() * * This function is executed with the modified * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 2c621a6..7c1b294 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -198,7 +198,7 @@ static struct apic apic_flat = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, @@ -314,7 +314,7 @@ static struct apic apic_physflat = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 191ce75..8c7c982 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -172,8 +172,7 @@ struct apic apic_noop = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, - + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 3e67f9e..a5b45df 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -248,7 +248,7 @@ static const struct apic apic_numachip __refconst = { .wakeup_secondary_cpu = numachip_wakeup_secondary, .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, /* REMRD not supported */ diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index d50e364..e4840aa 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -199,8 +199,7 @@ static struct apic apic_bigsmp = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = default_wait_for_init_deassert, - + .wait_for_init_deassert = true, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c deleted file mode 100644 index c552247..0000000 --- a/arch/x86/kernel/apic/es7000_32.c +++ /dev/null @@ -1,746 +0,0 @@ -/* - * Written by: Garry Forsgren, Unisys Corporation - * Natalie Protasevich, Unisys Corporation - * - * This file contains the code to configure and interface - * with Unisys ES7000 series hardware system manager. - * - * Copyright (c) 2003 Unisys Corporation. - * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar - * - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Unisys Corporation, Township Line & Union Meeting - * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: - * - * http://www.unisys.com - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/notifier.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> -#include <linux/threads.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/reboot.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/acpi.h> -#include <linux/init.h> -#include <linux/gfp.h> -#include <linux/nmi.h> -#include <linux/smp.h> -#include <linux/io.h> - -#include <asm/apicdef.h> -#include <linux/atomic.h> -#include <asm/fixmap.h> -#include <asm/mpspec.h> -#include <asm/setup.h> -#include <asm/apic.h> -#include <asm/ipi.h> - -/* - * ES7000 chipsets - */ - -#define NON_UNISYS 0 -#define ES7000_CLASSIC 1 -#define ES7000_ZORRO 2 - -#define MIP_REG 1 -#define MIP_PSAI_REG 4 - -#define MIP_BUSY 1 -#define MIP_SPIN 0xf0000 -#define MIP_VALID 0x0100000000000000ULL -#define MIP_SW_APIC 0x1020b - -#define MIP_PORT(val) ((val >> 32) & 0xffff) - -#define MIP_RD_LO(val) (val & 0xffffffff) - -struct mip_reg { - unsigned long long off_0x00; - unsigned long long off_0x08; - unsigned long long off_0x10; - unsigned long long off_0x18; - unsigned long long off_0x20; - unsigned long long off_0x28; - unsigned long long off_0x30; - unsigned long long off_0x38; -}; - -struct mip_reg_info { - unsigned long long mip_info; - unsigned long long delivery_info; - unsigned long long host_reg; - unsigned long long mip_reg; -}; - -struct psai { - unsigned long long entry_type; - unsigned long long addr; - unsigned long long bep_addr; -}; - -#ifdef CONFIG_ACPI - -struct es7000_oem_table { - struct acpi_table_header Header; - u32 OEMTableAddr; - u32 OEMTableSize; -}; - -static unsigned long oem_addrX; -static unsigned long oem_size; - -#endif - -/* - * ES7000 Globals - */ - -static volatile unsigned long *psai; -static struct mip_reg *mip_reg; -static struct mip_reg *host_reg; -static int mip_port; -static unsigned long mip_addr; -static unsigned long host_addr; - -int es7000_plat; - -/* - * GSI override for ES7000 platforms. - */ - - -static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) -{ - unsigned long vect = 0, psaival = 0; - - if (psai == NULL) - return -1; - - vect = ((unsigned long)__pa(eip)/0x1000) << 16; - psaival = (0x1000000 | vect | cpu); - - while (*psai & 0x1000000) - ; - - *psai = psaival; - - return 0; -} - -static int es7000_apic_is_cluster(void) -{ - /* MPENTIUMIII */ - if (boot_cpu_data.x86 == 6 && - (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11)) - return 1; - - return 0; -} - -static void setup_unisys(void) -{ - /* - * Determine the generation of the ES7000 currently running. - * - * es7000_plat = 1 if the machine is a 5xx ES7000 box - * es7000_plat = 2 if the machine is a x86_64 ES7000 box - * - */ - if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) - es7000_plat = ES7000_ZORRO; - else - es7000_plat = ES7000_CLASSIC; -} - -/* - * Parse the OEM Table: - */ -static int parse_unisys_oem(char *oemptr) -{ - int i; - int success = 0; - unsigned char type, size; - unsigned long val; - char *tp = NULL; - struct psai *psaip = NULL; - struct mip_reg_info *mi; - struct mip_reg *host, *mip; - - tp = oemptr; - - tp += 8; - - for (i = 0; i <= 6; i++) { - type = *tp++; - size = *tp++; - tp -= 2; - switch (type) { - case MIP_REG: - mi = (struct mip_reg_info *)tp; - val = MIP_RD_LO(mi->host_reg); - host_addr = val; - host = (struct mip_reg *)val; - host_reg = __va(host); - val = MIP_RD_LO(mi->mip_reg); - mip_port = MIP_PORT(mi->mip_info); - mip_addr = val; - mip = (struct mip_reg *)val; - mip_reg = __va(mip); - pr_debug("host_reg = 0x%lx\n", - (unsigned long)host_reg); - pr_debug("mip_reg = 0x%lx\n", - (unsigned long)mip_reg); - success++; - break; - case MIP_PSAI_REG: - psaip = (struct psai *)tp; - if (tp != NULL) { - if (psaip->addr) - psai = __va(psaip->addr); - else - psai = NULL; - success++; - } - break; - default: - break; - } - tp += size; - } - - if (success < 2) - es7000_plat = NON_UNISYS; - else - setup_unisys(); - - return es7000_plat; -} - -#ifdef CONFIG_ACPI -static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) -{ - struct acpi_table_header *header = NULL; - struct es7000_oem_table *table; - acpi_size tbl_size; - acpi_status ret; - int i = 0; - - for (;;) { - ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size); - if (!ACPI_SUCCESS(ret)) - return -1; - - if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) - break; - - early_acpi_os_unmap_memory(header, tbl_size); - } - - table = (void *)header; - - oem_addrX = table->OEMTableAddr; - oem_size = table->OEMTableSize; - - early_acpi_os_unmap_memory(header, tbl_size); - - *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, oem_size); - - return 0; -} - -static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) -{ - if (!oem_addr) - return; - - __acpi_unmap_table((char *)oem_addr, oem_size); -} - -static int es7000_check_dsdt(void) -{ - struct acpi_table_header header; - - if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) && - !strncmp(header.oem_id, "UNISYS", 6)) - return 1; - return 0; -} - -static int es7000_acpi_ret; - -/* Hook from generic ACPI tables.c */ -static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - unsigned long oem_addr = 0; - int check_dsdt; - int ret = 0; - - /* check dsdt at first to avoid clear fix_map for oem_addr */ - check_dsdt = es7000_check_dsdt(); - - if (!find_unisys_acpi_oem_table(&oem_addr)) { - if (check_dsdt) { - ret = parse_unisys_oem((char *)oem_addr); - } else { - setup_unisys(); - ret = 1; - } - /* - * we need to unmap it - */ - unmap_unisys_acpi_oem_table(oem_addr); - } - - es7000_acpi_ret = ret; - - return ret && !es7000_apic_is_cluster(); -} - -static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id) -{ - int ret = es7000_acpi_ret; - - return ret && es7000_apic_is_cluster(); -} - -#else /* !CONFIG_ACPI: */ -static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - return 0; -} - -static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id) -{ - return 0; -} -#endif /* !CONFIG_ACPI */ - -static void es7000_spin(int n) -{ - int i = 0; - - while (i++ < n) - rep_nop(); -} - -static int es7000_mip_write(struct mip_reg *mip_reg) -{ - int status = 0; - int spin; - - spin = MIP_SPIN; - while ((host_reg->off_0x38 & MIP_VALID) != 0) { - if (--spin <= 0) { - WARN(1, "Timeout waiting for Host Valid Flag\n"); - return -1; - } - es7000_spin(MIP_SPIN); - } - - memcpy(host_reg, mip_reg, sizeof(struct mip_reg)); - outb(1, mip_port); - - spin = MIP_SPIN; - - while ((mip_reg->off_0x38 & MIP_VALID) == 0) { - if (--spin <= 0) { - WARN(1, "Timeout waiting for MIP Valid Flag\n"); - return -1; - } - es7000_spin(MIP_SPIN); - } - - status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48; - mip_reg->off_0x38 &= ~MIP_VALID; - - return status; -} - -static void es7000_enable_apic_mode(void) -{ - struct mip_reg es7000_mip_reg; - int mip_status; - - if (!es7000_plat) - return; - - pr_info("Enabling APIC mode.\n"); - memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); - es7000_mip_reg.off_0x00 = MIP_SW_APIC; - es7000_mip_reg.off_0x38 = MIP_VALID; - - while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) - WARN(1, "Command failed, status = %x\n", mip_status); -} - -static void es7000_wait_for_init_deassert(atomic_t *deassert) -{ - while (!atomic_read(deassert)) - cpu_relax(); -} - -static unsigned int es7000_get_apic_id(unsigned long x) -{ - return (x >> 24) & 0xFF; -} - -static void es7000_send_IPI_mask(const struct cpumask *mask, int vector) -{ - default_send_IPI_mask_sequence_phys(mask, vector); -} - -static void es7000_send_IPI_allbutself(int vector) -{ - default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); -} - -static void es7000_send_IPI_all(int vector) -{ - es7000_send_IPI_mask(cpu_online_mask, vector); -} - -static int es7000_apic_id_registered(void) -{ - return 1; -} - -static const struct cpumask *target_cpus_cluster(void) -{ - return cpu_all_mask; -} - -static const struct cpumask *es7000_target_cpus(void) -{ - return cpumask_of(smp_processor_id()); -} - -static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid) -{ - return 0; -} - -static unsigned long es7000_check_apicid_present(int bit) -{ - return physid_isset(bit, phys_cpu_present_map); -} - -static int es7000_early_logical_apicid(int cpu) -{ - /* on es7000, logical apicid is the same as physical */ - return early_per_cpu(x86_bios_cpu_apicid, cpu); -} - -static unsigned long calculate_ldr(int cpu) -{ - unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); - - return SET_APIC_LOGICAL_ID(id); -} - -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LdR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ -static void es7000_init_apic_ldr_cluster(void) -{ - unsigned long val; - int cpu = smp_processor_id(); - - apic_write(APIC_DFR, APIC_DFR_CLUSTER); - val = calculate_ldr(cpu); - apic_write(APIC_LDR, val); -} - -static void es7000_init_apic_ldr(void) -{ - unsigned long val; - int cpu = smp_processor_id(); - - apic_write(APIC_DFR, APIC_DFR_FLAT); - val = calculate_ldr(cpu); - apic_write(APIC_LDR, val); -} - -static void es7000_setup_apic_routing(void) -{ - int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); - - pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", - (apic_version[apic] == 0x14) ? - "Physical Cluster" : "Logical Cluster", - nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); -} - -static int es7000_cpu_present_to_apicid(int mps_cpu) -{ - if (!mps_cpu) - return boot_cpu_physical_apicid; - else if (mps_cpu < nr_cpu_ids) - return per_cpu(x86_bios_cpu_apicid, mps_cpu); - else - return BAD_APICID; -} - -static int cpu_id; - -static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap) -{ - physid_set_mask_of_physid(cpu_id, retmap); - ++cpu_id; -} - -static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ - /* For clustered we don't have a good way to do this yet - hack */ - physids_promote(0xFFL, retmap); -} - -static int es7000_check_phys_apicid_present(int cpu_physical_apicid) -{ - boot_cpu_physical_apicid = read_apic_id(); - return 1; -} - -static inline int -es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) -{ - unsigned int round = 0; - unsigned int cpu, uninitialized_var(apicid); - - /* - * The cpus in the mask must all be on the apic cluster. - */ - for_each_cpu_and(cpu, cpumask, cpu_online_mask) { - int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - - if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { - WARN(1, "Not a valid mask!"); - - return -EINVAL; - } - apicid |= new_apicid; - round++; - } - if (!round) - return -EINVAL; - *dest_id = apicid; - return 0; -} - -static int -es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, - const struct cpumask *andmask, - unsigned int *apicid) -{ - cpumask_var_t cpumask; - *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); - - if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) - return 0; - - cpumask_and(cpumask, inmask, andmask); - es7000_cpu_mask_to_apicid(cpumask, apicid); - - free_cpumask_var(cpumask); - - return 0; -} - -static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) -{ - return cpuid_apic >> index_msb; -} - -static int probe_es7000(void) -{ - /* probed later in mptable/ACPI hooks */ - return 0; -} - -static int es7000_mps_ret; -static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem, - char *productid) -{ - int ret = 0; - - if (mpc->oemptr) { - struct mpc_oemtable *oem_table = - (struct mpc_oemtable *)mpc->oemptr; - - if (!strncmp(oem, "UNISYS", 6)) - ret = parse_unisys_oem((char *)oem_table); - } - - es7000_mps_ret = ret; - - return ret && !es7000_apic_is_cluster(); -} - -static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem, - char *productid) -{ - int ret = es7000_mps_ret; - - return ret && es7000_apic_is_cluster(); -} - -/* We've been warned by a false positive warning.Use __refdata to keep calm. */ -static struct apic __refdata apic_es7000_cluster = { - - .name = "es7000", - .probe = probe_es7000, - .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = es7000_apic_id_registered, - - .irq_delivery_mode = dest_LowestPrio, - /* logical delivery broadcast to all procs: */ - .irq_dest_mode = 1, - - .target_cpus = target_cpus_cluster, - .disable_esr = 1, - .dest_logical = 0, - .check_apicid_used = es7000_check_apicid_used, - .check_apicid_present = es7000_check_apicid_present, - - .vector_allocation_domain = flat_vector_allocation_domain, - .init_apic_ldr = es7000_init_apic_ldr_cluster, - - .ioapic_phys_id_map = es7000_ioapic_phys_id_map, - .setup_apic_routing = es7000_setup_apic_routing, - .multi_timer_check = NULL, - .cpu_present_to_apicid = es7000_cpu_present_to_apicid, - .apicid_to_cpu_present = es7000_apicid_to_cpu_present, - .setup_portio_remap = NULL, - .check_phys_apicid_present = es7000_check_phys_apicid_present, - .enable_apic_mode = es7000_enable_apic_mode, - .phys_pkg_id = es7000_phys_pkg_id, - .mps_oem_check = es7000_mps_oem_check_cluster, - - .get_apic_id = es7000_get_apic_id, - .set_apic_id = NULL, - .apic_id_mask = 0xFF << 24, - - .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, - - .send_IPI_mask = es7000_send_IPI_mask, - .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = es7000_send_IPI_allbutself, - .send_IPI_all = es7000_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - - .wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip, - - .trampoline_phys_low = 0x467, - .trampoline_phys_high = 0x469, - - .wait_for_init_deassert = NULL, - - /* Nothing to do for most platforms, since cleared by the INIT cycle: */ - .smp_callin_clear_local_apic = NULL, - .inquire_remote_apic = default_inquire_remote_apic, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = es7000_early_logical_apicid, -}; - -static struct apic __refdata apic_es7000 = { - - .name = "es7000", - .probe = probe_es7000, - .acpi_madt_oem_check = es7000_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = es7000_apic_id_registered, - - .irq_delivery_mode = dest_Fixed, - /* phys delivery to target CPUs: */ - .irq_dest_mode = 0, - - .target_cpus = es7000_target_cpus, - .disable_esr = 1, - .dest_logical = 0, - .check_apicid_used = es7000_check_apicid_used, - .check_apicid_present = es7000_check_apicid_present, - - .vector_allocation_domain = flat_vector_allocation_domain, - .init_apic_ldr = es7000_init_apic_ldr, - - .ioapic_phys_id_map = es7000_ioapic_phys_id_map, - .setup_apic_routing = es7000_setup_apic_routing, - .multi_timer_check = NULL, - .cpu_present_to_apicid = es7000_cpu_present_to_apicid, - .apicid_to_cpu_present = es7000_apicid_to_cpu_present, - .setup_portio_remap = NULL, - .check_phys_apicid_present = es7000_check_phys_apicid_present, - .enable_apic_mode = es7000_enable_apic_mode, - .phys_pkg_id = es7000_phys_pkg_id, - .mps_oem_check = es7000_mps_oem_check, - - .get_apic_id = es7000_get_apic_id, - .set_apic_id = NULL, - .apic_id_mask = 0xFF << 24, - - .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, - - .send_IPI_mask = es7000_send_IPI_mask, - .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = es7000_send_IPI_allbutself, - .send_IPI_all = es7000_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - - .trampoline_phys_low = 0x467, - .trampoline_phys_high = 0x469, - - .wait_for_init_deassert = es7000_wait_for_init_deassert, - - /* Nothing to do for most platforms, since cleared by the INIT cycle: */ - .smp_callin_clear_local_apic = NULL, - .inquire_remote_apic = default_inquire_remote_apic, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = es7000_early_logical_apicid, -}; - -/* - * Need to check for es7000 followed by es7000_cluster, so this order - * in apic_drivers is important. - */ -apic_drivers(apic_es7000, apic_es7000_cluster); diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c deleted file mode 100644 index 1e42e8f..0000000 --- a/arch/x86/kernel/apic/numaq_32.c +++ /dev/null @@ -1,525 +0,0 @@ -/* - * Written by: Patricia Gaughen, IBM Corporation - * - * Copyright (C) 2002, IBM Corp. - * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to <gone@us.ibm.com> - */ -#include <linux/nodemask.h> -#include <linux/topology.h> -#include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/kernel.h> -#include <linux/mmzone.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/init.h> -#include <linux/numa.h> -#include <linux/smp.h> -#include <linux/io.h> -#include <linux/mm.h> - -#include <asm/processor.h> -#include <asm/fixmap.h> -#include <asm/mpspec.h> -#include <asm/numaq.h> -#include <asm/setup.h> -#include <asm/apic.h> -#include <asm/e820.h> -#include <asm/ipi.h> - -int found_numaq; - -/* - * Have to match translation table entries to main table entries by counter - * hence the mpc_record variable .... can't see a less disgusting way of - * doing this .... - */ -struct mpc_trans { - unsigned char mpc_type; - unsigned char trans_len; - unsigned char trans_type; - unsigned char trans_quad; - unsigned char trans_global; - unsigned char trans_local; - unsigned short trans_reserved; -}; - -static int mpc_record; - -static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; - -int mp_bus_id_to_node[MAX_MP_BUSSES]; -int mp_bus_id_to_local[MAX_MP_BUSSES]; -int quad_local_to_mp_bus_id[NR_CPUS/4][4]; - - -static inline void numaq_register_node(int node, struct sys_cfg_data *scd) -{ - struct eachquadmem *eq = scd->eq + node; - u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20; - u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20; - int ret; - - node_set(node, numa_nodes_parsed); - ret = numa_add_memblk(node, start, end); - BUG_ON(ret < 0); -} - -/* - * Function: smp_dump_qct() - * - * Description: gets memory layout from the quad config table. This - * function also updates numa_nodes_parsed with the nodes (quads) present. - */ -static void __init smp_dump_qct(void) -{ - struct sys_cfg_data *scd; - int node; - - scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); - - for_each_node(node) { - if (scd->quads_present31_0 & (1 << node)) - numaq_register_node(node, scd); - } -} - -void numaq_tsc_disable(void) -{ - if (!found_numaq) - return; - - if (num_online_nodes() > 1) { - printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - setup_clear_cpu_cap(X86_FEATURE_TSC); - } -} - -static void __init numaq_tsc_init(void) -{ - numaq_tsc_disable(); -} - -static inline int generate_logical_apicid(int quad, int phys_apicid) -{ - return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1); -} - -/* x86_quirks member */ -static int mpc_apic_id(struct mpc_cpu *m) -{ - int quad = translation_table[mpc_record]->trans_quad; - int logical_apicid = generate_logical_apicid(quad, m->apicid); - - printk(KERN_DEBUG - "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", - m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8, - (m->cpufeature & CPU_MODEL_MASK) >> 4, - m->apicver, quad, logical_apicid); - - return logical_apicid; -} - -/* x86_quirks member */ -static void mpc_oem_bus_info(struct mpc_bus *m, char *name) -{ - int quad = translation_table[mpc_record]->trans_quad; - int local = translation_table[mpc_record]->trans_local; - - mp_bus_id_to_node[m->busid] = quad; - mp_bus_id_to_local[m->busid] = local; - - printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad); -} - -/* x86_quirks member */ -static void mpc_oem_pci_bus(struct mpc_bus *m) -{ - int quad = translation_table[mpc_record]->trans_quad; - int local = translation_table[mpc_record]->trans_local; - - quad_local_to_mp_bus_id[quad][local] = m->busid; -} - -/* - * Called from mpparse code. - * mode = 0: prescan - * mode = 1: one mpc entry scanned - */ -static void numaq_mpc_record(unsigned int mode) -{ - if (!mode) - mpc_record = 0; - else - mpc_record++; -} - -static void __init MP_translation_info(struct mpc_trans *m) -{ - printk(KERN_INFO - "Translation: record %d, type %d, quad %d, global %d, local %d\n", - mpc_record, m->trans_type, m->trans_quad, m->trans_global, - m->trans_local); - - if (mpc_record >= MAX_MPC_ENTRY) - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); - else - translation_table[mpc_record] = m; /* stash this for later */ - - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) - node_set_online(m->trans_quad); -} - -static int __init mpf_checksum(unsigned char *mp, int len) -{ - int sum = 0; - - while (len--) - sum += *mp++; - - return sum & 0xFF; -} - -/* - * Read/parse the MPC oem tables - */ -static void __init smp_read_mpc_oem(struct mpc_table *mpc) -{ - struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr; - int count = sizeof(*oemtable); /* the header size */ - unsigned char *oemptr = ((unsigned char *)oemtable) + count; - - mpc_record = 0; - printk(KERN_INFO - "Found an OEM MPC table at %8p - parsing it...\n", oemtable); - - if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { - printk(KERN_WARNING - "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", - oemtable->signature[0], oemtable->signature[1], - oemtable->signature[2], oemtable->signature[3]); - return; - } - - if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) { - printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); - return; - } - - while (count < oemtable->length) { - switch (*oemptr) { - case MP_TRANSLATION: - { - struct mpc_trans *m = (void *)oemptr; - - MP_translation_info(m); - oemptr += sizeof(*m); - count += sizeof(*m); - ++mpc_record; - break; - } - default: - printk(KERN_WARNING - "Unrecognised OEM table entry type! - %d\n", - (int)*oemptr); - return; - } - } -} - -static __init void early_check_numaq(void) -{ - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - early_get_smp_config(); - - if (found_numaq) { - x86_init.mpparse.mpc_record = numaq_mpc_record; - x86_init.mpparse.setup_ioapic_ids = x86_init_noop; - x86_init.mpparse.mpc_apic_id = mpc_apic_id; - x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; - x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; - x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; - x86_init.timers.tsc_pre_init = numaq_tsc_init; - x86_init.pci.init = pci_numaq_init; - } -} - -int __init numaq_numa_init(void) -{ - early_check_numaq(); - if (!found_numaq) - return -ENOENT; - smp_dump_qct(); - - return 0; -} - -#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) - -static inline unsigned int numaq_get_apic_id(unsigned long x) -{ - return (x >> 24) & 0x0F; -} - -static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector) -{ - default_send_IPI_mask_sequence_logical(mask, vector); -} - -static inline void numaq_send_IPI_allbutself(int vector) -{ - default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector); -} - -static inline void numaq_send_IPI_all(int vector) -{ - numaq_send_IPI_mask(cpu_online_mask, vector); -} - -#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8) -#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa) - -/* - * Because we use NMIs rather than the INIT-STARTUP sequence to - * bootstrap the CPUs, the APIC may be in a weird state. Kick it: - */ -static inline void numaq_smp_callin_clear_local_apic(void) -{ - clear_local_APIC(); -} - -static inline const struct cpumask *numaq_target_cpus(void) -{ - return cpu_all_mask; -} - -static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid) -{ - return physid_isset(apicid, *map); -} - -static inline unsigned long numaq_check_apicid_present(int bit) -{ - return physid_isset(bit, phys_cpu_present_map); -} - -static inline int numaq_apic_id_registered(void) -{ - return 1; -} - -static inline void numaq_init_apic_ldr(void) -{ - /* Already done in NUMA-Q firmware */ -} - -static inline void numaq_setup_apic_routing(void) -{ - printk(KERN_INFO - "Enabling APIC mode: NUMA-Q. Using %d I/O APICs\n", - nr_ioapics); -} - -/* - * Skip adding the timer int on secondary nodes, which causes - * a small but painful rift in the time-space continuum. - */ -static inline int numaq_multi_timer_check(int apic, int irq) -{ - return apic != 0 && irq == 0; -} - -static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ - /* We don't have a good way to do this yet - hack */ - return physids_promote(0xFUL, retmap); -} - -/* - * Supporting over 60 cpus on NUMA-Q requires a locality-dependent - * cpu to APIC ID relation to properly interact with the intelligent - * mode of the cluster controller. - */ -static inline int numaq_cpu_present_to_apicid(int mps_cpu) -{ - if (mps_cpu < 60) - return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3)); - else - return BAD_APICID; -} - -static inline int numaq_apicid_to_node(int logical_apicid) -{ - return logical_apicid >> 4; -} - -static int numaq_numa_cpu_node(int cpu) -{ - int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - - if (logical_apicid != BAD_APICID) - return numaq_apicid_to_node(logical_apicid); - return NUMA_NO_NODE; -} - -static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) -{ - int node = numaq_apicid_to_node(logical_apicid); - int cpu = __ffs(logical_apicid & 0xf); - - physid_set_mask_of_physid(cpu + 4*node, retmap); -} - -/* Where the IO area was mapped on multiquad, always 0 otherwise */ -void *xquad_portio; - -static inline int numaq_check_phys_apicid_present(int phys_apicid) -{ - return 1; -} - -/* - * We use physical apicids here, not logical, so just return the default - * physical broadcast to stop people from breaking us - */ -static int -numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask, - unsigned int *apicid) -{ - *apicid = 0x0F; - return 0; -} - -/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ -static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb) -{ - return cpuid_apic >> index_msb; -} - -static int -numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) -{ - if (strncmp(oem, "IBM NUMA", 8)) - printk(KERN_ERR "Warning! Not a NUMA-Q system!\n"); - else - found_numaq = 1; - - return found_numaq; -} - -static int probe_numaq(void) -{ - /* already know from get_memcfg_numaq() */ - return found_numaq; -} - -static void numaq_setup_portio_remap(void) -{ - int num_quads = num_online_nodes(); - - if (num_quads <= 1) - return; - - printk(KERN_INFO - "Remapping cross-quad port I/O for %d quads\n", num_quads); - - xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD); - - printk(KERN_INFO - "xquad_portio vaddr 0x%08lx, len %08lx\n", - (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); -} - -/* Use __refdata to keep false positive warning calm. */ -static struct apic __refdata apic_numaq = { - - .name = "NUMAQ", - .probe = probe_numaq, - .acpi_madt_oem_check = NULL, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = numaq_apic_id_registered, - - .irq_delivery_mode = dest_LowestPrio, - /* physical delivery on LOCAL quad: */ - .irq_dest_mode = 0, - - .target_cpus = numaq_target_cpus, - .disable_esr = 1, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = numaq_check_apicid_used, - .check_apicid_present = numaq_check_apicid_present, - - .vector_allocation_domain = flat_vector_allocation_domain, - .init_apic_ldr = numaq_init_apic_ldr, - - .ioapic_phys_id_map = numaq_ioapic_phys_id_map, - .setup_apic_routing = numaq_setup_apic_routing, - .multi_timer_check = numaq_multi_timer_check, - .cpu_present_to_apicid = numaq_cpu_present_to_apicid, - .apicid_to_cpu_present = numaq_apicid_to_cpu_present, - .setup_portio_remap = numaq_setup_portio_remap, - .check_phys_apicid_present = numaq_check_phys_apicid_present, - .enable_apic_mode = NULL, - .phys_pkg_id = numaq_phys_pkg_id, - .mps_oem_check = numaq_mps_oem_check, - - .get_apic_id = numaq_get_apic_id, - .set_apic_id = NULL, - .apic_id_mask = 0x0F << 24, - - .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and, - - .send_IPI_mask = numaq_send_IPI_mask, - .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = numaq_send_IPI_allbutself, - .send_IPI_all = numaq_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - - .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi, - .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH, - - /* We don't do anything here because we use NMI's to boot instead */ - .wait_for_init_deassert = NULL, - - .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic, - .inquire_remote_apic = NULL, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, - .x86_32_numa_cpu_node = numaq_numa_cpu_node, -}; - -apic_driver(apic_numaq); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index eb35ef9..cceb352 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -119,8 +119,7 @@ static struct apic apic_default = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = default_wait_for_init_deassert, - + .wait_for_init_deassert = true, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c deleted file mode 100644 index 00146f9..0000000 --- a/arch/x86/kernel/apic/summit_32.c +++ /dev/null @@ -1,551 +0,0 @@ -/* - * IBM Summit-Specific Code - * - * Written By: Matthew Dobson, IBM Corporation - * - * Copyright (c) 2003 IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to <colpatch@us.ibm.com> - * - */ - -#define pr_fmt(fmt) "summit: %s: " fmt, __func__ - -#include <linux/mm.h> -#include <asm/io.h> -#include <asm/bios_ebda.h> - -/* - * APIC driver for the IBM "Summit" chipset. - */ -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <asm/mpspec.h> -#include <asm/apic.h> -#include <asm/smp.h> -#include <asm/fixmap.h> -#include <asm/apicdef.h> -#include <asm/ipi.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/gfp.h> -#include <linux/smp.h> - -static unsigned summit_get_apic_id(unsigned long x) -{ - return (x >> 24) & 0xFF; -} - -static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector) -{ - default_send_IPI_mask_sequence_logical(mask, vector); -} - -static void summit_send_IPI_allbutself(int vector) -{ - default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector); -} - -static void summit_send_IPI_all(int vector) -{ - summit_send_IPI_mask(cpu_online_mask, vector); -} - -#include <asm/tsc.h> - -extern int use_cyclone; - -#ifdef CONFIG_X86_SUMMIT_NUMA -static void setup_summit(void); -#else -static inline void setup_summit(void) {} -#endif - -static int summit_mps_oem_check(struct mpc_table *mpc, char *oem, - char *productid) -{ - if (!strncmp(oem, "IBM ENSW", 8) && - (!strncmp(productid, "VIGIL SMP", 9) - || !strncmp(productid, "EXA", 3) - || !strncmp(productid, "RUTHLESS SMP", 12))){ - mark_tsc_unstable("Summit based system"); - use_cyclone = 1; /*enable cyclone-timer*/ - setup_summit(); - return 1; - } - return 0; -} - -/* Hook from generic ACPI tables.c */ -static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - if (!strncmp(oem_id, "IBM", 3) && - (!strncmp(oem_table_id, "SERVIGIL", 8) - || !strncmp(oem_table_id, "EXA", 3))){ - mark_tsc_unstable("Summit based system"); - use_cyclone = 1; /*enable cyclone-timer*/ - setup_summit(); - return 1; - } - return 0; -} - -struct rio_table_hdr { - unsigned char version; /* Version number of this data structure */ - /* Version 3 adds chassis_num & WP_index */ - unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */ - unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */ -} __attribute__((packed)); - -struct scal_detail { - unsigned char node_id; /* Scalability Node ID */ - unsigned long CBAR; /* Address of 1MB register space */ - unsigned char port0node; /* Node ID port connected to: 0xFF=None */ - unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ - unsigned char port1node; /* Node ID port connected to: 0xFF = None */ - unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ - unsigned char port2node; /* Node ID port connected to: 0xFF = None */ - unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */ - unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */ -} __attribute__((packed)); - -struct rio_detail { - unsigned char node_id; /* RIO Node ID */ - unsigned long BBAR; /* Address of 1MB register space */ - unsigned char type; /* Type of device */ - unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/ - /* For CYC: Node ID of Twister that owns this CYC */ - unsigned char port0node; /* Node ID port connected to: 0xFF=None */ - unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ - unsigned char port1node; /* Node ID port connected to: 0xFF=None */ - unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ - unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */ - /* For CYC: 0 */ - unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */ - /* = 0 : the XAPIC is not used, ie:*/ - /* ints fwded to another XAPIC */ - /* Bits1:7 Reserved */ - /* For CYC: Bits0:7 Reserved */ - unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */ - /* lower slot numbers/PCI bus numbers */ - /* For CYC: No meaning */ - unsigned char chassis_num; /* 1 based Chassis number */ - /* For LookOut WPEGs this field indicates the */ - /* Expansion Chassis #, enumerated from Boot */ - /* Node WPEG external port, then Boot Node CYC */ - /* external port, then Next Vigil chassis WPEG */ - /* external port, etc. */ - /* Shared Lookouts have only 1 chassis number (the */ - /* first one assigned) */ -} __attribute__((packed)); - - -typedef enum { - CompatTwister = 0, /* Compatibility Twister */ - AltTwister = 1, /* Alternate Twister of internal 8-way */ - CompatCyclone = 2, /* Compatibility Cyclone */ - AltCyclone = 3, /* Alternate Cyclone of internal 8-way */ - CompatWPEG = 4, /* Compatibility WPEG */ - AltWPEG = 5, /* Second Planar WPEG */ - LookOutAWPEG = 6, /* LookOut WPEG */ - LookOutBWPEG = 7, /* LookOut WPEG */ -} node_type; - -static inline int is_WPEG(struct rio_detail *rio){ - return (rio->type == CompatWPEG || rio->type == AltWPEG || - rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); -} - -#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) - -static const struct cpumask *summit_target_cpus(void) -{ - /* CPU_MASK_ALL (0xff) has undefined behaviour with - * dest_LowestPrio mode logical clustered apic interrupt routing - * Just start on cpu 0. IRQ balancing will spread load - */ - return cpumask_of(0); -} - -static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid) -{ - return 0; -} - -/* we don't use the phys_cpu_present_map to indicate apicid presence */ -static unsigned long summit_check_apicid_present(int bit) -{ - return 1; -} - -static int summit_early_logical_apicid(int cpu) -{ - int count = 0; - u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu); - u8 my_cluster = APIC_CLUSTER(my_id); -#ifdef CONFIG_SMP - u8 lid; - int i; - - /* Create logical APIC IDs by counting CPUs already in cluster. */ - for (count = 0, i = nr_cpu_ids; --i >= 0; ) { - lid = early_per_cpu(x86_cpu_to_logical_apicid, i); - if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) - ++count; - } -#endif - /* We only have a 4 wide bitmap in cluster mode. If a deranged - * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ - BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); - return my_cluster | (1UL << count); -} - -static void summit_init_apic_ldr(void) -{ - int cpu = smp_processor_id(); - unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - unsigned long val; - - apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(id); - apic_write(APIC_LDR, val); -} - -static int summit_apic_id_registered(void) -{ - return 1; -} - -static void summit_setup_apic_routing(void) -{ - pr_info("Enabling APIC mode: Summit. Using %d I/O APICs\n", - nr_ioapics); -} - -static int summit_cpu_present_to_apicid(int mps_cpu) -{ - if (mps_cpu < nr_cpu_ids) - return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); - else - return BAD_APICID; -} - -static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap) -{ - /* For clustered we don't have a good way to do this yet - hack */ - physids_promote(0x0FL, retmap); -} - -static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap) -{ - physid_set_mask_of_physid(0, retmap); -} - -static int summit_check_phys_apicid_present(int physical_apicid) -{ - return 1; -} - -static inline int -summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) -{ - unsigned int round = 0; - unsigned int cpu, apicid = 0; - - /* - * The cpus in the mask must all be on the apic cluster. - */ - for_each_cpu_and(cpu, cpumask, cpu_online_mask) { - int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - - if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { - pr_err("Not a valid mask!\n"); - return -EINVAL; - } - apicid |= new_apicid; - round++; - } - if (!round) - return -EINVAL; - *dest_id = apicid; - return 0; -} - -static int -summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, - const struct cpumask *andmask, - unsigned int *apicid) -{ - cpumask_var_t cpumask; - *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); - - if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) - return 0; - - cpumask_and(cpumask, inmask, andmask); - summit_cpu_mask_to_apicid(cpumask, apicid); - - free_cpumask_var(cpumask); - - return 0; -} - -/* - * cpuid returns the value latched in the HW at reset, not the APIC ID - * register's value. For any box whose BIOS changes APIC IDs, like - * clustered APIC systems, we must use hard_smp_processor_id. - * - * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. - */ -static int summit_phys_pkg_id(int cpuid_apic, int index_msb) -{ - return hard_smp_processor_id() >> index_msb; -} - -static int probe_summit(void) -{ - /* probed later in mptable/ACPI hooks */ - return 0; -} - -#ifdef CONFIG_X86_SUMMIT_NUMA -static struct rio_table_hdr *rio_table_hdr; -static struct scal_detail *scal_devs[MAX_NUMNODES]; -static struct rio_detail *rio_devs[MAX_NUMNODES*4]; - -#ifndef CONFIG_X86_NUMAQ -static int mp_bus_id_to_node[MAX_MP_BUSSES]; -#endif - -static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) -{ - int twister = 0, node = 0; - int i, bus, num_buses; - - for (i = 0; i < rio_table_hdr->num_rio_dev; i++) { - if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) { - twister = rio_devs[i]->owner_id; - break; - } - } - if (i == rio_table_hdr->num_rio_dev) { - pr_err("Couldn't find owner Cyclone for Winnipeg!\n"); - return last_bus; - } - - for (i = 0; i < rio_table_hdr->num_scal_dev; i++) { - if (scal_devs[i]->node_id == twister) { - node = scal_devs[i]->node_id; - break; - } - } - if (i == rio_table_hdr->num_scal_dev) { - pr_err("Couldn't find owner Twister for Cyclone!\n"); - return last_bus; - } - - switch (rio_devs[wpeg_num]->type) { - case CompatWPEG: - /* - * The Compatibility Winnipeg controls the 2 legacy buses, - * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case - * a PCI-PCI bridge card is used in either slot: total 5 buses. - */ - num_buses = 5; - break; - case AltWPEG: - /* - * The Alternate Winnipeg controls the 2 133MHz buses [1 slot - * each], their 2 "extra" buses, the 100MHz bus [2 slots] and - * the "extra" buses for each of those slots: total 7 buses. - */ - num_buses = 7; - break; - case LookOutAWPEG: - case LookOutBWPEG: - /* - * A Lookout Winnipeg controls 3 100MHz buses [2 slots each] - * & the "extra" buses for each of those slots: total 9 buses. - */ - num_buses = 9; - break; - default: - pr_info("Unsupported Winnipeg type!\n"); - return last_bus; - } - - for (bus = last_bus; bus < last_bus + num_buses; bus++) - mp_bus_id_to_node[bus] = node; - return bus; -} - -static int build_detail_arrays(void) -{ - unsigned long ptr; - int i, scal_detail_size, rio_detail_size; - - if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { - pr_warn("MAX_NUMNODES too low! Defined as %d, but system has %d nodes\n", - MAX_NUMNODES, rio_table_hdr->num_scal_dev); - return 0; - } - - switch (rio_table_hdr->version) { - default: - pr_warn("Invalid Rio Grande Table Version: %d\n", - rio_table_hdr->version); - return 0; - case 2: - scal_detail_size = 11; - rio_detail_size = 13; - break; - case 3: - scal_detail_size = 12; - rio_detail_size = 15; - break; - } - - ptr = (unsigned long)rio_table_hdr + 3; - for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size) - scal_devs[i] = (struct scal_detail *)ptr; - - for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size) - rio_devs[i] = (struct rio_detail *)ptr; - - return 1; -} - -void setup_summit(void) -{ - unsigned long ptr; - unsigned short offset; - int i, next_wpeg, next_bus = 0; - - /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */ - ptr = get_bios_ebda(); - ptr = (unsigned long)phys_to_virt(ptr); - - rio_table_hdr = NULL; - offset = 0x180; - while (offset) { - /* The block id is stored in the 2nd word */ - if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) { - /* set the pointer past the offset & block id */ - rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); - break; - } - /* The next offset is stored in the 1st word. 0 means no more */ - offset = *((unsigned short *)(ptr + offset)); - } - if (!rio_table_hdr) { - pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n"); - return; - } - - if (!build_detail_arrays()) - return; - - /* The first Winnipeg we're looking for has an index of 0 */ - next_wpeg = 0; - do { - for (i = 0; i < rio_table_hdr->num_rio_dev; i++) { - if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) { - /* It's the Winnipeg we're looking for! */ - next_bus = setup_pci_node_map_for_wpeg(i, next_bus); - next_wpeg++; - break; - } - } - /* - * If we go through all Rio devices and don't find one with - * the next index, it means we've found all the Winnipegs, - * and thus all the PCI buses. - */ - if (i == rio_table_hdr->num_rio_dev) - next_wpeg = 0; - } while (next_wpeg != 0); -} -#endif - -static struct apic apic_summit = { - - .name = "summit", - .probe = probe_summit, - .acpi_madt_oem_check = summit_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = summit_apic_id_registered, - - .irq_delivery_mode = dest_LowestPrio, - /* logical delivery broadcast to all CPUs: */ - .irq_dest_mode = 1, - - .target_cpus = summit_target_cpus, - .disable_esr = 1, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = summit_check_apicid_used, - .check_apicid_present = summit_check_apicid_present, - - .vector_allocation_domain = flat_vector_allocation_domain, - .init_apic_ldr = summit_init_apic_ldr, - - .ioapic_phys_id_map = summit_ioapic_phys_id_map, - .setup_apic_routing = summit_setup_apic_routing, - .multi_timer_check = NULL, - .cpu_present_to_apicid = summit_cpu_present_to_apicid, - .apicid_to_cpu_present = summit_apicid_to_cpu_present, - .setup_portio_remap = NULL, - .check_phys_apicid_present = summit_check_phys_apicid_present, - .enable_apic_mode = NULL, - .phys_pkg_id = summit_phys_pkg_id, - .mps_oem_check = summit_mps_oem_check, - - .get_apic_id = summit_get_apic_id, - .set_apic_id = NULL, - .apic_id_mask = 0xFF << 24, - - .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and, - - .send_IPI_mask = summit_send_IPI_mask, - .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = summit_send_IPI_allbutself, - .send_IPI_all = summit_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - - .wait_for_init_deassert = default_wait_for_init_deassert, - - .smp_callin_clear_local_apic = NULL, - .inquire_remote_apic = default_inquire_remote_apic, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = summit_early_logical_apicid, -}; - -apic_driver(apic_summit); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index cac85ee..e66766b 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -279,7 +279,7 @@ static struct apic apic_x2apic_cluster = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index de231e3..6d600eb 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -133,7 +133,7 @@ static struct apic apic_x2apic_phys = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d263b13..7834389 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -396,7 +396,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .wakeup_secondary_cpu = uv_wakeup_secondary, .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c67ffa6..ce8b8ff 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -218,7 +218,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c) */ WARN_ONCE(1, "WARNING: This combination of AMD" " processors is not suitable for SMP.\n"); - add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); } static void init_amd_k7(struct cpuinfo_x86 *c) @@ -233,9 +233,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c) if (c->x86_model >= 6 && c->x86_model <= 10) { if (!cpu_has(c, X86_FEATURE_XMM)) { printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); - rdmsr(MSR_K7_HWCR, l, h); - l &= ~0x00008000; - wrmsr(MSR_K7_HWCR, l, h); + msr_clear_bit(MSR_K7_HWCR, 15); set_cpu_cap(c, X86_FEATURE_XMM); } } @@ -509,14 +507,8 @@ static void early_init_amd(struct cpuinfo_x86 *c) #endif /* F16h erratum 793, CVE-2013-6885 */ - if (c->x86 == 0x16 && c->x86_model <= 0xf) { - u64 val; - - rdmsrl(MSR_AMD64_LS_CFG, val); - if (!(val & BIT(15))) - wrmsrl(MSR_AMD64_LS_CFG, val | BIT(15)); - } - + if (c->x86 == 0x16 && c->x86_model <= 0xf) + msr_set_bit(MSR_AMD64_LS_CFG, 15); } static const int amd_erratum_383[]; @@ -536,11 +528,8 @@ static void init_amd(struct cpuinfo_x86 *c) * Errata 63 for SH-B3 steppings * Errata 122 for all steppings (F+ have it disabled by default) */ - if (c->x86 == 0xf) { - rdmsrl(MSR_K7_HWCR, value); - value |= 1 << 6; - wrmsrl(MSR_K7_HWCR, value); - } + if (c->x86 == 0xf) + msr_set_bit(MSR_K7_HWCR, 6); #endif early_init_amd(c); @@ -623,14 +612,11 @@ static void init_amd(struct cpuinfo_x86 *c) (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && !cpu_has(c, X86_FEATURE_TOPOEXT)) { - if (!rdmsrl_safe(0xc0011005, &value)) { - value |= 1ULL << 54; - wrmsrl_safe(0xc0011005, value); + if (msr_set_bit(0xc0011005, 54) > 0) { rdmsrl(0xc0011005, value); - if (value & (1ULL << 54)) { + if (value & BIT_64(54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); - printk(KERN_INFO FW_INFO "CPU: Re-enabling " - "disabled Topology Extensions Support\n"); + pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); } } } @@ -709,19 +695,12 @@ static void init_amd(struct cpuinfo_x86 *c) * Disable GART TLB Walk Errors on Fam10h. We do this here * because this is always needed when GART is enabled, even in a * kernel which has no MCE support built in. - * BIOS should disable GartTlbWlk Errors themself. If - * it doesn't do it here as suggested by the BKDG. + * BIOS should disable GartTlbWlk Errors already. If + * it doesn't, do it here as suggested by the BKDG. * * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 */ - u64 mask; - int err; - - err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); - if (err == 0) { - mask |= (1 << 10); - wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask); - } + msr_set_bit(MSR_AMD64_MCx_MASK(4), 10); /* * On family 10h BIOS may not have properly enabled WC+ support, @@ -733,10 +712,7 @@ static void init_amd(struct cpuinfo_x86 *c) * NOTE: we want to use the _safe accessors so as not to #GP kvm * guests on older kvm hosts. */ - - rdmsrl_safe(MSR_AMD64_BU_CFG2, &value); - value &= ~(1ULL << 24); - wrmsrl_safe(MSR_AMD64_BU_CFG2, value); + msr_clear_bit(MSR_AMD64_BU_CFG2, 24); if (cpu_has_amd_erratum(c, amd_erratum_383)) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8e28bf2..a135239 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1025,7 +1025,8 @@ __setup("show_msr=", setup_show_msr); static __init int setup_noclflush(char *arg) { - setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); + setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT); return 1; } __setup("noclflush", setup_noclflush); @@ -1078,6 +1079,10 @@ static __init int setup_disablecpuid(char *arg) } __setup("clearcpuid=", setup_disablecpuid); +DEFINE_PER_CPU(unsigned long, kernel_stack) = + (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(kernel_stack); + #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, @@ -1094,10 +1099,6 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(unsigned long, kernel_stack) = - (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(kernel_stack); - DEFINE_PER_CPU(char *, irq_stack_ptr) = init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 5cd9bfa..a800290 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -31,11 +31,8 @@ static void early_init_intel(struct cpuinfo_x86 *c) /* Unmask CPUID levels if masked: */ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - - if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { - misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; - wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) { c->cpuid_level = cpuid_eax(0); get_cpu_cap(c); } @@ -129,16 +126,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) * Ingo Molnar reported a Pentium D (model 6) and a Xeon * (model 2) with the same problem. */ - if (c->x86 == 15) { - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - - if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { - printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); - - misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; - wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - } - } + if (c->x86 == 15) + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0) + pr_info("kmemcheck: Disabling fast string operations\n"); #endif /* @@ -195,10 +186,16 @@ static void intel_smp_check(struct cpuinfo_x86 *c) } } -static void intel_workarounds(struct cpuinfo_x86 *c) +static int forcepae; +static int __init forcepae_setup(char *__unused) { - unsigned long lo, hi; + forcepae = 1; + return 1; +} +__setup("forcepae", forcepae_setup); +static void intel_workarounds(struct cpuinfo_x86 *c) +{ #ifdef CONFIG_X86_F00F_BUG /* * All current models of Pentium and Pentium with MMX technology CPUs @@ -225,16 +222,26 @@ static void intel_workarounds(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_SEP); /* + * PAE CPUID issue: many Pentium M report no PAE but may have a + * functionally usable PAE implementation. + * Forcefully enable PAE if kernel parameter "forcepae" is present. + */ + if (forcepae) { + printk(KERN_WARNING "PAE forced!\n"); + set_cpu_cap(c, X86_FEATURE_PAE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); + } + + /* * P4 Xeon errata 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { - rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); - if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) { - printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); - printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); - lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; - wrmsr(MSR_IA32_MISC_ENABLE, lo, hi); + if (msr_set_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) + > 0) { + pr_info("CPU: C0 stepping P4 Xeon detected.\n"); + pr_info("CPU: Disabling hardware prefetching (Errata 037)\n"); } } @@ -267,10 +274,6 @@ static void intel_workarounds(struct cpuinfo_x86 *c) } #endif -#ifdef CONFIG_X86_NUMAQ - numaq_tsc_disable(); -#endif - intel_smp_check(c); } #else diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 3656537..afa9f0d 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -47,45 +47,3 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) return NULL; } EXPORT_SYMBOL(x86_match_cpu); - -ssize_t arch_print_cpu_modalias(struct device *dev, - struct device_attribute *attr, - char *bufptr) -{ - int size = PAGE_SIZE; - int i, n; - char *buf = bufptr; - - n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:" - "model:%04X:feature:", - boot_cpu_data.x86_vendor, - boot_cpu_data.x86, - boot_cpu_data.x86_model); - size -= n; - buf += n; - size -= 1; - for (i = 0; i < NCAPINTS*32; i++) { - if (boot_cpu_has(i)) { - n = snprintf(buf, size, ",%04X", i); - if (n >= size) { - WARN(1, "x86 features overflow page\n"); - break; - } - size -= n; - buf += n; - } - } - *buf++ = '\n'; - return buf - bufptr; -} - -int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env) -{ - char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (buf) { - arch_print_cpu_modalias(NULL, NULL, buf); - add_uevent_var(env, "MODALIAS=%s", buf); - kfree(buf); - } - return 0; -} diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 9f7ca26..76f98fe 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -17,6 +17,7 @@ #include <linux/hardirq.h> #include <linux/efi.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv.h> @@ -26,10 +27,50 @@ #include <asm/irq_regs.h> #include <asm/i8259.h> #include <asm/apic.h> +#include <asm/timer.h> struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); +#if IS_ENABLED(CONFIG_HYPERV) +static void (*vmbus_handler)(void); + +void hyperv_vector_handler(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + irq_enter(); + exit_idle(); + + inc_irq_stat(irq_hv_callback_count); + if (vmbus_handler) + vmbus_handler(); + + irq_exit(); + set_irq_regs(old_regs); +} + +void hv_setup_vmbus_irq(void (*handler)(void)) +{ + vmbus_handler = handler; + /* + * Setup the IDT for hypervisor callback. Prevent reallocation + * at module reload. + */ + if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, + hyperv_callback_vector); +} + +void hv_remove_vmbus_irq(void) +{ + /* We have no way to deallocate the interrupt gate */ + vmbus_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); +EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); +#endif + static uint32_t __init ms_hyperv_platform(void) { u32 eax; @@ -105,6 +146,11 @@ static void __init ms_hyperv_init_platform(void) if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); + +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif + } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { @@ -113,41 +159,3 @@ const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .init_platform = ms_hyperv_init_platform, }; EXPORT_SYMBOL(x86_hyper_ms_hyperv); - -#if IS_ENABLED(CONFIG_HYPERV) -static int vmbus_irq = -1; -static irq_handler_t vmbus_isr; - -void hv_register_vmbus_handler(int irq, irq_handler_t handler) -{ - /* - * Setup the IDT for hypervisor callback. - */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); - - vmbus_irq = irq; - vmbus_isr = handler; -} - -void hyperv_vector_handler(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - struct irq_desc *desc; - - irq_enter(); - exit_idle(); - - desc = irq_to_desc(vmbus_irq); - - if (desc) - generic_handle_irq_desc(vmbus_irq, desc); - - irq_exit(); - set_irq_regs(old_regs); -} -#else -void hv_register_vmbus_handler(int irq, irq_handler_t handler) -{ -} -#endif -EXPORT_SYMBOL_GPL(hv_register_vmbus_handler); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 79f9f84..ae407f7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -892,7 +892,6 @@ static void x86_pmu_enable(struct pmu *pmu) * hw_perf_group_sched_in() or x86_pmu_enable() * * step1: save events moving to new counters - * step2: reprogram moved events into new counters */ for (i = 0; i < n_running; i++) { event = cpuc->event_list[i]; @@ -918,6 +917,9 @@ static void x86_pmu_enable(struct pmu *pmu) x86_pmu_stop(event, PERF_EF_UPDATE); } + /* + * step2: reprogram moved events into new counters + */ for (i = 0; i < cpuc->n_events; i++) { event = cpuc->event_list[i]; hwc = &event->hw; @@ -1043,7 +1045,7 @@ static int x86_pmu_add(struct perf_event *event, int flags) /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be performed - * at commit time (->commit_txn) as a whole + * at commit time (->commit_txn) as a whole. */ if (cpuc->group_flag & PERF_EVENT_TXN) goto done_collect; @@ -1058,6 +1060,10 @@ static int x86_pmu_add(struct perf_event *event, int flags) memcpy(cpuc->assign, assign, n*sizeof(int)); done_collect: + /* + * Commit the collect_events() state. See x86_pmu_del() and + * x86_pmu_*_txn(). + */ cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; @@ -1183,28 +1189,38 @@ static void x86_pmu_del(struct perf_event *event, int flags) * If we're called during a txn, we don't need to do anything. * The events never got scheduled and ->cancel_txn will truncate * the event_list. + * + * XXX assumes any ->del() called during a TXN will only be on + * an event added during that same TXN. */ if (cpuc->group_flag & PERF_EVENT_TXN) return; + /* + * Not a TXN, therefore cleanup properly. + */ x86_pmu_stop(event, PERF_EF_UPDATE); for (i = 0; i < cpuc->n_events; i++) { - if (event == cpuc->event_list[i]) { + if (event == cpuc->event_list[i]) + break; + } - if (i >= cpuc->n_events - cpuc->n_added) - --cpuc->n_added; + if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ + return; - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, event); + /* If we have a newly added event; make sure to decrease n_added. */ + if (i >= cpuc->n_events - cpuc->n_added) + --cpuc->n_added; - while (++i < cpuc->n_events) - cpuc->event_list[i-1] = cpuc->event_list[i]; + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, event); + + /* Delete the array entry. */ + while (++i < cpuc->n_events) + cpuc->event_list[i-1] = cpuc->event_list[i]; + --cpuc->n_events; - --cpuc->n_events; - break; - } - } perf_event_update_userpage(event); } @@ -1598,7 +1614,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) { __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); /* - * Truncate the collected events. + * Truncate collected array by the number of events added in this + * transaction. See x86_pmu_add() and x86_pmu_*_txn(). */ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); @@ -1609,6 +1626,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) * Commit group events scheduling transaction * Perform the group schedulability test as a whole * Return 0 if success + * + * Does not cancel the transaction on failure; expects the caller to do this. */ static int x86_pmu_commit_txn(struct pmu *pmu) { diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 4972c24..3b2f9bd 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -130,9 +130,11 @@ struct cpu_hw_events { unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; - int n_events; - int n_added; - int n_txn; + int n_events; /* the # of events in the below arrays */ + int n_added; /* the # last events in the below arrays; + they've never been enabled yet */ + int n_txn; /* the # last events in the below arrays; + added in the current transaction */ int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 28922f6..65bbbea 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -66,6 +66,47 @@ DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); +static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event); +static void uncore_pmu_event_read(struct perf_event *event); + +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static struct intel_uncore_box * +uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ + struct intel_uncore_box *box; + + box = *per_cpu_ptr(pmu->box, cpu); + if (box) + return box; + + raw_spin_lock(&uncore_box_lock); + list_for_each_entry(box, &pmu->box_list, list) { + if (box->phys_id == topology_physical_package_id(cpu)) { + atomic_inc(&box->refcnt); + *per_cpu_ptr(pmu->box, cpu) = box; + break; + } + } + raw_spin_unlock(&uncore_box_lock); + + return *per_cpu_ptr(pmu->box, cpu); +} + +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + /* + * perf core schedules event on the basis of cpu, uncore events are + * collected by one of the cpus inside a physical package. + */ + return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); +} + static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) { u64 count; @@ -1639,6 +1680,349 @@ static struct intel_uncore_type *snb_msr_uncores[] = { &snb_uncore_cbox, NULL, }; + +enum { + SNB_PCI_UNCORE_IMC, +}; + +static struct uncore_event_desc snb_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"), + INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"), + INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), + + { /* end: all zeroes */ }, +}; + +#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff +#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48 + +/* page size multiple covering all config regs */ +#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000 + +#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1 +#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 +#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE + +static struct attribute *snb_uncore_imc_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group snb_uncore_imc_format_group = { + .name = "format", + .attrs = snb_uncore_imc_formats_attr, +}; + +static void snb_uncore_imc_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; + resource_size_t addr; + u32 pci_dword; + + pci_read_config_dword(pdev, where, &pci_dword); + addr = pci_dword; + +#ifdef CONFIG_PHYS_ADDR_T_64BIT + pci_read_config_dword(pdev, where + 4, &pci_dword); + addr |= ((resource_size_t)pci_dword << 32); +#endif + + addr &= ~(PAGE_SIZE - 1); + + box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; +} + +static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_disable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); +} + +/* + * custom event_init() function because we define our own fixed, free + * running counters, so we do not want to conflict with generic uncore + * logic. Also simplifies processing + */ +static int snb_uncore_imc_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK; + int idx, base; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + + /* check only supported bits are set */ + if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK) + return -EINVAL; + + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + /* + * check event is known (whitelist, determines counter) + */ + switch (cfg) { + case SNB_UNCORE_PCI_IMC_DATA_READS: + base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; + idx = UNCORE_PMC_IDX_FIXED; + break; + case SNB_UNCORE_PCI_IMC_DATA_WRITES: + base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; + idx = UNCORE_PMC_IDX_FIXED + 1; + break; + default: + return -EINVAL; + } + + /* must be done before validate_group */ + event->hw.event_base = base; + event->hw.config = cfg; + event->hw.idx = idx; + + /* no group validation needed, we have free running counters */ + + return 0; +} + +static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + return 0; +} + +static void snb_uncore_imc_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + u64 count; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + box->n_active++; + + list_add_tail(&event->active_entry, &box->active_list); + + count = snb_uncore_imc_read_counter(box, event); + local64_set(&event->hw.prev_count, count); + + if (box->n_active == 1) + uncore_pmu_start_hrtimer(box); +} + +static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (!(hwc->state & PERF_HES_STOPPED)) { + box->n_active--; + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + list_del(&event->active_entry); + + if (box->n_active == 0) + uncore_pmu_cancel_hrtimer(box); + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int snb_uncore_imc_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (!box) + return -ENODEV; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + snb_uncore_imc_event_start(event, 0); + + box->n_events++; + + return 0; +} + +static void snb_uncore_imc_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + --box->n_events; + break; + } + } +} + +static int snb_pci2phy_map_init(int devid) +{ + struct pci_dev *dev = NULL; + int bus; + + dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); + if (!dev) + return -ENOTTY; + + bus = dev->bus->number; + + pcibus_to_physid[bus] = 0; + + pci_dev_put(dev); + + return 0; +} + +static struct pmu snb_uncore_imc_pmu = { + .task_ctx_nr = perf_invalid_context, + .event_init = snb_uncore_imc_event_init, + .add = snb_uncore_imc_event_add, + .del = snb_uncore_imc_event_del, + .start = snb_uncore_imc_event_start, + .stop = snb_uncore_imc_event_stop, + .read = uncore_pmu_event_read, +}; + +static struct intel_uncore_ops snb_uncore_imc_ops = { + .init_box = snb_uncore_imc_init_box, + .enable_box = snb_uncore_imc_enable_box, + .disable_box = snb_uncore_imc_disable_box, + .disable_event = snb_uncore_imc_disable_event, + .enable_event = snb_uncore_imc_enable_event, + .hw_config = snb_uncore_imc_hw_config, + .read_counter = snb_uncore_imc_read_counter, +}; + +static struct intel_uncore_type snb_uncore_imc = { + .name = "imc", + .num_counters = 2, + .num_boxes = 1, + .fixed_ctr_bits = 32, + .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE, + .event_descs = snb_uncore_imc_events, + .format_group = &snb_uncore_imc_format_group, + .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE, + .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK, + .ops = &snb_uncore_imc_ops, + .pmu = &snb_uncore_imc_pmu, +}; + +static struct intel_uncore_type *snb_pci_uncores[] = { + [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static struct pci_driver snb_uncore_pci_driver = { + .name = "snb_uncore", + .id_table = snb_uncore_pci_ids, +}; + +static struct pci_driver ivb_uncore_pci_driver = { + .name = "ivb_uncore", + .id_table = ivb_uncore_pci_ids, +}; + +static struct pci_driver hsw_uncore_pci_driver = { + .name = "hsw_uncore", + .id_table = hsw_uncore_pci_ids, +}; + /* end of Sandy Bridge uncore support */ /* Nehalem uncore support */ @@ -2789,6 +3173,7 @@ again: static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) { struct intel_uncore_box *box; + struct perf_event *event; unsigned long flags; int bit; @@ -2801,19 +3186,27 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) */ local_irq_save(flags); + /* + * handle boxes with an active event list as opposed to active + * counters + */ + list_for_each_entry(event, &box->active_list, active_entry) { + uncore_perf_event_update(box, event); + } + for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) uncore_perf_event_update(box, box->events[bit]); local_irq_restore(flags); - hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL)); + hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration)); return HRTIMER_RESTART; } static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) { __hrtimer_start_range_ns(&box->hrtimer, - ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0, + ns_to_ktime(box->hrtimer_duration), 0, HRTIMER_MODE_REL_PINNED, 0); } @@ -2847,43 +3240,12 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, box->cpu = -1; box->phys_id = -1; - return box; -} + /* set default hrtimer timeout */ + box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; -static struct intel_uncore_box * -uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) -{ - struct intel_uncore_box *box; + INIT_LIST_HEAD(&box->active_list); - box = *per_cpu_ptr(pmu->box, cpu); - if (box) - return box; - - raw_spin_lock(&uncore_box_lock); - list_for_each_entry(box, &pmu->box_list, list) { - if (box->phys_id == topology_physical_package_id(cpu)) { - atomic_inc(&box->refcnt); - *per_cpu_ptr(pmu->box, cpu) = box; - break; - } - } - raw_spin_unlock(&uncore_box_lock); - - return *per_cpu_ptr(pmu->box, cpu); -} - -static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) -{ - return container_of(event->pmu, struct intel_uncore_pmu, pmu); -} - -static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) -{ - /* - * perf core schedules event on the basis of cpu, uncore events are - * collected by one of the cpus inside a physical package. - */ - return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); + return box; } static int @@ -3279,16 +3641,21 @@ static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) { int ret; - pmu->pmu = (struct pmu) { - .attr_groups = pmu->type->attr_groups, - .task_ctx_nr = perf_invalid_context, - .event_init = uncore_pmu_event_init, - .add = uncore_pmu_event_add, - .del = uncore_pmu_event_del, - .start = uncore_pmu_event_start, - .stop = uncore_pmu_event_stop, - .read = uncore_pmu_event_read, - }; + if (!pmu->type->pmu) { + pmu->pmu = (struct pmu) { + .attr_groups = pmu->type->attr_groups, + .task_ctx_nr = perf_invalid_context, + .event_init = uncore_pmu_event_init, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, + .start = uncore_pmu_event_start, + .stop = uncore_pmu_event_stop, + .read = uncore_pmu_event_read, + }; + } else { + pmu->pmu = *pmu->type->pmu; + pmu->pmu.attr_groups = pmu->type->attr_groups; + } if (pmu->type->num_boxes == 1) { if (strlen(pmu->type->name) > 0) @@ -3502,6 +3869,28 @@ static int __init uncore_pci_init(void) pci_uncores = ivt_pci_uncores; uncore_pci_driver = &ivt_uncore_pci_driver; break; + case 42: /* Sandy Bridge */ + ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC); + if (ret) + return ret; + pci_uncores = snb_pci_uncores; + uncore_pci_driver = &snb_uncore_pci_driver; + break; + case 58: /* Ivy Bridge */ + ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC); + if (ret) + return ret; + pci_uncores = snb_pci_uncores; + uncore_pci_driver = &ivb_uncore_pci_driver; + break; + case 60: /* Haswell */ + case 69: /* Haswell Celeron */ + ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC); + if (ret) + return ret; + pci_uncores = snb_pci_uncores; + uncore_pci_driver = &hsw_uncore_pci_driver; + break; default: return 0; } @@ -3773,7 +4162,7 @@ static void __init uncore_cpu_setup(void *dummy) static int __init uncore_cpu_init(void) { - int ret, cpu, max_cores; + int ret, max_cores; max_cores = boot_cpu_data.x86_max_cores; switch (boot_cpu_data.x86_model) { @@ -3817,29 +4206,6 @@ static int __init uncore_cpu_init(void) if (ret) return ret; - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) { - int i, phys_id = topology_physical_package_id(cpu); - - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) { - phys_id = -1; - break; - } - } - if (phys_id < 0) - continue; - - uncore_cpu_prepare(cpu, phys_id); - uncore_event_init_cpu(cpu); - } - on_each_cpu(uncore_cpu_setup, NULL, 1); - - __register_cpu_notifier(&uncore_cpu_nb); - - cpu_notifier_register_done(); - return 0; } @@ -3868,6 +4234,41 @@ static int __init uncore_pmus_register(void) return 0; } +static void __init uncore_cpumask_init(void) +{ + int cpu; + + /* + * ony invoke once from msr or pci init code + */ + if (!cpumask_empty(&uncore_cpu_mask)) + return; + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + int i, phys_id = topology_physical_package_id(cpu); + + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) { + phys_id = -1; + break; + } + } + if (phys_id < 0) + continue; + + uncore_cpu_prepare(cpu, phys_id); + uncore_event_init_cpu(cpu); + } + on_each_cpu(uncore_cpu_setup, NULL, 1); + + __register_cpu_notifier(&uncore_cpu_nb); + + cpu_notifier_register_done(); +} + + static int __init intel_uncore_init(void) { int ret; @@ -3886,6 +4287,7 @@ static int __init intel_uncore_init(void) uncore_pci_exit(); goto fail; } + uncore_cpumask_init(); uncore_pmus_register(); return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index a80ab71..90236f0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -6,6 +6,7 @@ #define UNCORE_PMU_NAME_LEN 32 #define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) +#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC) #define UNCORE_FIXED_EVENT 0xff #define UNCORE_PMC_IDX_MAX_GENERIC 8 @@ -440,6 +441,7 @@ struct intel_uncore_type { struct intel_uncore_ops *ops; struct uncore_event_desc *event_descs; const struct attribute_group *attr_groups[4]; + struct pmu *pmu; /* for custom pmu ops */ }; #define pmu_group attr_groups[0] @@ -488,8 +490,11 @@ struct intel_uncore_box { u64 tags[UNCORE_PMC_IDX_MAX]; struct pci_dev *pci_dev; struct intel_uncore_pmu *pmu; + u64 hrtimer_duration; /* hrtimer timeout for this box */ struct hrtimer hrtimer; struct list_head list; + struct list_head active_list; + void *io_addr; struct intel_uncore_extra_reg shared_regs[0]; }; diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 3486e66..5d466b7 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1257,7 +1257,24 @@ again: pass++; goto again; } - + /* + * Perf does test runs to see if a whole group can be assigned + * together succesfully. There can be multiple rounds of this. + * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config + * bits, such that the next round of group assignments will + * cause the above p4_should_swap_ts to pass instead of fail. + * This leads to counters exclusive to thread0 being used by + * thread1. + * + * Solve this with a cheap hack, reset the idx back to -1 to + * force a new lookup (p4_next_cntr) to get the right counter + * for the right thread. + * + * This probably doesn't comply with the general spirit of how + * perf wants to work, but P4 is special. :-( + */ + if (p4_should_swap_ts(hwc->config, cpu)) + hwc->idx = -1; p4_pmu_swap_config_ts(hwc, cpu); if (assign) assign[i] = cntr_idx; @@ -1322,6 +1339,7 @@ static __initconst const struct x86_pmu p4_pmu = { __init int p4_pmu_init(void) { unsigned int low, high; + int i, reg; /* If we get stripped -- indexing fails */ BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); @@ -1340,5 +1358,19 @@ __init int p4_pmu_init(void) x86_pmu = p4_pmu; + /* + * Even though the counters are configured to interrupt a particular + * logical processor when an overflow happens, testing has shown that + * on kdump kernels (which uses a single cpu), thread1's counter + * continues to run and will report an NMI on thread0. Due to the + * overflow bug, this leads to a stream of unknown NMIs. + * + * Solve this by zero'ing out the registers to mimic a reset. + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu_config_addr(i); + wrmsrl_safe(reg, 0ULL); + } + return 0; } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index a57902e..507de80 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -57,9 +57,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) { #ifdef CONFIG_X86_32 struct pt_regs fixed_regs; -#endif -#ifdef CONFIG_X86_32 if (!user_mode_vm(regs)) { crash_fixup_ss_esp(&fixed_regs, regs); regs = &fixed_regs; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index f2a1770..5abd4cd 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,12 +16,35 @@ #include <asm/stacktrace.h> +static void *is_irq_stack(void *p, void *irq) +{ + if (p < irq || p >= (irq + THREAD_SIZE)) + return NULL; + return irq + THREAD_SIZE; +} + + +static void *is_hardirq_stack(unsigned long *stack, int cpu) +{ + void *irq = per_cpu(hardirq_stack, cpu); + + return is_irq_stack(stack, irq); +} + +static void *is_softirq_stack(unsigned long *stack, int cpu) +{ + void *irq = per_cpu(softirq_stack, cpu); + + return is_irq_stack(stack, irq); +} void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { + const unsigned cpu = get_cpu(); int graph = 0; + u32 *prev_esp; if (!task) task = current; @@ -30,7 +53,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long dummy; stack = &dummy; - if (task && task != current) + if (task != current) stack = (unsigned long *)task->thread.sp; } @@ -39,18 +62,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, for (;;) { struct thread_info *context; + void *end_stack; + + end_stack = is_hardirq_stack(stack, cpu); + if (!end_stack) + end_stack = is_softirq_stack(stack, cpu); - context = (struct thread_info *) - ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph); + context = task_thread_info(task); + bp = ops->walk_stack(context, stack, bp, ops, data, + end_stack, &graph); - stack = (unsigned long *)context->previous_esp; + /* Stop if not on irq stack */ + if (!end_stack) + break; + + /* The previous esp is saved on the bottom of the stack */ + prev_esp = (u32 *)(end_stack - THREAD_SIZE); + stack = (unsigned long *)*prev_esp; if (!stack) break; + if (ops->stack(data, "IRQ") < 0) break; touch_nmi_watchdog(); } + put_cpu(); } EXPORT_SYMBOL(dump_trace); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index addb207..1abcb50 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -104,6 +104,44 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack, return (stack >= irq_stack && stack < irq_stack_end); } +static const unsigned long irq_stack_size = + (IRQ_STACK_SIZE - 64) / sizeof(unsigned long); + +enum stack_type { + STACK_IS_UNKNOWN, + STACK_IS_NORMAL, + STACK_IS_EXCEPTION, + STACK_IS_IRQ, +}; + +static enum stack_type +analyze_stack(int cpu, struct task_struct *task, unsigned long *stack, + unsigned long **stack_end, unsigned long *irq_stack, + unsigned *used, char **id) +{ + unsigned long addr; + + addr = ((unsigned long)stack & (~(THREAD_SIZE - 1))); + if ((unsigned long)task_stack_page(task) == addr) + return STACK_IS_NORMAL; + + *stack_end = in_exception_stack(cpu, (unsigned long)stack, + used, id); + if (*stack_end) + return STACK_IS_EXCEPTION; + + if (!irq_stack) + return STACK_IS_NORMAL; + + *stack_end = irq_stack; + irq_stack = irq_stack - irq_stack_size; + + if (in_irq_stack(stack, irq_stack, *stack_end)) + return STACK_IS_IRQ; + + return STACK_IS_UNKNOWN; +} + /* * x86-64 can have up to three kernel stacks: * process stack @@ -116,12 +154,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); - unsigned long *irq_stack_end = - (unsigned long *)per_cpu(irq_stack_ptr, cpu); - unsigned used = 0; struct thread_info *tinfo; - int graph = 0; + unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu); unsigned long dummy; + unsigned used = 0; + int graph = 0; + int done = 0; if (!task) task = current; @@ -143,49 +181,61 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * exceptions */ tinfo = task_thread_info(task); - for (;;) { + while (!done) { + unsigned long *stack_end; + enum stack_type stype; char *id; - unsigned long *estack_end; - estack_end = in_exception_stack(cpu, (unsigned long)stack, - &used, &id); - if (estack_end) { + stype = analyze_stack(cpu, task, stack, &stack_end, + irq_stack, &used, &id); + + /* Default finish unless specified to continue */ + done = 1; + + switch (stype) { + + /* Break out early if we are on the thread stack */ + case STACK_IS_NORMAL: + break; + + case STACK_IS_EXCEPTION: + if (ops->stack(data, id) < 0) break; bp = ops->walk_stack(tinfo, stack, bp, ops, - data, estack_end, &graph); + data, stack_end, &graph); ops->stack(data, "<EOE>"); /* * We link to the next stack via the * second-to-last pointer (index -2 to end) in the * exception stack: */ - stack = (unsigned long *) estack_end[-2]; - continue; - } - if (irq_stack_end) { - unsigned long *irq_stack; - irq_stack = irq_stack_end - - (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); - - if (in_irq_stack(stack, irq_stack, irq_stack_end)) { - if (ops->stack(data, "IRQ") < 0) - break; - bp = ops->walk_stack(tinfo, stack, bp, - ops, data, irq_stack_end, &graph); - /* - * We link to the next stack (which would be - * the process stack normally) the last - * pointer (index -1 to end) in the IRQ stack: - */ - stack = (unsigned long *) (irq_stack_end[-1]); - irq_stack_end = NULL; - ops->stack(data, "EOI"); - continue; - } + stack = (unsigned long *) stack_end[-2]; + done = 0; + break; + + case STACK_IS_IRQ: + + if (ops->stack(data, "IRQ") < 0) + break; + bp = ops->walk_stack(tinfo, stack, bp, + ops, data, stack_end, &graph); + /* + * We link to the next stack (which would be + * the process stack normally) the last + * pointer (index -1 to end) in the IRQ stack: + */ + stack = (unsigned long *) (stack_end[-1]); + irq_stack = NULL; + ops->stack(data, "EOI"); + done = 0; + break; + + case STACK_IS_UNKNOWN: + ops->stack(data, "UNK"); + break; } - break; } /* diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index bc4a088..6d7d5a1 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -203,18 +203,15 @@ static void __init intel_remapping_check(int num, int slot, int func) revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID); /* - * Revision 13 of all triggering devices id in this quirk have - * a problem draining interrupts when irq remapping is enabled, - * and should be flagged as broken. Additionally revisions 0x12 - * and 0x22 of device id 0x3405 has this problem. + * Revision <= 13 of all triggering devices id in this quirk + * have a problem draining interrupts when irq remapping is + * enabled, and should be flagged as broken. Additionally + * revision 0x22 of device id 0x3405 has this problem. */ - if (revision == 0x13) + if (revision <= 0x13) set_irq_remapping_broken(); - else if ((device == 0x3405) && - ((revision == 0x12) || - (revision == 0x22))) + else if (device == 0x3405 && revision == 0x22) set_irq_remapping_broken(); - } /* diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index e625319..52819e8 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -308,7 +308,10 @@ static int ftrace_write(unsigned long ip, const char *val, int size) if (within(ip, (unsigned long)_text, (unsigned long)_etext)) ip = (unsigned long)__va(__pa_symbol(ip)); - return probe_kernel_write((void *)ip, val, size); + if (probe_kernel_write((void *)ip, val, size)) + return -EPERM; + + return 0; } static int add_break(unsigned long ip, const char *old) @@ -323,10 +326,7 @@ static int add_break(unsigned long ip, const char *old) if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) return -EINVAL; - if (ftrace_write(ip, &brk, 1)) - return -EPERM; - - return 0; + return ftrace_write(ip, &brk, 1); } static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) @@ -425,7 +425,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec) /* If this does not have a breakpoint, we are done */ if (ins[0] != brk) - return -1; + return 0; nop = ftrace_nop_replace(); @@ -455,7 +455,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec) } update: - return probe_kernel_write((void *)ip, &nop[0], 1); + return ftrace_write(ip, nop, 1); } static int add_update_code(unsigned long ip, unsigned const char *new) @@ -463,9 +463,7 @@ static int add_update_code(unsigned long ip, unsigned const char *new) /* skip breakpoint */ ip++; new++; - if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1)) - return -EPERM; - return 0; + return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1); } static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) @@ -520,10 +518,7 @@ static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) new = ftrace_call_replace(ip, addr); - if (ftrace_write(ip, new, 1)) - return -EPERM; - - return 0; + return ftrace_write(ip, new, 1); } static int finish_update_nop(struct dyn_ftrace *rec) @@ -533,9 +528,7 @@ static int finish_update_nop(struct dyn_ftrace *rec) new = ftrace_nop_replace(); - if (ftrace_write(ip, new, 1)) - return -EPERM; - return 0; + return ftrace_write(ip, new, 1); } static int finish_update(struct dyn_ftrace *rec, int enable) @@ -632,8 +625,14 @@ void ftrace_replace_code(int enable) printk(KERN_WARNING "Failed on %s (%d):\n", report, count); for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); - remove_breakpoint(rec); + /* + * Breakpoints are handled only when this function is in + * progress. The system could not work with them. + */ + if (remove_breakpoint(rec)) + BUG(); } + run_sync(); } static int @@ -655,16 +654,19 @@ ftrace_modify_code(unsigned long ip, unsigned const char *old_code, run_sync(); ret = ftrace_write(ip, new_code, 1); - if (ret) { - ret = -EPERM; - goto out; - } - run_sync(); + /* + * The breakpoint is handled only when this function is in progress. + * The system could not work if we could not remove it. + */ + BUG_ON(ret); out: + run_sync(); return ret; fail_update: - probe_kernel_write((void *)ip, &old_code[0], 1); + /* Also here the system could not work with the breakpoint */ + if (ftrace_write(ip, old_code, 1)) + BUG(); goto out; } @@ -678,11 +680,8 @@ void arch_ftrace_update_code(int command) atomic_dec(&modifying_ftrace_code); } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { - /* The return code is retured via data */ - *(unsigned long *)data = 0; - return 0; } #endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index d89382b..8d80ae0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -521,7 +521,7 @@ static int hpet_setup_irq(struct hpet_dev *dev) { if (request_irq(dev->irq, hpet_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + IRQF_TIMER | IRQF_NOBALANCING, dev->name, dev)) return -1; @@ -699,7 +699,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, /* FIXME: add schedule_work_on() */ schedule_delayed_work_on(cpu, &work.work, 0); wait_for_completion(&work.complete); - destroy_timer_on_stack(&work.work.timer); + destroy_delayed_work_on_stack(&work.work); break; case CPU_DEAD: if (hdev) { @@ -752,9 +752,7 @@ static struct clocksource clocksource_hpet = { .mask = HPET_MASK, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, -#ifdef CONFIG_X86_64 .archdata = { .vclock_mode = VCLOCK_HPET }, -#endif }; static int hpet_clocksource_register(void) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d99f31d..42805fa 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -125,6 +125,12 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); seq_printf(p, " Machine check polls\n"); #endif +#if defined(CONFIG_HYPERV) || defined(CONFIG_XEN) + seq_printf(p, "%*s: ", prec, "THR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); + seq_printf(p, " Hypervisor callback interrupts\n"); +#endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index d7fcbed..63ce838 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -55,16 +55,8 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -/* - * per-CPU IRQ handling contexts (thread information and stack) - */ -union irq_ctx { - struct thread_info tinfo; - u32 stack[THREAD_SIZE/sizeof(u32)]; -} __attribute__((aligned(THREAD_SIZE))); - -static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); -static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack); +DEFINE_PER_CPU(struct irq_stack *, softirq_stack); static void call_on_stack(void *func, void *stack) { @@ -77,14 +69,26 @@ static void call_on_stack(void *func, void *stack) : "memory", "cc", "edx", "ecx", "eax"); } +/* how to get the current stack pointer from C */ +#define current_stack_pointer ({ \ + unsigned long sp; \ + asm("mov %%esp,%0" : "=g" (sp)); \ + sp; \ +}) + +static inline void *current_stack(void) +{ + return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); +} + static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { - union irq_ctx *curctx, *irqctx; - u32 *isp, arg1, arg2; + struct irq_stack *curstk, *irqstk; + u32 *isp, *prev_esp, arg1, arg2; - curctx = (union irq_ctx *) current_thread_info(); - irqctx = __this_cpu_read(hardirq_ctx); + curstk = (struct irq_stack *) current_stack(); + irqstk = __this_cpu_read(hardirq_stack); /* * this is where we switch to the IRQ stack. However, if we are @@ -92,13 +96,14 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) * handler) we can't do that and just have to keep using the * current stack (which is the irq stack already after all) */ - if (unlikely(curctx == irqctx)) + if (unlikely(curstk == irqstk)) return 0; - /* build the stack frame on the IRQ stack */ - isp = (u32 *) ((char *)irqctx + sizeof(*irqctx)); - irqctx->tinfo.task = curctx->tinfo.task; - irqctx->tinfo.previous_esp = current_stack_pointer; + isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); + + /* Save the next esp at the bottom of the stack */ + prev_esp = (u32 *)irqstk; + *prev_esp = current_stack_pointer; if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -118,46 +123,40 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) */ void irq_ctx_init(int cpu) { - union irq_ctx *irqctx; + struct irq_stack *irqstk; - if (per_cpu(hardirq_ctx, cpu)) + if (per_cpu(hardirq_stack, cpu)) return; - irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), + irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), THREADINFO_GFP, THREAD_SIZE_ORDER)); - memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); - irqctx->tinfo.cpu = cpu; - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - - per_cpu(hardirq_ctx, cpu) = irqctx; + per_cpu(hardirq_stack, cpu) = irqstk; - irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), + irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), THREADINFO_GFP, THREAD_SIZE_ORDER)); - memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); - irqctx->tinfo.cpu = cpu; - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - - per_cpu(softirq_ctx, cpu) = irqctx; + per_cpu(softirq_stack, cpu) = irqstk; printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); + cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu)); } void do_softirq_own_stack(void) { - struct thread_info *curctx; - union irq_ctx *irqctx; - u32 *isp; + struct thread_info *curstk; + struct irq_stack *irqstk; + u32 *isp, *prev_esp; - curctx = current_thread_info(); - irqctx = __this_cpu_read(softirq_ctx); - irqctx->tinfo.task = curctx->task; - irqctx->tinfo.previous_esp = current_stack_pointer; + curstk = current_stack(); + irqstk = __this_cpu_read(softirq_stack); /* build the stack frame on the softirq stack */ - isp = (u32 *) ((char *)irqctx + sizeof(*irqctx)); + isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); + + /* Push the previous esp onto the stack */ + prev_esp = (u32 *)irqstk; + *prev_esp = current_stack_pointer; call_on_stack(__do_softirq, isp); } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 713f1b3..0331cb3 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -417,7 +417,6 @@ void kvm_disable_steal_time(void) #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { - WARN_ON(kvm_register_clock("primary cpu clock")); kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); kvm_spinlock_init(); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e604109..d9156ce 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -242,7 +242,7 @@ void __init kvmclock_init(void) hv_clock = __va(mem); memset(hv_clock, 0, size); - if (kvm_register_clock("boot clock")) { + if (kvm_register_clock("primary cpu clock")) { hv_clock = NULL; memblock_free(mem, size); return; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 18be1893..e69f988 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -28,6 +28,7 @@ #include <linux/mm.h> #include <linux/gfp.h> #include <linux/jump_label.h> +#include <linux/random.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -43,13 +44,52 @@ do { \ } while (0) #endif +#ifdef CONFIG_RANDOMIZE_BASE +static unsigned long module_load_offset; +static int randomize_modules = 1; + +/* Mutex protects the module_load_offset. */ +static DEFINE_MUTEX(module_kaslr_mutex); + +static int __init parse_nokaslr(char *p) +{ + randomize_modules = 0; + return 0; +} +early_param("nokaslr", parse_nokaslr); + +static unsigned long int get_module_load_offset(void) +{ + if (randomize_modules) { + mutex_lock(&module_kaslr_mutex); + /* + * Calculate the module_load_offset the first time this + * code is called. Once calculated it stays the same until + * reboot. + */ + if (module_load_offset == 0) + module_load_offset = + (get_random_int() % 1024 + 1) * PAGE_SIZE; + mutex_unlock(&module_kaslr_mutex); + } + return module_load_offset; +} +#else +static unsigned long int get_module_load_offset(void) +{ + return 0; +} +#endif + void *module_alloc(unsigned long size) { if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, - NUMA_NO_NODE, __builtin_return_address(0)); + return __vmalloc_node_range(size, 1, + MODULES_VADDR + get_module_load_offset(), + MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, + PAGE_KERNEL_EXEC, NUMA_NO_NODE, + __builtin_return_address(0)); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 6fcb49c..b4872b9 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -87,6 +87,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic); #define nmi_to_desc(type) (&nmi_desc[type]) static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC; + static int __init nmi_warning_debugfs(void) { debugfs_create_u64("nmi_longest_ns", 0644, @@ -95,6 +96,20 @@ static int __init nmi_warning_debugfs(void) } fs_initcall(nmi_warning_debugfs); +static void nmi_max_handler(struct irq_work *w) +{ + struct nmiaction *a = container_of(w, struct nmiaction, irq_work); + int remainder_ns, decimal_msecs; + u64 whole_msecs = ACCESS_ONCE(a->max_duration); + + remainder_ns = do_div(whole_msecs, (1000 * 1000)); + decimal_msecs = remainder_ns / 1000; + + printk_ratelimited(KERN_INFO + "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", + a->handler, whole_msecs, decimal_msecs); +} + static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) { struct nmi_desc *desc = nmi_to_desc(type); @@ -110,26 +125,20 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2 * to handle those situations. */ list_for_each_entry_rcu(a, &desc->head, list) { - u64 before, delta, whole_msecs; - int remainder_ns, decimal_msecs, thishandled; + int thishandled; + u64 delta; - before = sched_clock(); + delta = sched_clock(); thishandled = a->handler(type, regs); handled += thishandled; - delta = sched_clock() - before; + delta = sched_clock() - delta; trace_nmi_handler(a->handler, (int)delta, thishandled); - if (delta < nmi_longest_ns) + if (delta < nmi_longest_ns || delta < a->max_duration) continue; - nmi_longest_ns = delta; - whole_msecs = delta; - remainder_ns = do_div(whole_msecs, (1000 * 1000)); - decimal_msecs = remainder_ns / 1000; - printk_ratelimited(KERN_INFO - "INFO: NMI handler (%ps) took too long to run: " - "%lld.%03d msecs\n", a->handler, whole_msecs, - decimal_msecs); + a->max_duration = delta; + irq_work_queue(&a->irq_work); } rcu_read_unlock(); @@ -146,6 +155,8 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) if (!action->handler) return -EINVAL; + init_irq_work(&action->irq_work, nmi_max_handler); + spin_lock_irqsave(&desc->lock, flags); /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3fb8d95..4505e2a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -298,10 +298,7 @@ void arch_cpu_idle_dead(void) */ void arch_cpu_idle(void) { - if (cpuidle_idle_call()) - x86_idle(); - else - local_irq_enable(); + x86_idle(); } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0de43e9..7bc86bb 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -314,6 +314,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); + this_cpu_write(kernel_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - KERNEL_STACK_OFFSET); + /* * Restore %gs if needed (which is common) */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7461f50..678c0ad 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -184,14 +184,14 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs) { unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1); unsigned long sp = (unsigned long)®s->sp; - struct thread_info *tinfo; + u32 *prev_esp; if (context == (sp & ~(THREAD_SIZE - 1))) return sp; - tinfo = (struct thread_info *)context; - if (tinfo->previous_esp) - return tinfo->previous_esp; + prev_esp = (u32 *)(context); + if (prev_esp) + return (unsigned long)prev_esp; return (unsigned long)regs; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index c752cb4..654b465 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -464,9 +464,12 @@ void __attribute__((weak)) mach_reboot_fixups(void) * 2) If still alive, write to the keyboard controller * 3) If still alive, write to the ACPI reboot register again * 4) If still alive, write to the keyboard controller again + * 5) If still alive, call the EFI runtime service to reboot + * 6) If still alive, write to the PCI IO port 0xCF9 to reboot + * 7) If still alive, inform BIOS to do a proper reboot * * If the machine is still alive at this stage, it gives up. We default to - * following the same pattern, except that if we're still alive after (4) we'll + * following the same pattern, except that if we're still alive after (7) we'll * try to force a triple fault and then cycle between hitting the keyboard * controller and doing that */ @@ -502,7 +505,7 @@ static void native_machine_emergency_restart(void) attempt = 1; reboot_type = BOOT_ACPI; } else { - reboot_type = BOOT_TRIPLE; + reboot_type = BOOT_EFI; } break; @@ -510,13 +513,15 @@ static void native_machine_emergency_restart(void) load_idt(&no_idt); __asm__ __volatile__("int3"); + /* We're probably dead after this, but... */ reboot_type = BOOT_KBD; break; case BOOT_BIOS: machine_real_restart(MRR_BIOS); - reboot_type = BOOT_KBD; + /* We're probably dead after this, but... */ + reboot_type = BOOT_TRIPLE; break; case BOOT_ACPI: @@ -530,7 +535,7 @@ static void native_machine_emergency_restart(void) EFI_RESET_WARM : EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); - reboot_type = BOOT_KBD; + reboot_type = BOOT_CF9_COND; break; case BOOT_CF9: @@ -548,7 +553,7 @@ static void native_machine_emergency_restart(void) outb(cf9|reboot_code, 0xcf9); udelay(50); } - reboot_type = BOOT_KBD; + reboot_type = BOOT_BIOS; break; } } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ce72964..09c76d2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -869,7 +869,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); - visws_early_detect(); /* * copy kernel address range established so far and switch @@ -926,11 +925,11 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, "EL32", 4)) { - set_bit(EFI_BOOT, &x86_efi_facility); + set_bit(EFI_BOOT, &efi.flags); } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, "EL64", 4)) { - set_bit(EFI_BOOT, &x86_efi_facility); - set_bit(EFI_64BIT, &x86_efi_facility); + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); } if (efi_enabled(EFI_BOOT)) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a32da80..3482693 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -122,8 +122,9 @@ static void smp_callin(void) * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI. */ cpuid = smp_processor_id(); - if (apic->wait_for_init_deassert && cpuid != 0) - apic->wait_for_init_deassert(&init_deasserted); + if (apic->wait_for_init_deassert && cpuid) + while (!atomic_read(&init_deasserted)) + cpu_relax(); /* * (This works even if the APIC is not enabled.) @@ -701,11 +702,15 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, int id; int boot_error; + preempt_disable(); + /* * Wake up AP by INIT, INIT, STARTUP sequence. */ - if (cpu) - return wakeup_secondary_cpu_via_init(apicid, start_ip); + if (cpu) { + boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); + goto out; + } /* * Wake up BSP by nmi. @@ -725,6 +730,9 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); } +out: + preempt_enable(); + return boot_error; } @@ -758,10 +766,10 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) #else clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); +#endif per_cpu(kernel_stack, cpu) = (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; -#endif early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; stack_start = idle->thread.sp; @@ -1379,7 +1387,7 @@ static inline void mwait_play_dead(void) if (!this_cpu_has(X86_FEATURE_MWAIT)) return; - if (!this_cpu_has(X86_FEATURE_CLFLSH)) + if (!this_cpu_has(X86_FEATURE_CLFLUSH)) return; if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) return; diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 24d3c91..bf7ef5c 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -23,7 +23,7 @@ #include <asm/time.h> #ifdef CONFIG_X86_64 -DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES; +__visible DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES; #endif unsigned long profile_pc(struct pt_regs *regs) @@ -62,7 +62,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, .name = "timer" }; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cfbe99f..57e5ce1 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -914,8 +914,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, tsc_khz_ref = tsc_khz; } if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); @@ -985,9 +984,7 @@ static struct clocksource clocksource_tsc = { .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, -#ifdef CONFIG_X86_64 .archdata = { .vclock_mode = VCLOCK_TSC }, -#endif }; void mark_tsc_unstable(char *reason) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index da6b35a..49edf2d 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -147,7 +147,6 @@ SECTIONS _edata = .; } :data -#ifdef CONFIG_X86_64 . = ALIGN(PAGE_SIZE); __vvar_page = .; @@ -165,12 +164,15 @@ SECTIONS #undef __VVAR_KERNEL_LDS #undef EMIT_VVAR + /* + * Pad the rest of the page with zeros. Otherwise the loader + * can leave garbage here. + */ + . = __vvar_beginning_hack + PAGE_SIZE; } :data . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); -#endif /* CONFIG_X86_64 */ - /* Init code and data - will be freed after init */ . = ALIGN(PAGE_SIZE); .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 556eaf2..8b3b3eb 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -47,14 +47,12 @@ #include <asm/segment.h> #include <asm/desc.h> #include <asm/topology.h> -#include <asm/vgtod.h> #include <asm/traps.h> #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" DEFINE_VVAR(int, vgetcpu_mode); -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; @@ -77,48 +75,6 @@ static int __init vsyscall_setup(char *str) } early_param("vsyscall", vsyscall_setup); -void update_vsyscall_tz(void) -{ - vsyscall_gtod_data.sys_tz = sys_tz; -} - -void update_vsyscall(struct timekeeper *tk) -{ - struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; - - write_seqcount_begin(&vdata->seq); - - /* copy vsyscall data */ - vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; - vdata->clock.cycle_last = tk->clock->cycle_last; - vdata->clock.mask = tk->clock->mask; - vdata->clock.mult = tk->mult; - vdata->clock.shift = tk->shift; - - vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->xtime_nsec; - - vdata->monotonic_time_sec = tk->xtime_sec - + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->xtime_nsec - + (tk->wall_to_monotonic.tv_nsec - << tk->shift); - while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->shift)) { - vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->shift; - vdata->monotonic_time_sec++; - } - - vdata->wall_time_coarse.tv_sec = tk->xtime_sec; - vdata->wall_time_coarse.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); - - vdata->monotonic_time_coarse = timespec_add(vdata->wall_time_coarse, - tk->wall_to_monotonic); - - write_seqcount_end(&vdata->seq); -} - static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { @@ -374,7 +330,6 @@ void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - extern char __vvar_page; unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c new file mode 100644 index 0000000..f9c6e56 --- /dev/null +++ b/arch/x86/kernel/vsyscall_gtod.c @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Modified for x86 32 bit architecture by + * Stefani Seibold <stefani@seibold.net> + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. + * + */ + +#include <linux/timekeeper_internal.h> +#include <asm/vgtod.h> +#include <asm/vvar.h> + +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); + +void update_vsyscall_tz(void) +{ + vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; + vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + + gtod_write_begin(vdata); + + /* copy vsyscall data */ + vdata->vclock_mode = tk->clock->archdata.vclock_mode; + vdata->cycle_last = tk->clock->cycle_last; + vdata->mask = tk->clock->mask; + vdata->mult = tk->mult; + vdata->shift = tk->shift; + + vdata->wall_time_sec = tk->xtime_sec; + vdata->wall_time_snsec = tk->xtime_nsec; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->xtime_nsec + + (tk->wall_to_monotonic.tv_nsec + << tk->shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->shift; + vdata->monotonic_time_sec++; + } + + vdata->wall_time_coarse_sec = tk->xtime_sec; + vdata->wall_time_coarse_nsec = (long)(tk->xtime_nsec >> tk->shift); + + vdata->monotonic_time_coarse_sec = + vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_coarse_nsec = + vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + + while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { + vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; + vdata->monotonic_time_coarse_sec++; + } + + gtod_write_end(vdata); +} |