From cbf74cea070fa1f705de4712e25d9e56ae6543c7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 30 May 2011 16:31:11 +0200 Subject: oprofile, x86: Add comments to IBS LVT offset initialization Adding a comment in the code as IBS LVT setup is not obvious at all ... Signed-off-by: Robert Richter --- arch/x86/kernel/apic/apic.c | 3 ++- arch/x86/oprofile/op_model_amd.c | 13 +++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index fabf01e..a0bf78a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -390,7 +390,8 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new) /* * If mask=1, the LVT entry does not generate interrupts while mask=0 - * enables the vector. See also the BKDGs. + * enables the vector. See also the BKDGs. Must be called with + * preemption disabled. */ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 9fd8a56..9cbb710 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -609,16 +609,21 @@ static int setup_ibs_ctl(int ibs_eilvt_off) return 0; } +/* + * This runs only on the current cpu. We try to find an LVT offset and + * setup the local APIC. For this we must disable preemption. On + * success we initialize all nodes with this offset. This updates then + * the offset in the IBS_CTL per-node msr. The per-core APIC setup of + * the IBS interrupt vector is called from op_amd_setup_ctrs()/op_- + * amd_cpu_shutdown() using the new offset. + */ static int force_ibs_eilvt_setup(void) { int offset; int ret; - /* - * find the next free available EILVT entry, skip offset 0, - * pin search to this cpu - */ preempt_disable(); + /* find the next free available EILVT entry, skip offset 0 */ for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { if (get_eilvt(offset)) break; -- cgit v1.1 From 6e33a852a37dee02979ec9d82bea26c07cee5bce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20N=C3=A9meth?= Date: Sat, 14 May 2011 19:27:33 +0200 Subject: x86/PCI/ACPI: fix type mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The flags field of struct resource from linux/ioport.h is "unsigned long". Change the "type" parameter of coalesce_windows() function to match that field. This fixes the following warning messages when compiling with "make C=1 W=1 bzImage modules": arch/x86/pci/acpi.c: In function ‘coalesce_windows’: arch/x86/pci/acpi.c:198: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result arch/x86/pci/acpi.c:203: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result Signed-off-by: Márton Németh Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 0972315..68c3c13 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -188,7 +188,7 @@ static bool resource_contains(struct resource *res, resource_size_t point) return false; } -static void coalesce_windows(struct pci_root_info *info, int type) +static void coalesce_windows(struct pci_root_info *info, unsigned long type) { int i, j; struct resource *res1, *res2; -- cgit v1.1 From fd8a7de177b6f56a0fc59ad211c197a7df06b1ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jul 2010 14:34:50 +0200 Subject: x86: cpu-hotplug: Prevent softirq wakeup on wrong CPU After a newly plugged CPU sets the cpu_online bit it enables interrupts and goes idle. The cpu which brought up the new cpu waits for the cpu_online bit and when it observes it, it sets the cpu_active bit for this cpu. The cpu_active bit is the relevant one for the scheduler to consider the cpu as a viable target. With forced threaded interrupt handlers which imply forced threaded softirqs we observed the following race: cpu 0 cpu 1 bringup(cpu1); set_cpu_online(smp_processor_id(), true); local_irq_enable(); while (!cpu_online(cpu1)); timer_interrupt() -> wake_up(softirq_thread_cpu1); -> enqueue_on(softirq_thread_cpu1, cpu0); ^^^^ cpu_notify(CPU_ONLINE, cpu1); -> sched_cpu_active(cpu1) -> set_cpu_active((cpu1, true); When an interrupt happens before the cpu_active bit is set by the cpu which brought up the newly onlined cpu, then the scheduler refuses to enqueue the woken thread which is bound to that newly onlined cpu on that newly onlined cpu due to the not yet set cpu_active bit and selects a fallback runqueue. Not really an expected and desirable behaviour. So far this has only been observed with forced hard/softirq threading, but in theory this could happen without forced threaded hard/softirqs as well. It's probably unobservable as it would take a massive interrupt storm on the newly onlined cpu which causes the softirq loop to wake up the softirq thread and an even longer delay of the cpu which waits for the cpu_online bit. Signed-off-by: Thomas Gleixner Reviewed-by: Peter Zijlstra Cc: stable@kernel.org # 2.6.39 --- arch/x86/kernel/smpboot.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 33a0c11..9fd3137 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -285,6 +285,19 @@ notrace static void __cpuinit start_secondary(void *unused) per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; x86_platform.nmi_init(); + /* + * Wait until the cpu which brought this one up marked it + * online before enabling interrupts. If we don't do that then + * we can end up waking up the softirq thread before this cpu + * reached the active state, which makes the scheduler unhappy + * and schedule the softirq thread on the wrong cpu. This is + * only observable with forced threaded interrupts, but in + * theory it could also happen w/o them. It's just way harder + * to achieve. + */ + while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) + cpu_relax(); + /* enable local interrupts */ local_irq_enable(); -- cgit v1.1 From a91d92875ee94e4703fd017ccaadb48cfb344994 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 3 Jun 2011 09:51:34 +0000 Subject: xen: partially revert "xen: set max_pfn_mapped to the last pfn mapped" We only need to set max_pfn_mapped to the last pfn mapped on x86_64 to make sure that cleanup_highmap doesn't remove important mappings at _end. We don't need to do this on x86_32 because cleanup_highmap is not called on x86_32. Besides lowering max_pfn_mapped on x86_32 has the unwanted side effect of limiting the amount of memory available for the 1:1 kernel pagetable allocation. This patch reverts the x86_32 part of the original patch. CC: stable@kernel.org Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dc708dc..afe1d54 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1599,6 +1599,11 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { pte_t pte; +#ifdef CONFIG_X86_32 + if (pfn > max_pfn_mapped) + max_pfn_mapped = pfn; +#endif + if (!pte_none(pte_page[pteidx])) continue; @@ -1766,7 +1771,9 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, initial_kernel_pmd = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + + xen_start_info->nr_pt_frames * PAGE_SIZE + + 512*1024); kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); -- cgit v1.1 From 977cb76d52e7aa040e18a84b29fe6fd80d79319b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 6 Jun 2011 10:15:49 +0200 Subject: x86: devicetree: Add missing early_init_dt_setup_initrd_arch stub This patch fixes the following build failure: drivers/built-in.o: In function `early_init_dt_check_for_initrd': /home/florian/dev/kernel/x86/linux-2.6-x86/drivers/of/fdt.c:571: undefined reference to `early_init_dt_setup_initrd_arch' make: *** [.tmp_vmlinux1] Error 1 which happens as soon as we enable initrd support on a x86 devicetree platform such as Intel CE4100. Signed-off-by: Florian Fainelli Acked-by: Grant Likely Cc: Maxime Bizon Acked-by: Sebastian Andrzej Siewior Cc: stable@kernel.org # 2.6.39 Link: http://lkml.kernel.org/r/201106061015.50039.ffainelli@freebox.fr Signed-off-by: Thomas Gleixner --- arch/x86/kernel/devicetree.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 690bc84..9aeb78a 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -98,6 +99,16 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS)); } +#ifdef CONFIG_BLK_DEV_INITRD +void __init early_init_dt_setup_initrd_arch(unsigned long start, + unsigned long end) +{ + initrd_start = (unsigned long)__va(start); + initrd_end = (unsigned long)__va(end); + initrd_below_start_ok = 1; +} +#endif + void __init add_dtb(u64 data) { initial_dtb = data + offsetof(struct setup_data, data); -- cgit v1.1 From 7ad35cf288fd63a19bf50e490440a992de808b2b Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Wed, 25 May 2011 14:00:49 +1000 Subject: x86/uv/x2apic: update for change in pci bridge handling. When I added 3448a19da479b6bd1e28e2a2be9fa16c6a6feb39 I forgot about the special uv handling code for this, so this patch fixes it up. Acked-by: Jesse Barnes Acked-by: Ingo Molnar Signed-off-by: Dave Airlie --- arch/x86/kernel/apic/x2apic_uv_x.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index b511a01..adc66c3 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -632,14 +632,14 @@ late_initcall(uv_init_heartbeat); /* Direct Legacy VGA I/O traffic to designated IOH */ int uv_set_vga_state(struct pci_dev *pdev, bool decode, - unsigned int command_bits, bool change_bridge) + unsigned int command_bits, u32 flags) { int domain, bus, rc; - PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n", - pdev->devfn, decode, command_bits, change_bridge); + PR_DEVEL("devfn %x decode %d cmd %x flags %d\n", + pdev->devfn, decode, command_bits, flags); - if (!change_bridge) + if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE)) return 0; if ((command_bits & PCI_COMMAND_IO) == 0) -- cgit v1.1 From 60b8b1de0dd2bf246f0e074d287bb3f0bc42a755 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 14 Jun 2011 12:45:10 -0700 Subject: x86 idle: APM requires pm_idle/default_idle unconditionally when a module [ Also from Ben Hutchings and Vitaliy Ivanov ] Commit 06ae40ce073d ("x86 idle: EXPORT_SYMBOL(default_idle, pm_idle) only when APM demands it") removed the export for pm_idle/default_idle unless the apm module was modularised and CONFIG_APM_CPU_IDLE was set. But the apm module uses pm_idle/default_idle unconditionally, CONFIG_APM_CPU_IDLE only affects the bios idle threshold. Adjust the export accordingly. [ Used #ifdef instead of #if defined() as it's shorter, and what both Ben and Vitaliy used.. Andy, you're out-voted ;) - Linus ] Reported-by: Randy Dunlap Acked-by: Jiri Kosina Acked-by: Ingo Molnar Acked-by: Len Brown Signed-off-by: Andy Whitcroft Signed-off-by: Vitaliy Ivanov Signed-off-by: Ben Hutchings Signed-off-by: Linus Torvalds --- arch/x86/kernel/process.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2e4928d..e1ba8cb2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -337,7 +337,7 @@ EXPORT_SYMBOL(boot_option_idle_override); * Powermanagement idle function, if any.. */ void (*pm_idle)(void); -#if defined(CONFIG_APM_MODULE) && defined(CONFIG_APM_CPU_IDLE) +#ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(pm_idle); #endif @@ -399,7 +399,7 @@ void default_idle(void) cpu_relax(); } } -#if defined(CONFIG_APM_MODULE) && defined(CONFIG_APM_CPU_IDLE) +#ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(default_idle); #endif -- cgit v1.1 From 8fe7e94eb71430cf63a742f3c19739d82a662758 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 1 Jun 2011 15:31:44 +0200 Subject: oprofile, x86: Fix race in nmi handler while starting counters In some rare cases, nmis are generated immediately after the nmi handler of the cpu was started. This causes the counter not to be enabled. Before enabling the nmi handlers we need to set variable ctr_running first and make sure its value is written to memory. Also, the patch makes all existing barriers a memory barrier instead of a compiler barrier only. Reported-by: Suravee Suthikulpanit Cc: # .35+ Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index cf97500..68894fd 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -112,8 +112,10 @@ static void nmi_cpu_start(void *dummy) static int nmi_start(void) { get_online_cpus(); - on_each_cpu(nmi_cpu_start, NULL, 1); ctr_running = 1; + /* make ctr_running visible to the nmi handler: */ + smp_mb(); + on_each_cpu(nmi_cpu_start, NULL, 1); put_online_cpus(); return 0; } @@ -504,15 +506,18 @@ static int nmi_setup(void) nmi_enabled = 0; ctr_running = 0; - barrier(); + /* make variables visible to the nmi handler: */ + smp_mb(); err = register_die_notifier(&profile_exceptions_nb); if (err) goto fail; get_online_cpus(); register_cpu_notifier(&oprofile_cpu_nb); - on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; + /* make nmi_enabled visible to the nmi handler: */ + smp_mb(); + on_each_cpu(nmi_cpu_setup, NULL, 1); put_online_cpus(); return 0; @@ -531,7 +536,8 @@ static void nmi_shutdown(void) nmi_enabled = 0; ctr_running = 0; put_online_cpus(); - barrier(); + /* make variables visible to the nmi handler: */ + smp_mb(); unregister_die_notifier(&profile_exceptions_nb); msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); -- cgit v1.1 From 900cba8881b39dfbc7c8062098504ab93f5387a8 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 18 Dec 2009 10:31:31 +0100 Subject: xen: support CONFIG_MAXSMP The MAXSMP config option requires CPUMASK_OFFSTACK, which in turn requires we init the memory for the maps while we bring up the cpus. MAXSMP also increases NR_CPUS to 4096. This increase in size exposed an issue in the argument construction for multicalls from xen_flush_tlb_others. The args should only need space for the actual number of cpus. Also in 2.6.39 it exposes a bootup problem. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] set_cpu_sibling_map+0x123/0x30d ... Call Trace: [] ? xen_restore_fl_direct_reloc+0x4/0x4 [] xen_smp_prepare_cpus+0x36/0x135 .. CC: stable@kernel.org Signed-off-by: Andrew Jones [v2: Updated to compile on 3.0] [v3: Updated to compile when CONFIG_SMP is not defined] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 3 ++- arch/x86/xen/smp.c | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index afe1d54..673e968 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -1231,7 +1232,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, { struct { struct mmuext_op op; - DECLARE_BITMAP(mask, NR_CPUS); + DECLARE_BITMAP(mask, num_processors); } *args; struct multicall_space mcs; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 41038c0..b4533a8 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -205,11 +205,18 @@ static void __init xen_smp_prepare_boot_cpu(void) static void __init xen_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; + unsigned int i; xen_init_lock_cpu(0); smp_store_cpu_info(0); cpu_data(0).x86_max_cores = 1; + + for_each_possible_cpu(i) { + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); + } set_cpu_sibling_map(0); if (xen_smp_intr_init(0)) -- cgit v1.1 From b2abe50688dcb470e2e46109da7e7e02245ed59b Mon Sep 17 00:00:00 2001 From: Tom Goetz Date: Mon, 16 May 2011 15:06:26 -0400 Subject: xen: When calling power_off, don't call the halt function. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .. As it won't actually power off the machine. Reported-by: Sven Köhler Tested-by: Sven Köhler Signed-off-by: Tom Goetz Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index dd7b88f..5525163 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1033,6 +1033,13 @@ static void xen_machine_halt(void) xen_reboot(SHUTDOWN_poweroff); } +static void xen_machine_power_off(void) +{ + if (pm_power_off) + pm_power_off(); + xen_reboot(SHUTDOWN_poweroff); +} + static void xen_crash_shutdown(struct pt_regs *regs) { xen_reboot(SHUTDOWN_crash); @@ -1058,7 +1065,7 @@ int xen_panic_handler_init(void) static const struct machine_ops xen_machine_ops __initconst = { .restart = xen_restart, .halt = xen_machine_halt, - .power_off = xen_machine_halt, + .power_off = xen_machine_power_off, .shutdown = xen_machine_halt, .crash_shutdown = xen_crash_shutdown, .emergency_restart = xen_emergency_restart, -- cgit v1.1 From acd049c6e99d2ad1195666195230f6881d1c1588 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 16 Jun 2011 13:07:19 -0400 Subject: xen/setup: Fix for incorrect xen_extra_mem_start. The earlier attempts (24bdb0b62cc82120924762ae6bc85afc8c3f2b26) at fixing this problem caused other problems to surface (PV guests with no PCI passthrough would have SWIOTLB turned on - which meant 64MB of precious contingous DMA32 memory being eaten up per guest). The problem was: "on xen we add an extra memory region at the end of the e820, and on this particular machine this extra memory region would start below 4g and cross over the 4g boundary: [0xfee01000-0x192655000) Unfortunately e820_end_of_low_ram_pfn does not expect an e820 layout like that so it returns 4g, therefore initial_memory_mapping will map [0 - 0x100000000), that is a memory range that includes some reserved memory regions." The memory range was the IOAPIC regions, and with the 1-1 mapping turned on, it would map them as RAM, not as MMIO regions. This caused the hypervisor to complain. Fortunately this is experienced only under the initial domain so we guard for it. Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index be1a464..60aeeb5 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -227,11 +227,7 @@ char * __init xen_memory_setup(void) memcpy(map_raw, map, sizeof(map)); e820.nr_map = 0; -#ifdef CONFIG_X86_32 xen_extra_mem_start = mem_end; -#else - xen_extra_mem_start = max((1ULL << 32), mem_end); -#endif for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end; @@ -266,6 +262,12 @@ char * __init xen_memory_setup(void) if (map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); } + /* Align the balloon area so that max_low_pfn does not get set + * to be at the _end_ of the PCI gap at the far end (fee01000). + * Note that xen_extra_mem_start gets set in the loop above to be + * past the last E820 region. */ + if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32))) + xen_extra_mem_start = (1ULL<<32); /* * In domU, the ISA region is normal, usable memory, but we -- cgit v1.1 From 7d68dc3f1003a38948c55c803c32d1989dd49198 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Tue, 14 Jun 2011 19:53:09 +0200 Subject: x86, efi: Do not reserve boot services regions within reserved areas Commit 916f676f8dc started reserving boot service code since some systems require you to keep that code around until SetVirtualAddressMap is called. However, in some cases those areas will overlap with reserved regions. The proper medium-term fix is to fix the bootloader to prevent the conflicts from occurring by moving the kernel to a better position, but the kernel should check for this possibility, and only reserve regions which can be reserved. Signed-off-by: Maarten Lankhorst Link: http://lkml.kernel.org/r/4DF7A005.1050407@gmail.com Acked-by: Matthew Garrett Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/include/asm/memblock.h | 2 +- arch/x86/mm/memblock.c | 4 ++-- arch/x86/platform/efi/efi.c | 29 +++++++++++++++++++++++++---- 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h index 19ae14b..0cd3800 100644 --- a/arch/x86/include/asm/memblock.h +++ b/arch/x86/include/asm/memblock.h @@ -4,7 +4,6 @@ #define ARCH_DISCARD_MEMBLOCK u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align); -void memblock_x86_to_bootmem(u64 start, u64 end); void memblock_x86_reserve_range(u64 start, u64 end, char *name); void memblock_x86_free_range(u64 start, u64 end); @@ -19,5 +18,6 @@ u64 memblock_x86_hole_size(u64 start, u64 end); u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align); u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit); u64 memblock_x86_memory_in_range(u64 addr, u64 limit); +bool memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align); #endif diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c index aa11693..992da5e 100644 --- a/arch/x86/mm/memblock.c +++ b/arch/x86/mm/memblock.c @@ -8,7 +8,7 @@ #include /* Check for already reserved areas */ -static bool __init check_with_memblock_reserved_size(u64 *addrp, u64 *sizep, u64 align) +bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align) { struct memblock_region *r; u64 addr = *addrp, last; @@ -59,7 +59,7 @@ u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align) if (addr >= ei_last) continue; *sizep = ei_last - addr; - while (check_with_memblock_reserved_size(&addr, sizep, align)) + while (memblock_x86_check_reserved_size(&addr, sizep, align)) ; if (*sizep) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 0d3a4fa..474356b 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -310,14 +310,31 @@ void __init efi_reserve_boot_services(void) for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { efi_memory_desc_t *md = p; - unsigned long long start = md->phys_addr; - unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + u64 start = md->phys_addr; + u64 size = md->num_pages << EFI_PAGE_SHIFT; if (md->type != EFI_BOOT_SERVICES_CODE && md->type != EFI_BOOT_SERVICES_DATA) continue; - - memblock_x86_reserve_range(start, start + size, "EFI Boot"); + /* Only reserve where possible: + * - Not within any already allocated areas + * - Not over any memory area (really needed, if above?) + * - Not within any part of the kernel + * - Not the bios reserved area + */ + if ((start+size >= virt_to_phys(_text) + && start <= virt_to_phys(_end)) || + !e820_all_mapped(start, start+size, E820_RAM) || + memblock_x86_check_reserved_size(&start, &size, + 1<num_pages = 0; + memblock_dbg(PFX "Could not reserve boot range " + "[0x%010llx-0x%010llx]\n", + start, start+size-1); + } else + memblock_x86_reserve_range(start, start+size, + "EFI Boot"); } } @@ -334,6 +351,10 @@ static void __init efi_free_boot_services(void) md->type != EFI_BOOT_SERVICES_DATA) continue; + /* Could not reserve boot area */ + if (!size) + continue; + free_bootmem_late(start, size); } } -- cgit v1.1 From b72336355bb4c92d4a2be3f975dbea47089c83c1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 May 2011 22:11:17 +0200 Subject: KVM: MMU: Fix build warnings in walk_addr_generic() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 3.0-rc1 I get In file included from arch/x86/kvm/mmu.c:2856: arch/x86/kvm/paging_tmpl.h: In function ‘paging32_walk_addr_generic’: arch/x86/kvm/paging_tmpl.h:124: warning: ‘ptep_user’ may be used uninitialized in this function In file included from arch/x86/kvm/mmu.c:2852: arch/x86/kvm/paging_tmpl.h: In function ‘paging64_walk_addr_generic’: arch/x86/kvm/paging_tmpl.h:124: warning: ‘ptep_user’ may be used uninitialized in this function caused by 6e2ca7d1802bf8ed9908435e34daa116662e7790. According to Takuya Yoshikawa, ptep_user won't be used uninitialized so shut up gcc. Cc: Takuya Yoshikawa Link: http://lkml.kernel.org/r/20110530094604.GC21833@liondog.tnic Signed-off-by: Borislav Petkov Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6c4dc010..9d03ad4 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -121,7 +121,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gva_t addr, u32 access) { pt_element_t pte; - pt_element_t __user *ptep_user; + pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; -- cgit v1.1 From 5233dd51ece1615d54ab96c4cbe9ac3cc595e955 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 6 Jun 2011 14:27:47 -0300 Subject: KVM: VMX: do not overwrite uptodate vcpu->arch.cr3 on KVM_SET_SREGS Only decache guest CR3 value if vcpu->arch.cr3 is stale. Fixes loadvm with live guest. Signed-off-by: Marcelo Tosatti Tested-by: Markus Schade Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4c3fa0f..d48ec60 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2047,7 +2047,8 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, unsigned long cr0, struct kvm_vcpu *vcpu) { - vmx_decache_cr3(vcpu); + if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + vmx_decache_cr3(vcpu); if (!(cr0 & X86_CR0_PG)) { /* From paging/starting to nonpaging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, -- cgit v1.1 From a0a8eaba1661232f094654422bdabe2df4e26863 Mon Sep 17 00:00:00 2001 From: Steve Date: Fri, 17 Jun 2011 10:25:39 +0800 Subject: KVM: MMU: fix opposite condition in mapping_level_dirty_bitmap The condition is opposite, it always maps huge page for the dirty tracked page Reported-by: Steve Signed-off-by: Steve Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bd14bb4..aee3862 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -565,7 +565,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) { - return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); + return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); } static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) -- cgit v1.1 From de2d1a524e94a79078d9fe22c57c0c6009237547 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 15 Jun 2011 20:50:04 -0700 Subject: KVM: Fix register corruption in pvclock_scale_delta The 128-bit multiply in pvclock.h was missing an output constraint for EDX which caused a register corruption to appear. Thanks to Ulrich for diagnosing the EDX corruption and Avi for providing this fix. Signed-off-by: Zachary Amsden Signed-off-by: Avi Kivity --- arch/x86/include/asm/pvclock.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 31d84ac..a518c0a 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -22,6 +22,8 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) u64 product; #ifdef __i386__ u32 tmp1, tmp2; +#else + ulong tmp; #endif if (shift < 0) @@ -42,8 +44,11 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); #elif defined(__x86_64__) __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); + "mul %[mul_frac] ; shrd $32, %[hi], %[lo]" + : [lo]"=a"(product), + [hi]"=d"(tmp) + : "0"(delta), + [mul_frac]"rm"((u64)mul_frac)); #else #error implement me! #endif -- cgit v1.1 From c6830c22603aaecf65405af23f6da2d55892f9cb Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 16 Jun 2011 17:28:07 +0900 Subject: Fix node_start/end_pfn() definition for mm/page_cgroup.c commit 21a3c96 uses node_start/end_pfn(nid) for detection start/end of nodes. But, it's not defined in linux/mmzone.h but defined in /arch/???/include/mmzone.h which is included only under CONFIG_NEED_MULTIPLE_NODES=y. Then, we see mm/page_cgroup.c: In function 'page_cgroup_init': mm/page_cgroup.c:308: error: implicit declaration of function 'node_start_pfn' mm/page_cgroup.c:309: error: implicit declaration of function 'node_end_pfn' So, fixiing page_cgroup.c is an idea... But node_start_pfn()/node_end_pfn() is a very generic macro and should be implemented in the same manner for all archs. (m32r has different implementation...) This patch removes definitions of node_start/end_pfn() in each archs and defines a unified one in linux/mmzone.h. It's not under CONFIG_NEED_MULTIPLE_NODES, now. A result of macro expansion is here (mm/page_cgroup.c) for !NUMA start_pfn = ((&contig_page_data)->node_start_pfn); end_pfn = ({ pg_data_t *__pgdat = (&contig_page_data); __pgdat->node_start_pfn + __pgdat->node_spanned_pages;}); for NUMA (x86-64) start_pfn = ((node_data[nid])->node_start_pfn); end_pfn = ({ pg_data_t *__pgdat = (node_data[nid]); __pgdat->node_start_pfn + __pgdat->node_spanned_pages;}); Changelog: - fixed to avoid using "nid" twice in node_end_pfn() macro. Reported-and-acked-by: Randy Dunlap Reported-and-tested-by: Ingo Molnar Acked-by: Mel Gorman Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Linus Torvalds --- arch/x86/include/asm/mmzone_32.h | 11 ----------- arch/x86/include/asm/mmzone_64.h | 3 --- 2 files changed, 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 5e83a41..224e8c5 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -48,17 +48,6 @@ static inline int pfn_to_nid(unsigned long pfn) #endif } -/* - * Following are macros that each numa implmentation must define. - */ - -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define node_end_pfn(nid) \ -({ \ - pg_data_t *__pgdat = NODE_DATA(nid); \ - __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ -}) - static inline int pfn_valid(int pfn) { int nid = pfn_to_nid(pfn); diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index b3f88d7..129d9aa 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -13,8 +13,5 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ - NODE_DATA(nid)->node_spanned_pages) #endif #endif /* _ASM_X86_MMZONE_64_H */ -- cgit v1.1 From e376fd664b1547e29e264e3cfb97553a1be9054b Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 26 May 2011 11:12:53 +0200 Subject: watchdog: Intel SCU Watchdog: Fix build and remove duplicate code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trying to build the Intel SCU Watchdog fails for me with gcc 4.6.0 - $ gcc --version | head -n 1 gcc (GCC) 4.6.0 20110513 (prerelease) like this : CC drivers/watchdog/intel_scu_watchdog.o In file included from drivers/watchdog/intel_scu_watchdog.c:49:0: /home/jj/src/linux-2.6/arch/x86/include/asm/apb_timer.h: In function ‘apbt_time_init’: /home/jj/src/linux-2.6/arch/x86/include/asm/apb_timer.h:65:42: warning: ‘return’ with a value, in function returning void [enabled by default] drivers/watchdog/intel_scu_watchdog.c: In function ‘intel_scu_watchdog_init’: drivers/watchdog/intel_scu_watchdog.c:468:2: error: implicit declaration of function ‘sfi_get_mtmr’ [-Werror=implicit-function-declaration] drivers/watchdog/intel_scu_watchdog.c:468:32: warning: assignment makes pointer from integer without a cast [enabled by default] cc1: some warnings being treated as errors make[1]: *** [drivers/watchdog/intel_scu_watchdog.o] Error 1 make: *** [drivers/watchdog/intel_scu_watchdog.o] Error 2 Additionally, linux/types.h is needlessly being included twice in drivers/watchdog/intel_scu_watchdog.c Signed-off-by: Jesper Juhl Signed-off-by: Wim Van Sebroeck --- arch/x86/include/asm/apb_timer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index 2fefa50..af60d8a 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h @@ -62,7 +62,7 @@ extern int sfi_mtimer_num; #else /* CONFIG_APB_TIMER */ static inline unsigned long apbt_quick_calibrate(void) {return 0; } -static inline void apbt_time_init(void) {return 0; } +static inline void apbt_time_init(void) { } #endif #endif /* ASM_X86_APBT_H */ -- cgit v1.1 From cb16c348760ad2bc79b67b20aefac05529569ed7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 19 Jun 2011 19:21:11 +0300 Subject: KVM: x86 emulator: fix %rip-relative addressing with immediate source operand %rip-relative addressing is relative to the first byte of the next instruction, so we need to add %rip only after we've fetched any immediate bytes. Based on original patch by Li Xin . Signed-off-by: Avi Kivity Acked-by: Li Xin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6df88c7..adc9867 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3372,7 +3372,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) int def_op_bytes, def_ad_bytes, goffset, simd_prefix; bool op_prefix = false; struct opcode opcode; - struct operand memop = { .type = OP_NONE }; + struct operand memop = { .type = OP_NONE }, *memopp = NULL; c->eip = ctxt->eip; c->fetch.start = c->eip; @@ -3547,9 +3547,6 @@ done_prefixes: if (memop.type == OP_MEM && c->ad_bytes != 8) memop.addr.mem.ea = (u32)memop.addr.mem.ea; - if (memop.type == OP_MEM && c->rip_relative) - memop.addr.mem.ea += c->eip; - /* * Decode and fetch the source operand: register, memory * or immediate. @@ -3571,6 +3568,7 @@ done_prefixes: c->op_bytes; srcmem_common: c->src = memop; + memopp = &c->src; break; case SrcImmU16: rc = decode_imm(ctxt, &c->src, 2, false); @@ -3667,6 +3665,7 @@ done_prefixes: case DstMem: case DstMem64: c->dst = memop; + memopp = &c->dst; if ((c->d & DstMask) == DstMem64) c->dst.bytes = 8; else @@ -3700,10 +3699,13 @@ done_prefixes: /* Special instructions do their own operand decoding. */ default: c->dst.type = OP_NONE; /* Disable writeback. */ - return 0; + break; } done: + if (memopp && memopp->type == OP_MEM && c->rip_relative) + memopp->addr.mem.ea += c->eip; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } -- cgit v1.1 From 32dd11942aeb47f91209a446d6b10063c5b69389 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 30 Jun 2011 09:12:40 -0400 Subject: xen/mmu: Fix for linker errors when CONFIG_SMP is not defined. Simple enough - we use an extern defined symbol which is not defined when CONFIG_SMP is not defined. This fixes the linker dying. CC: stable@kernel.org Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 673e968..0ccccb6 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1232,7 +1232,11 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, { struct { struct mmuext_op op; +#ifdef CONFIG_SMP DECLARE_BITMAP(mask, num_processors); +#else + DECLARE_BITMAP(mask, NR_CPUS); +#endif } *args; struct multicall_space mcs; -- cgit v1.1 From 155a16f21923bc2f04161ac92acca986371ef27b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 30 Jun 2011 09:18:27 -0400 Subject: xen/pci: Use the INT_SRC_OVR IRQ (instead of GSI) to preset the ACPI SCI IRQ. In the past we would use the GSI value to preset the ACPI SCI IRQ which worked great as GSI == IRQ: ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level) While that is most often seen, there are some oddities: ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 20 low level) which means that GSI 20 (or pin 20) is to be overriden for IRQ 9. Our code that presets the interrupt for ACPI SCI however would use the GSI 20 instead of IRQ 9 ending up with: xen: sci override: global_irq=20 trigger=0 polarity=1 xen: registering gsi 20 triggering 0 polarity 1 xen: --> pirq=20 -> irq=20 xen: acpi sci 20 .. snip.. calling acpi_init+0x0/0xbc @ 1 ACPI: SCI (IRQ9) allocation failed ACPI Exception: AE_NOT_ACQUIRED, Unable to install System Control Interrupt handler (20110413/evevent-119) ACPI: Unable to start the ACPI Interpreter as the ACPI interpreter made a call to 'acpi_gsi_to_irq' which got nine. It used that value to request an IRQ (request_irq) and since that was not present it failed. The fix is to recognize that for interrupts that are overriden (in our case we only care about the ACPI SCI) we should use the IRQ number to present the IRQ instead of the using GSI. End result is that we get: xen: sci override: global_irq=20 trigger=0 polarity=1 xen: registering gsi 20 triggering 0 polarity 1 xen: --> pirq=20 -> irq=9 (gsi=9) xen: acpi sci 9 which fixes the ACPI interpreter failing on startup. CC: stable@kernel.org Reported-by: Liwei Tested-by: Liwei [http://lists.xensource.com/archives/html/xen-devel/2011-06/msg01727.html] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 8214724..fe00830 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -333,6 +333,7 @@ static int xen_register_pirq(u32 gsi, int triggering) struct physdev_map_pirq map_irq; int shareable = 0; char *name; + bool gsi_override = false; if (!xen_pv_domain()) return -1; @@ -349,11 +350,32 @@ static int xen_register_pirq(u32 gsi, int triggering) if (pirq < 0) goto out; - irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name); + /* Before we bind the GSI to a Linux IRQ, check whether + * we need to override it with bus_irq (IRQ) value. Usually for + * IRQs below IRQ_LEGACY_IRQ this holds IRQ == GSI, as so: + * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level) + * but there are oddballs where the IRQ != GSI: + * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 20 low level) + * which ends up being: gsi_to_irq[9] == 20 + * (which is what acpi_gsi_to_irq ends up calling when starting the + * the ACPI interpreter and keels over since IRQ 9 has not been + * setup as we had setup IRQ 20 for it). + */ + if (gsi == acpi_sci_override_gsi) { + /* Check whether the GSI != IRQ */ + acpi_gsi_to_irq(gsi, &irq); + if (irq != gsi) + /* Bugger, we MUST have that IRQ. */ + gsi_override = true; + } + if (gsi_override) + irq = xen_bind_pirq_gsi_to_irq(irq, pirq, shareable, name); + else + irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name); if (irq < 0) goto out; - printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d\n", pirq, irq); + printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", pirq, irq, gsi); map_irq.domid = DOMID_SELF; map_irq.type = MAP_PIRQ_TYPE_GSI; -- cgit v1.1 From a26474e8649643e82d71e3a386d5c4bcc0b207ef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Jun 2011 11:41:07 +0200 Subject: x86-32, NUMA: Fix boot regression caused by NUMA init unification on highmem machines During 32/64 NUMA init unification, commit 797390d855 ("x86-32, NUMA: use sparse_memory_present_with_active_regions()") made 32bit mm init call memory_present() automatically from active_regions instead of leaving it to each NUMA init path. This commit description is inaccurate - memory_present() calls aren't the same for flat and numaq. After the commit, memory_present() is only called for the intersection of e820 and NUMA layout. Before, on flatmem, memory_present() would be called from 0 to max_pfn. After, it would be called only on the areas that e820 indicates to be populated. This is how x86_64 works and should be okay as memmap is allowed to contain holes; however, x86_32 DISCONTIGMEM is missing early_pfn_valid(), which makes memmap_init_zone() assume that memmap doesn't contain any hole. This leads to the following oops if e820 map contains holes as it often does on machine with near or more 4GiB of memory by calling pfn_to_page() on a pfn which isn't mapped to a NUMA node, a reported by Conny Seidel: BUG: unable to handle kernel paging request at 000012b0 IP: [] memmap_init_zone+0x6c/0xf2 *pdpt =3D 0000000000000000 *pde =3D f000eef3f000ee00 Oops: 0000 [#1] SMP last sysfs file: Modules linked in: Pid: 0, comm: swapper Not tainted 2.6.39-rc5-00164-g797390d #1 To Be Filled By O.E.M. To Be Filled By O.E.M./E350M1 EIP: 0060:[] EFLAGS: 00010012 CPU: 0 EIP is at memmap_init_zone+0x6c/0xf2 EAX: 00000000 EBX: 000a8000 ECX: 000a7fff EDX: f2c00b80 ESI: 000a8000 EDI: f2c00800 EBP: c19ffe54 ESP: c19ffe34 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 Process swapper (pid: 0, ti=3Dc19fe000 task=3Dc1a07f60 task.ti=3Dc19fe000) Stack: 00000002 00000000 0023f000 00000000 10000000 00000a00 f2c00000 f2c00b58 c19ffeb0 c1a80f24 000375fe 00000000 f2c00800 00000800 00000100 00000030 c1abb768 0000003c 00000000 00000000 00000004 00207a02 f2c00800 000375fe Call Trace: [] free_area_init_node+0x358/0x385 [] free_area_init_nodes+0x420/0x487 [] paging_init+0x114/0x11b [] setup_arch+0xb37/0xc0a [] start_kernel+0x76/0x316 [] i386_start_kernel+0xa8/0xb0 This patch fixes the bug by defining early_pfn_valid() to be the same as pfn_valid() when DISCONTIGMEM. Reported-bisected-and-tested-by: Conny Seidel Signed-off-by: Tejun Heo Cc: hans.rosenfeld@amd.com Cc: Christoph Lameter Cc: Conny Seidel Link: http://lkml.kernel.org/r/20110628094107.GB3386@htj.dyndns.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmzone_32.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 224e8c5..ffa037f 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -57,6 +57,8 @@ static inline int pfn_valid(int pfn) return 0; } +#define early_pfn_valid(pfn) pfn_valid((pfn)) + #endif /* CONFIG_DISCONTIGMEM */ #ifdef CONFIG_NEED_MULTIPLE_NODES -- cgit v1.1 From b49c78d4827be8d7e67e5b94adac6b30a4a9ad14 Mon Sep 17 00:00:00 2001 From: Peter Chubb Date: Wed, 6 Jul 2011 10:56:30 +1000 Subject: x86, reboot: Acer Aspire One A110 reboot quirk Since git commit 660e34cebf0a11d54f2d5dd8838607452355f321 x86: reorder reboot method preferences, my Acer Aspire One hangs on reboot. It appears that its ACPI method for rebooting is broken. The attached patch adds a quirk so that the machine will reboot via the BIOS. [ hpa: verified that the ACPI control on this machine is just plain broken. ] Signed-off-by: Peter Chubb Link: http://lkml.kernel.org/r/w439iki5vl.wl%25peter@chubb.wattle.id.au Signed-off-by: H. Peter Anvin --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 0c016f7..4f0d46f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -294,6 +294,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"), }, }, + { /* Handle reboot issue on Acer Aspire one */ + .callback = set_bios_reboot, + .ident = "Acer Aspire One A110", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), + }, + }, { } }; -- cgit v1.1 From 7a3136666bc0f0419f7aaa7b1fabb4b0e0a7fb76 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 6 Jul 2011 18:10:34 -0700 Subject: x86, suspend: Restore MISC_ENABLE MSR in realmode wakeup Some BIOSes will reset the Intel MISC_ENABLE MSR (specifically the XD_DISABLE bit) when resuming from S3, which can interact poorly with ebba638ae723d8a8fc2f7abce5ec18b688b791d7. In 32bit PAE mode, this can lead to a fault when EFER is restored by the kernel wakeup routines, due to it setting the NX bit for a CPU that (thanks to the BIOS reset) now incorrectly thinks it lacks the NX feature. (64bit is not affected because it uses a common CPU bring-up that specifically handles the XD_DISABLE bit.) The need for MISC_ENABLE being restored so early is specific to the S3 resume path. Normally, MISC_ENABLE is saved in save_processor_state(), but this happens after the resume header is created, so just reproduce the logic here. (acpi_suspend_lowlevel() creates the header, calls do_suspend_lowlevel, which calls save_processor_state(), so the saved processor context isn't available during resume header creation.) [ hpa: Consider for stable if OK in mainline ] Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/20110707011034.GA8523@outflux.net Signed-off-by: H. Peter Anvin Cc: Rafael J. Wysocki Cc: 2.6.38+ --- arch/x86/kernel/acpi/realmode/wakeup.S | 14 ++++++++++++++ arch/x86/kernel/acpi/realmode/wakeup.h | 6 ++++++ arch/x86/kernel/acpi/sleep.c | 6 ++++++ 3 files changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index ead21b6..b4fd836 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -28,6 +28,8 @@ pmode_cr3: .long 0 /* Saved %cr3 */ pmode_cr4: .long 0 /* Saved %cr4 */ pmode_efer: .quad 0 /* Saved EFER */ pmode_gdt: .quad 0 +pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ +pmode_behavior: .long 0 /* Wakeup behavior flags */ realmode_flags: .long 0 real_magic: .long 0 trampoline_segment: .word 0 @@ -91,6 +93,18 @@ wakeup_code: /* Call the C code */ calll main + /* Restore MISC_ENABLE before entering protected mode, in case + BIOS decided to clear XD_DISABLE during S3. */ + movl pmode_behavior, %eax + btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax + jnc 1f + + movl pmode_misc_en, %eax + movl pmode_misc_en + 4, %edx + movl $MSR_IA32_MISC_ENABLE, %ecx + wrmsr +1: + /* Do any other stuff... */ #ifndef CONFIG_64BIT diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h index e1828c0..97a29e1 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.h +++ b/arch/x86/kernel/acpi/realmode/wakeup.h @@ -21,6 +21,9 @@ struct wakeup_header { u32 pmode_efer_low; /* Protected mode EFER */ u32 pmode_efer_high; u64 pmode_gdt; + u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */ + u32 pmode_misc_en_high; + u32 pmode_behavior; /* Wakeup routine behavior flags */ u32 realmode_flags; u32 real_magic; u16 trampoline_segment; /* segment with trampoline code, 64-bit only */ @@ -39,4 +42,7 @@ extern struct wakeup_header wakeup_header; #define WAKEUP_HEADER_SIGNATURE 0x51ee1111 #define WAKEUP_END_SIGNATURE 0x65a22c82 +/* Wakeup behavior bits */ +#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 + #endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 18a857b..103b6ab 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -77,6 +77,12 @@ int acpi_suspend_lowlevel(void) header->pmode_cr0 = read_cr0(); header->pmode_cr4 = read_cr4_safe(); + header->pmode_behavior = 0; + if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, + &header->pmode_misc_en_low, + &header->pmode_misc_en_high)) + header->pmode_behavior |= + (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE); header->realmode_flags = acpi_realmode_flags; header->real_magic = 0x12345678; -- cgit v1.1 From f70e957cda22d309c769805cbb932407a5232219 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Wed, 6 Jul 2011 16:52:37 -0400 Subject: x86: Don't use the EFI reboot method by default Testing suggests that at least some Lenovos and some Intels will fail to reboot via EFI, attempting to jump to an unmapped physical address. In the long run we could handle this by providing a page table with a 1:1 mapping of physical addresses, but for now it's probably just easier to assume that ACPI or legacy methods will be present and reboot via those. Signed-off-by: Matthew Garrett Cc: Linus Torvalds Cc: Andrew Morton Cc: Alan Cox Link: http://lkml.kernel.org/r/1309985557-15350-1-git-send-email-mjg@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 474356b..899e393 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -504,9 +504,6 @@ void __init efi_init(void) x86_platform.set_wallclock = efi_set_rtc_mmss; #endif - /* Setup for EFI runtime service */ - reboot_type = BOOT_EFI; - #if EFI_DEBUG print_efi_memmap(); #endif -- cgit v1.1 From ee339fe63ac408e4604c1c88b1f9a428f2511b70 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 6 Jul 2011 09:43:16 -0400 Subject: xen/pci: Move check for acpi_sci_override_gsi to xen_setup_acpi_sci. Previously we would check for acpi_sci_override_gsi == gsi every time a PCI device was enabled. That works during early bootup, but later on it could lead to triggering unnecessarily the acpi_gsi_to_irq(..) lookup. The reason is that acpi_sci_override_gsi was declared in __initdata and after early bootup could contain bogus values. This patch moves the check for acpi_sci_override_gsi to the site where the ACPI SCI is preset. CC: stable@kernel.org Reported-by: Raghavendra D Prabhu Tested-by: Raghavendra D Prabhu [http://lists.xensource.com/archives/html/xen-devel/2011-07/msg00154.html] Suggested-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 56 +++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index fe00830..f567965 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -327,13 +327,12 @@ int __init pci_xen_hvm_init(void) } #ifdef CONFIG_XEN_DOM0 -static int xen_register_pirq(u32 gsi, int triggering) +static int xen_register_pirq(u32 gsi, int gsi_override, int triggering) { int rc, pirq, irq = -1; struct physdev_map_pirq map_irq; int shareable = 0; char *name; - bool gsi_override = false; if (!xen_pv_domain()) return -1; @@ -345,31 +344,12 @@ static int xen_register_pirq(u32 gsi, int triggering) shareable = 1; name = "ioapic-level"; } - pirq = xen_allocate_pirq_gsi(gsi); if (pirq < 0) goto out; - /* Before we bind the GSI to a Linux IRQ, check whether - * we need to override it with bus_irq (IRQ) value. Usually for - * IRQs below IRQ_LEGACY_IRQ this holds IRQ == GSI, as so: - * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level) - * but there are oddballs where the IRQ != GSI: - * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 20 low level) - * which ends up being: gsi_to_irq[9] == 20 - * (which is what acpi_gsi_to_irq ends up calling when starting the - * the ACPI interpreter and keels over since IRQ 9 has not been - * setup as we had setup IRQ 20 for it). - */ - if (gsi == acpi_sci_override_gsi) { - /* Check whether the GSI != IRQ */ - acpi_gsi_to_irq(gsi, &irq); - if (irq != gsi) - /* Bugger, we MUST have that IRQ. */ - gsi_override = true; - } - if (gsi_override) - irq = xen_bind_pirq_gsi_to_irq(irq, pirq, shareable, name); + if (gsi_override >= 0) + irq = xen_bind_pirq_gsi_to_irq(gsi_override, pirq, shareable, name); else irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name); if (irq < 0) @@ -392,7 +372,7 @@ out: return irq; } -static int xen_register_gsi(u32 gsi, int triggering, int polarity) +static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity) { int rc, irq; struct physdev_setup_gsi setup_gsi; @@ -403,7 +383,7 @@ static int xen_register_gsi(u32 gsi, int triggering, int polarity) printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n", gsi, triggering, polarity); - irq = xen_register_pirq(gsi, triggering); + irq = xen_register_pirq(gsi, gsi_override, triggering); setup_gsi.gsi = gsi; setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1); @@ -425,6 +405,8 @@ static __init void xen_setup_acpi_sci(void) int rc; int trigger, polarity; int gsi = acpi_sci_override_gsi; + int irq = -1; + int gsi_override = -1; if (!gsi) return; @@ -441,7 +423,25 @@ static __init void xen_setup_acpi_sci(void) printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d " "polarity=%d\n", gsi, trigger, polarity); - gsi = xen_register_gsi(gsi, trigger, polarity); + /* Before we bind the GSI to a Linux IRQ, check whether + * we need to override it with bus_irq (IRQ) value. Usually for + * IRQs below IRQ_LEGACY_IRQ this holds IRQ == GSI, as so: + * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level) + * but there are oddballs where the IRQ != GSI: + * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 20 low level) + * which ends up being: gsi_to_irq[9] == 20 + * (which is what acpi_gsi_to_irq ends up calling when starting the + * the ACPI interpreter and keels over since IRQ 9 has not been + * setup as we had setup IRQ 20 for it). + */ + /* Check whether the GSI != IRQ */ + if (acpi_gsi_to_irq(gsi, &irq) == 0) { + if (irq >= 0 && irq != gsi) + /* Bugger, we MUST have that IRQ. */ + gsi_override = irq; + } + + gsi = xen_register_gsi(gsi, gsi_override, trigger, polarity); printk(KERN_INFO "xen: acpi sci %d\n", gsi); return; @@ -450,7 +450,7 @@ static __init void xen_setup_acpi_sci(void) static int acpi_register_gsi_xen(struct device *dev, u32 gsi, int trigger, int polarity) { - return xen_register_gsi(gsi, trigger, polarity); + return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity); } static int __init pci_xen_initial_domain(void) @@ -489,7 +489,7 @@ void __init xen_setup_pirqs(void) if (acpi_get_override_irq(irq, &trigger, &polarity) == -1) continue; - xen_register_pirq(irq, + xen_register_pirq(irq, -1 /* no GSI override */, trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE); } } -- cgit v1.1