From 36e9e1eab777e077f7484d309ff676d0568e27d1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 15 Mar 2010 14:33:06 -0800 Subject: x86: Handle legacy PIC interrupts on all the cpu's Ingo Molnar reported that with the recent changes of not statically blocking IRQ0_VECTOR..IRQ15_VECTOR's on all the cpu's, broke an AMD platform (with Nvidia chipset) boot when "noapic" boot option is used. On this platform, legacy PIC interrupts are getting delivered to all the cpu's instead of just the boot cpu. Thus not initializing the vector to irq mapping for the legacy irq's resulted in not handling certain interrupts causing boot hang. Fix this by initializing the vector to irq mapping on all the logical cpu's, if the legacy IRQ is handled by the legacy PIC. Reported-by: Ingo Molnar Signed-off-by: Suresh Siddha [ -v2: io-apic-enabled improvement ] Acked-by: Yinghai Lu Cc: Eric W. Biederman LKML-Reference: <1268692386.3296.43.camel@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 1 + arch/x86/kernel/apic/io_apic.c | 8 ++++++++ arch/x86/kernel/irqinit.c | 22 ++++++++++++++++++++++ arch/x86/kernel/smpboot.c | 2 +- 4 files changed, 32 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index a929c9e..46c0fe0 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -133,6 +133,7 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); +extern void setup_vector_irq(int cpu); #ifdef CONFIG_X86_IO_APIC extern void lock_vector_lock(void); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e4e0ddc..463de9a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1268,6 +1268,14 @@ void __setup_vector_irq(int cpu) /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { cfg = desc->chip_data; + + /* + * If it is a legacy IRQ handled by the legacy PIC, this cpu + * will be part of the irq_cfg's domain. + */ + if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq)) + cpumask_set_cpu(cpu, cfg->domain); + if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index ef257fc..f01d390 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -141,6 +141,28 @@ void __init init_IRQ(void) x86_init.irqs.intr_init(); } +/* + * Setup the vector to irq mappings. + */ +void setup_vector_irq(int cpu) +{ +#ifndef CONFIG_X86_IO_APIC + int irq; + + /* + * On most of the platforms, legacy PIC delivers the interrupts on the + * boot cpu. But there are certain platforms where PIC interrupts are + * delivered to multiple cpu's. If the legacy IRQ is handled by the + * legacy PIC, for the new cpu that is coming online, setup the static + * legacy vector to irq mapping: + */ + for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++) + per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; +#endif + + __setup_vector_irq(cpu); +} + static void __init smp_intr_init(void) { #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a02e80c..06d98ae 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -247,7 +247,7 @@ static void __cpuinit smp_callin(void) /* * Need to setup vector mappings before we enable interrupts. */ - __setup_vector_irq(smp_processor_id()); + setup_vector_irq(smp_processor_id()); /* * Get our bogomips. * -- cgit v1.1 From ff30a0543e9a6cd732582063e7cae951cdb7acf2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 15 Mar 2010 10:11:15 +0000 Subject: x86: Fix placement of FIX_OHCI1394_BASE Ever for 32-bit with sufficiently high NR_CPUS, and starting with commit 789d03f584484af85dbdc64935270c8e45f36ef7 also for 64-bit, the statically allocated early fixmap page tables were not covering FIX_OHCI1394_BASE, leading to a boot time crash when "ohci1394_dma=early" was used. Despite this entry not being a permanently used one, it needs to be moved into the permanent range since it has to be close to FIX_DBGP_BASE and FIX_EARLYCON_MEM_BASE. Reported-bisected-and-tested-by: Justin P. Mattock Fixes-bug: http://bugzilla.kernel.org/show_bug.cgi?id=14487 Signed-off-by: Jan Beulich Cc: # [as far back as long as it still applies] LKML-Reference: <4B9E15D30200007800034D23@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 635f03b..d07b44f 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -82,6 +82,9 @@ enum fixed_addresses { #endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif @@ -132,9 +135,6 @@ enum fixed_addresses { (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1)) : __end_of_permanent_fixed_addresses, FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif -- cgit v1.1 From 035a02c1e1de31888e8b6adac0ff667971ac04db Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 19 Mar 2010 12:09:22 +0100 Subject: x86, amd: Restrict usage of c1e_idle() Currently c1e_idle returns true for all CPUs greater than or equal to family 0xf model 0x40. This covers too many CPUs. Meanwhile a respective erratum for the underlying problem was filed (#400). This patch adds the logic to check whether erratum #400 applies to a given CPU. Especially for CPUs where SMI/HW triggered C1e is not supported, c1e_idle() doesn't need to be used. We can check this by looking at the respective OSVW bit for erratum #400. Cc: # .32.x .33.x Signed-off-by: Andreas Herrmann LKML-Reference: <20100319110922.GA19614@alberich.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/process.c | 32 ++++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1cd58cd..4604e6a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -105,6 +105,8 @@ #define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_NB_CFG 0xc001001f #define MSR_AMD64_PATCH_LOADER 0xc0010020 +#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 +#define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ad95406..28ad9f4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -526,21 +526,37 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) } /* - * Check for AMD CPUs, which have potentially C1E support + * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. + * For more information see + * - Erratum #400 for NPT family 0xf and family 0x10 CPUs + * - Erratum #365 for family 0x11 (not affected because C1e not in use) */ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) { + u64 val; if (c->x86_vendor != X86_VENDOR_AMD) - return 0; - - if (c->x86 < 0x0F) - return 0; + goto no_c1e_idle; /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0f && c->x86_model < 0x40) - return 0; + if (c->x86 == 0x0F && c->x86_model >= 0x40) + return 1; - return 1; + if (c->x86 == 0x10) { + /* + * check OSVW bit for CPUs that are not affected + * by erratum #400 + */ + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); + if (val >= 2) { + rdmsrl(MSR_AMD64_OSVW_STATUS, val); + if (!(val & BIT(1))) + goto no_c1e_idle; + } + return 1; + } + +no_c1e_idle: + return 0; } static cpumask_var_t c1e_mask; -- cgit v1.1 From a90110c61073eab95d1986322693c2b9a8a6a5f6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 21 Mar 2010 21:51:51 +0100 Subject: x86 / perf: Fix suspend to RAM on HP nx6325 Commit 3f6da3905398826d85731247e7fbcf53400c18bd (perf: Rework and fix the arch CPU-hotplug hooks) broke suspend to RAM on my HP nx6325 (and most likely on other AMD-based boxes too) by allowing amd_pmu_cpu_offline() to be executed for CPUs that are going offline as part of the suspend process. The problem is that cpuhw->amd_nb may be NULL already, so the function should make sure it's not NULL before accessing the object pointed to by it. Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event_amd.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 573458f..b87e0b6 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -348,10 +348,12 @@ static void amd_pmu_cpu_offline(int cpu) raw_spin_lock(&amd_nb_lock); - if (--cpuhw->amd_nb->refcnt == 0) - kfree(cpuhw->amd_nb); + if (cpuhw->amd_nb) { + if (--cpuhw->amd_nb->refcnt == 0) + kfree(cpuhw->amd_nb); - cpuhw->amd_nb = NULL; + cpuhw->amd_nb = NULL; + } raw_spin_unlock(&amd_nb_lock); } -- cgit v1.1 From c9c9b564717e5b6b2ae8b770da1c73a348c84cce Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 22 Mar 2010 16:34:10 -0600 Subject: x86/PCI: remove redundant warnings pci_claim_resource() already prints more detailed error messages, so these are really redundant. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index dece3eb..46fd43f 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -127,9 +127,6 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, - "can't reserve window %pR\n", - r); /* * Something is wrong with the region. * Invalidate the resource to prevent @@ -181,8 +178,6 @@ static void __init pcibios_allocate_resources(int pass) "BAR %d: reserving %pr (d=%d, p=%d)\n", idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, - "can't reserve %pR\n", r); /* We'll assign a new address later */ r->end -= r->start; r->start = 0; -- cgit v1.1 From eb9fc8ef7cb1362374e55d9503e3e7458f319991 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 25 Mar 2010 09:28:24 -0600 Subject: x86/PCI: for host bridge address space collisions, show conflicting resource With insert_resource_conflict(), we can learn what the actual conflict is, so print that info for debugging purposes. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 6e22454..75ac3f8 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -122,7 +122,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data) struct acpi_resource_address64 addr; acpi_status status; unsigned long flags; - struct resource *root; + struct resource *root, *conflict; u64 start, end; status = resource_to_addr(acpi_res, &addr); @@ -157,9 +157,12 @@ setup_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } - if (insert_resource(root, res)) { + conflict = insert_resource_conflict(root, res); + if (conflict) { dev_err(&info->bridge->dev, - "can't allocate host bridge window %pR\n", res); + "address space collision: host bridge window %pR " + "conflicts with %s %pR\n", + res, conflict->name, conflict); } else { pci_bus_add_resource(info->bus, res, 0); info->res_num++; -- cgit v1.1 From d558b483d5a73f5718705b270cb2090f66ea48c8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 25 Mar 2010 09:28:30 -0600 Subject: x86/PCI: truncate _CRS windows with _LEN > _MAX - _MIN + 1 Yanko's GA-MA78GM-S2H (BIOS F11) reports the following resource in a PCI host bridge _CRS: [07] 32-Bit DWORD Address Space Resource Min Relocatability : MinFixed Max Relocatability : MaxFixed Address Minimum : CFF00000 (_MIN) Address Maximum : FEBFFFFF (_MAX) Address Length : 3EE10000 (_LEN) This is invalid per spec (ACPI 4.0, 6.4.3.5) because it's a fixed size, fixed location descriptor, but _LEN != _MAX - _MIN + 1. Based on https://bugzilla.kernel.org/show_bug.cgi?id=15480#c15, I think Windows handles this by truncating the window so it fits between _MIN and _MAX. I also verified this by modifying the SeaBIOS DSDT and booting Windows 2008 R2 with qemu. This patch makes Linux truncate the window, too, which fixes: http://bugzilla.kernel.org/show_bug.cgi?id=15480 Signed-off-by: Bjorn Helgaas Tested-by: Yanko Kaneti Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 75ac3f8..e311602 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -123,7 +123,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data) acpi_status status; unsigned long flags; struct resource *root, *conflict; - u64 start, end; + u64 start, end, max_len; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) @@ -140,6 +140,17 @@ setup_resource(struct acpi_resource *acpi_res, void *data) } else return AE_OK; + max_len = addr.maximum - addr.minimum + 1; + if (addr.address_length > max_len) { + dev_printk(KERN_DEBUG, &info->bridge->dev, + "host bridge window length %#llx doesn't fit in " + "%#llx-%#llx, trimming\n", + (unsigned long long) addr.address_length, + (unsigned long long) addr.minimum, + (unsigned long long) addr.maximum); + addr.address_length = max_len; + } + start = addr.minimum + addr.translation_offset; end = start + addr.address_length - 1; -- cgit v1.1 From 596b711ed6b5235f8545680ef38ace00f9898c32 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 28 Mar 2010 19:42:54 -0700 Subject: x86: Make smp_locks end with page alignment Fix: ------------[ cut here ]------------ WARNING: at arch/x86/mm/init.c:342 free_init_pages+0x4c/0xfa() free_init_pages: range [0x40daf000, 0x40db5c24] is not aligned Modules linked in: Pid: 0, comm: swapper Not tainted 2.6.34-rc2-tip-03946-g4f16b23-dirty #50 Call Trace: [<40232e9f>] warn_slowpath_common+0x65/0x7c [<4021c9f0>] ? free_init_pages+0x4c/0xfa [<40881434>] ? _etext+0x0/0x24 [<40232eea>] warn_slowpath_fmt+0x24/0x27 [<4021c9f0>] free_init_pages+0x4c/0xfa [<40881434>] ? _etext+0x0/0x24 [<40d3f4bd>] alternative_instructions+0xf6/0x100 [<40d3fe4f>] check_bugs+0xbd/0xbf [<40d398a7>] start_kernel+0x2d5/0x2e4 [<40d390ce>] i386_start_kernel+0xce/0xd5 ---[ end trace 4eaa2a86a8e2da22 ]--- Comments in vmlinux.lds.S already said: | /* | * smp_locks might be freed after init | * start/end must be page aligned | */ Signed-off-by: Yinghai Lu Acked-by: Johannes Weiner Cc: David Miller Cc: Benjamin Herrenschmidt Cc: Linus Torvalds LKML-Reference: <1269830604-26214-2-git-send-email-yinghai@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 44879df..2cc2497 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -291,8 +291,8 @@ SECTIONS .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { __smp_locks = .; *(.smp_locks) - __smp_locks_end = .; . = ALIGN(PAGE_SIZE); + __smp_locks_end = .; } #ifdef CONFIG_X86_64 -- cgit v1.1 From c967da6a0ba837f762042e931d4afcf72045547c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 28 Mar 2010 19:42:55 -0700 Subject: x86: Make sure free_init_pages() frees pages on page boundary When CONFIG_NO_BOOTMEM=y, it could use memory more effiently, or in a more compact fashion. Example: Allocated new RAMDISK: 00ec2000 - 0248ce57 Move RAMDISK from 000000002ea04000 - 000000002ffcee56 to 00ec2000 - 0248ce56 The new RAMDISK's end is not page aligned. Last page could be shared with other users. When free_init_pages are called for initrd or .init, the page could be freed and we could corrupt other data. code segment in free_init_pages(): | for (; addr < end; addr += PAGE_SIZE) { | ClearPageReserved(virt_to_page(addr)); | init_page_count(virt_to_page(addr)); | memset((void *)(addr & ~(PAGE_SIZE-1)), | POISON_FREE_INITMEM, PAGE_SIZE); | free_page(addr); | totalram_pages++; | } last half page could be used as one whole free page. So page align the boundaries. -v2: make the original initramdisk to be aligned, according to Johannes, otherwise we have the chance to lose one page. we still need to keep initrd_end not aligned, otherwise it could confuse decompressor. -v3: change to WARN_ON instead, suggested by Johannes. -v4: use PAGE_ALIGN, suggested by Johannes. We may fix that macro name later to PAGE_ALIGN_UP, and PAGE_ALIGN_DOWN Add comments about assuming ramdisk start is aligned in relocate_initrd(), change to re get ramdisk_image instead of save it to make diff smaller. Add warning for wrong range, suggested by Johannes. -v6: remove one WARN() We need to align beginning in free_init_pages() do not copy more than ramdisk_size, noticed by Johannes Reported-by: Stanislaw Gruszka Tested-by: Stanislaw Gruszka Signed-off-by: Yinghai Lu Acked-by: Johannes Weiner Cc: David Miller Cc: Benjamin Herrenschmidt Cc: Linus Torvalds LKML-Reference: <1269830604-26214-3-git-send-email-yinghai@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/head32.c | 4 +++- arch/x86/kernel/head64.c | 3 ++- arch/x86/kernel/setup.c | 10 ++++++---- arch/x86/mm/init.c | 32 ++++++++++++++++++++++++++------ 4 files changed, 37 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index adedeef..b2e2460 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -44,9 +45,10 @@ void __init i386_start_kernel(void) #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b5a9896..7147143 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -103,9 +103,10 @@ void __init x86_64_start_reservations(char *real_mode_data) #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + /* Assume only end is not page aligned */ unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5d7ba1a..d76e185 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -314,16 +314,17 @@ static void __init reserve_brk(void) #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { - + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 area_size = PAGE_ALIGN(ramdisk_size); u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; /* We need to move the initrd down into lowmem */ - ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, + ramdisk_here = find_e820_area(0, end_of_lowmem, area_size, PAGE_SIZE); if (ramdisk_here == -1ULL) @@ -332,7 +333,7 @@ static void __init relocate_initrd(void) /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, + reserve_early(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; @@ -376,9 +377,10 @@ static void __init relocate_initrd(void) static void __init reserve_initrd(void) { + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; if (!boot_params.hdr.type_of_loader || diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e71c5cb..452ee5b 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -331,11 +331,23 @@ int devmem_is_allowed(unsigned long pagenr) void free_init_pages(char *what, unsigned long begin, unsigned long end) { - unsigned long addr = begin; + unsigned long addr; + unsigned long begin_aligned, end_aligned; - if (addr >= end) + /* Make sure boundaries are page aligned */ + begin_aligned = PAGE_ALIGN(begin); + end_aligned = end & PAGE_MASK; + + if (WARN_ON(begin_aligned != begin || end_aligned != end)) { + begin = begin_aligned; + end = end_aligned; + } + + if (begin >= end) return; + addr = begin; + /* * If debugging page accesses then do not free this memory but * mark them not present - any buggy init-section access will @@ -343,7 +355,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) */ #ifdef CONFIG_DEBUG_PAGEALLOC printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); + begin, end); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); #else /* @@ -358,8 +370,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) for (; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); free_page(addr); totalram_pages++; } @@ -376,6 +387,15 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", start, end); + /* + * end could be not aligned, and We can not align that, + * decompresser could be confused by aligned initrd_end + * We already reserve the end partial page before in + * - i386_start_kernel() + * - x86_64_start_kernel() + * - relocate_initrd() + * So here We can do PAGE_ALIGN() safely to get partial page to be freed + */ + free_init_pages("initrd memory", start, PAGE_ALIGN(end)); } #endif -- cgit v1.1 From e49a5bd38159dfb1928fd25b173bc9de4bbadb21 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 22 Mar 2010 19:40:03 +0100 Subject: perf: Use hot regs with software sched switch/migrate events Scheduler's task migration events don't work because they always pass NULL regs perf_sw_event(). The event hence gets filtered in perf_swevent_add(). Scheduler's context switches events use task_pt_regs() to get the context when the event occured which is a wrong thing to do as this won't give us the place in the kernel where we went to sleep but the place where we left userspace. The result is even more wrong if we switch from a kernel thread. Use the hot regs snapshot for both events as they belong to the non-interrupt/exception based events family. Unlike page faults or so that provide the regs matching the exact origin of the event, we need to save the current context. This makes the task migration event working and fix the context switch callchains and origin ip. Example: perf record -a -e cs Before: 10.91% ksoftirqd/0 0 [k] 0000000000000000 | --- (nil) perf_callchain perf_prepare_sample __perf_event_overflow perf_swevent_overflow perf_swevent_add perf_swevent_ctx_event do_perf_sw_event __perf_sw_event perf_event_task_sched_out schedule run_ksoftirqd kthread kernel_thread_helper After: 23.77% hald-addon-stor [kernel.kallsyms] [k] schedule | --- schedule | |--60.00%-- schedule_timeout | wait_for_common | wait_for_completion | blk_execute_rq | scsi_execute | scsi_execute_req | sr_test_unit_ready | | | |--66.67%-- sr_media_change | | media_changed | | cdrom_media_changed | | sr_block_media_changed | | check_disk_change | | cdrom_open v2: Always build perf_arch_fetch_caller_regs() now that software events need that too. They don't need it from modules, unlike trace events, so we keep the EXPORT_SYMBOL in trace_event_perf.c Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar Cc: David Miller --- arch/x86/kernel/cpu/perf_event.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 60398a0..5fb490c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1702,7 +1702,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -#ifdef CONFIG_EVENT_TRACING void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { regs->ip = ip; @@ -1714,4 +1713,3 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski regs->cs = __KERNEL_CS; local_save_flags(regs->flags); } -#endif -- cgit v1.1 From ab310b5edb8b601bcb02491ed6f7676da4fd1757 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Tue, 30 Mar 2010 14:05:07 -0500 Subject: x86,kgdb: Always initialize the hw breakpoint attribute It is required to call hw_breakpoint_init() on an attr before using it in any other calls. This fixes the problem where kgdb will sometimes fail to initialize on x86_64. Signed-off-by: Jason Wessel Cc: Ingo Molnar Cc: 2.6.33 LKML-Reference: <1269975907-27602-1-git-send-email-jason.wessel@windriver.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kgdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index bfba601..b2258ca 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -618,8 +618,8 @@ int kgdb_arch_init(void) * portion of kgdb because this operation requires mutexs to * complete. */ + hw_breakpoint_init(&attr); attr.bp_addr = (unsigned long)kgdb_arch_init; - attr.type = PERF_TYPE_BREAKPOINT; attr.bp_len = HW_BREAKPOINT_LEN_1; attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; -- cgit v1.1 From 85257024096a96fc5c00ce59d685f62bbed3ad95 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Mar 2010 19:30:52 +0100 Subject: x86: Move notify_cpu_starting() callback to a later stage Because we need to have cpu identification things done by the time we run CPU_STARTING notifiers. ( This init ordering will be relied on by the next fix. ) Signed-off-by: Peter Zijlstra LKML-Reference: <1269353485.5109.48.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 06d98ae..6808b93 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -242,8 +242,6 @@ static void __cpuinit smp_callin(void) end_local_APIC_setup(); map_cpu_to_logical_apicid(); - notify_cpu_starting(cpuid); - /* * Need to setup vector mappings before we enable interrupts. */ @@ -264,6 +262,8 @@ static void __cpuinit smp_callin(void) */ smp_store_cpu_info(cpuid); + notify_cpu_starting(cpuid); + /* * Allow the master to continue. */ -- cgit v1.1 From b38b24ead33417146e051453d04bf60b8d2d7e25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Mar 2010 19:31:15 +0100 Subject: perf, x86: Fix AMD hotplug & constraint initialization Commit 3f6da39 ("perf: Rework and fix the arch CPU-hotplug hooks") moved the amd northbridge allocation from CPUS_ONLINE to CPUS_PREPARE_UP however amd_nb_id() doesn't work yet on prepare so it would simply bail basically reverting to a state where we do not properly track node wide constraints - causing weird perf results. Fix up the AMD NorthBridge initialization code by allocating from CPU_UP_PREPARE and installing it from CPU_STARTING once we have the proper nb_id. It also properly deals with the allocation failing. Signed-off-by: Peter Zijlstra [ robustify using amd_has_nb() ] Signed-off-by: Stephane Eranian LKML-Reference: <1269353485.5109.48.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 8 ++-- arch/x86/kernel/cpu/perf_event_amd.c | 80 +++++++++++++++++++++--------------- 2 files changed, 52 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5fb490c..bd28cf9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -158,7 +158,7 @@ struct x86_pmu { struct perf_event *event); struct event_constraint *event_constraints; - void (*cpu_prepare)(int cpu); + int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); @@ -1333,11 +1333,12 @@ static int __cpuinit x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; + int ret = NOTIFY_OK; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: if (x86_pmu.cpu_prepare) - x86_pmu.cpu_prepare(cpu); + ret = x86_pmu.cpu_prepare(cpu); break; case CPU_STARTING: @@ -1350,6 +1351,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) x86_pmu.cpu_dying(cpu); break; + case CPU_UP_CANCELED: case CPU_DEAD: if (x86_pmu.cpu_dead) x86_pmu.cpu_dead(cpu); @@ -1359,7 +1361,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; } - return NOTIFY_OK; + return ret; } static void __init pmu_check_apic(void) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index b87e0b6..db6f7d4 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -137,6 +137,13 @@ static inline int amd_is_nb_event(struct hw_perf_event *hwc) return (hwc->config & 0xe0) == 0xe0; } +static inline int amd_has_nb(struct cpu_hw_events *cpuc) +{ + struct amd_nb *nb = cpuc->amd_nb; + + return nb && nb->nb_id != -1; +} + static void amd_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { @@ -147,7 +154,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, /* * only care about NB events */ - if (!(nb && amd_is_nb_event(hwc))) + if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) return; /* @@ -214,7 +221,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) /* * if not NB event or no NB, then no constraints */ - if (!(nb && amd_is_nb_event(hwc))) + if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) return &unconstrained; /* @@ -293,51 +300,55 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) return nb; } -static void amd_pmu_cpu_online(int cpu) +static int amd_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + WARN_ON_ONCE(cpuc->amd_nb); + + if (boot_cpu_data.x86_max_cores < 2) + return NOTIFY_OK; + + cpuc->amd_nb = amd_alloc_nb(cpu, -1); + if (!cpuc->amd_nb) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static void amd_pmu_cpu_starting(int cpu) { - struct cpu_hw_events *cpu1, *cpu2; - struct amd_nb *nb = NULL; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct amd_nb *nb; int i, nb_id; if (boot_cpu_data.x86_max_cores < 2) return; - /* - * function may be called too early in the - * boot process, in which case nb_id is bogus - */ nb_id = amd_get_nb_id(cpu); - if (nb_id == BAD_APICID) - return; - - cpu1 = &per_cpu(cpu_hw_events, cpu); - cpu1->amd_nb = NULL; + WARN_ON_ONCE(nb_id == BAD_APICID); raw_spin_lock(&amd_nb_lock); for_each_online_cpu(i) { - cpu2 = &per_cpu(cpu_hw_events, i); - nb = cpu2->amd_nb; - if (!nb) + nb = per_cpu(cpu_hw_events, i).amd_nb; + if (WARN_ON_ONCE(!nb)) continue; - if (nb->nb_id == nb_id) - goto found; - } - nb = amd_alloc_nb(cpu, nb_id); - if (!nb) { - pr_err("perf_events: failed NB allocation for CPU%d\n", cpu); - raw_spin_unlock(&amd_nb_lock); - return; + if (nb->nb_id == nb_id) { + kfree(cpuc->amd_nb); + cpuc->amd_nb = nb; + break; + } } -found: - nb->refcnt++; - cpu1->amd_nb = nb; + + cpuc->amd_nb->nb_id = nb_id; + cpuc->amd_nb->refcnt++; raw_spin_unlock(&amd_nb_lock); } -static void amd_pmu_cpu_offline(int cpu) +static void amd_pmu_cpu_dead(int cpu) { struct cpu_hw_events *cpuhw; @@ -349,8 +360,10 @@ static void amd_pmu_cpu_offline(int cpu) raw_spin_lock(&amd_nb_lock); if (cpuhw->amd_nb) { - if (--cpuhw->amd_nb->refcnt == 0) - kfree(cpuhw->amd_nb); + struct amd_nb *nb = cpuhw->amd_nb; + + if (nb->nb_id == -1 || --nb->refcnt == 0) + kfree(nb); cpuhw->amd_nb = NULL; } @@ -379,8 +392,9 @@ static __initconst struct x86_pmu amd_pmu = { .get_event_constraints = amd_get_event_constraints, .put_event_constraints = amd_put_event_constraints, - .cpu_prepare = amd_pmu_cpu_online, - .cpu_dead = amd_pmu_cpu_offline, + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .cpu_dead = amd_pmu_cpu_dead, }; static __init int amd_pmu_init(void) -- cgit v1.1 From 257ef9d21f1b008a6c7425544b36641c4325a922 Mon Sep 17 00:00:00 2001 From: Torok Edwin Date: Wed, 17 Mar 2010 12:07:16 +0200 Subject: perf, x86: Fix callgraphs of 32-bit processes on 64-bit kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When profiling a 32-bit process on a 64-bit kernel, callgraph tracing stopped after the first function, because it has seen a garbage memory address (tried to interpret the frame pointer, and return address as a 64-bit pointer). Fix this by using a struct stack_frame with 32-bit pointers when the TIF_IA32 flag is set. Note that TIF_IA32 flag must be used, and not is_compat_task(), because the latter is only set when the 32-bit process is executing a syscall, which may not always be the case (when tracing page fault events for example). Signed-off-by: Török Edwin Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: "H. Peter Anvin" Cc: Paul Mackerras Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org LKML-Reference: <1268820436-13145-1-git-send-email-edwintorok@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 44 +++++++++++++++++++++++++++++++++++----- arch/x86/kernel/dumpstack.h | 5 +++++ 2 files changed, 44 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index bd28cf9..53ea4cf 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -28,6 +28,7 @@ #include #include #include +#include static u64 perf_event_mask __read_mostly; @@ -1630,14 +1631,42 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return len; } -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +#ifdef CONFIG_COMPAT +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { - unsigned long bytes; + /* 32-bit process in 64-bit kernel. */ + struct stack_frame_ia32 frame; + const void __user *fp; + + if (!test_thread_flag(TIF_IA32)) + return 0; + + fp = compat_ptr(regs->bp); + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; + frame.next_frame = 0; + frame.return_address = 0; - bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) + break; + + if (fp < compat_ptr(regs->sp)) + break; - return bytes == sizeof(*frame); + callchain_store(entry, frame.return_address); + fp = compat_ptr(frame.next_frame); + } + return 1; } +#else +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + return 0; +} +#endif static void perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) @@ -1653,11 +1682,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, PERF_CONTEXT_USER); callchain_store(entry, regs->ip); + if (perf_callchain_user32(regs, entry)) + return; + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) break; if ((unsigned long)fp < regs->sp) diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 29e5f7c..e39e771 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -30,6 +30,11 @@ struct stack_frame { unsigned long return_address; }; +struct stack_frame_ia32 { + u32 next_frame; + u32 return_address; +}; + static inline unsigned long rewind_frame_pointer(int n) { struct stack_frame *frame; -- cgit v1.1