From 73a0e614580fb650846be1e9315f6b7b6069b9cc Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 6 Apr 2010 13:24:08 -0600 Subject: x86/PCI: ignore Consumer/Producer bit in ACPI window descriptions ACPI Address Space Descriptors (used in _CRS) have a Consumer/Producer bit that is supposed to distinguish regions that are consumed directly by a device from those that are forwarded ("produced") by a bridge. But BIOSes have apparently not used this consistently, and Windows seems to ignore it, so I think Linux should ignore it as well. I can't point to any of these supposed broken BIOSes, but since we now rely on _CRS by default, I think it's safer to ignore this bit from the start. Here are details of my experiments with how Windows handles it: https://bugzilla.kernel.org/show_bug.cgi?id=15701 Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index c7b1ebf..334153c 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -71,8 +71,7 @@ resource_to_addr(struct acpi_resource *resource, if (ACPI_SUCCESS(status) && (addr->resource_type == ACPI_MEMORY_RANGE || addr->resource_type == ACPI_IO_RANGE) && - addr->address_length > 0 && - addr->producer_consumer == ACPI_PRODUCER) { + addr->address_length > 0) { return AE_OK; } return AE_ERROR; -- cgit v1.1 From ae7c9b70dcb4313ea3dbcc9a2f240dae6c2b50c0 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 19 Apr 2010 11:23:43 -0700 Subject: x86, mrst: Conditionally register cpu hotplug notifier for apbt APB timer is used on Moorestown platforms but not on a standard PC. If APB timer code is compiled in but not initialized at run-time due to lack of FW reported SFI table, kernel would panic when the non-boot CPUs are offlined and notifier is called. https://bugzilla.kernel.org/show_bug.cgi?id=15786 This patch ensures CPU hotplug notifier for APB timer is only registered when the APBT timer block is initialized. Signed-off-by: Jacob Pan LKML-Reference: <1271701423-1162-1-git-send-email-jacob.jun.pan@linux.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apb_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index ff469e4..a353475 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -429,7 +429,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n, static __init int apbt_late_init(void) { - if (disable_apbt_percpu) + if (disable_apbt_percpu || !apb_timer_block_enabled) return 0; /* This notifier should be called after workqueue is ready */ hotcpu_notifier(apbt_cpuhp_notify, -20); -- cgit v1.1 From 66528fdd45b082bf7c74687d72ae08afa4a446f8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 20 Apr 2010 13:52:41 -0600 Subject: x86/PCI: parse additional host bridge window resource types This adds support for Memory24, Memory32, and Memory32Fixed descriptors in PCI host bridge _CRS. I experimentally determined that Windows (2008 R2) accepts these descriptors and treats them as windows that are forwarded to the PCI bus, e.g., if it finds any PCI devices with BARs outside the windows, it moves them into the windows. I don't know whether any machines actually use these descriptors in PCI host bridge _CRS methods, but if any exist and they're new enough that we automatically turn on "pci=use_crs", they will work with Windows but not with Linux. Here are the details: https://bugzilla.kernel.org/show_bug.cgi?id=15817 Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 334153c..44f83ce 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -66,13 +66,44 @@ resource_to_addr(struct acpi_resource *resource, struct acpi_resource_address64 *addr) { acpi_status status; - - status = acpi_resource_to_address64(resource, addr); - if (ACPI_SUCCESS(status) && - (addr->resource_type == ACPI_MEMORY_RANGE || - addr->resource_type == ACPI_IO_RANGE) && - addr->address_length > 0) { + struct acpi_resource_memory24 *memory24; + struct acpi_resource_memory32 *memory32; + struct acpi_resource_fixed_memory32 *fixed_memory32; + + memset(addr, 0, sizeof(*addr)); + switch (resource->type) { + case ACPI_RESOURCE_TYPE_MEMORY24: + memory24 = &resource->data.memory24; + addr->resource_type = ACPI_MEMORY_RANGE; + addr->minimum = memory24->minimum; + addr->address_length = memory24->address_length; + addr->maximum = addr->minimum + addr->address_length - 1; + return AE_OK; + case ACPI_RESOURCE_TYPE_MEMORY32: + memory32 = &resource->data.memory32; + addr->resource_type = ACPI_MEMORY_RANGE; + addr->minimum = memory32->minimum; + addr->address_length = memory32->address_length; + addr->maximum = addr->minimum + addr->address_length - 1; return AE_OK; + case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: + fixed_memory32 = &resource->data.fixed_memory32; + addr->resource_type = ACPI_MEMORY_RANGE; + addr->minimum = fixed_memory32->address; + addr->address_length = fixed_memory32->address_length; + addr->maximum = addr->minimum + addr->address_length - 1; + return AE_OK; + case ACPI_RESOURCE_TYPE_ADDRESS16: + case ACPI_RESOURCE_TYPE_ADDRESS32: + case ACPI_RESOURCE_TYPE_ADDRESS64: + status = acpi_resource_to_address64(resource, addr); + if (ACPI_SUCCESS(status) && + (addr->resource_type == ACPI_MEMORY_RANGE || + addr->resource_type == ACPI_IO_RANGE) && + addr->address_length > 0) { + return AE_OK; + } + break; } return AE_ERROR; } -- cgit v1.1 From 7ce5a2b9bb2e92902230e3121d8c3047fab9cb47 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 23 Apr 2010 16:17:40 -0700 Subject: x86-64: Clear a 64-bit FS/GS base on fork if selector is nonzero When we do a thread switch, we clear the outgoing FS/GS base if the corresponding selector is nonzero. This is taken by __switch_to() as an entry invariant; it does not verify that it is true on entry. However, copy_thread() doesn't enforce this constraint, which can result in inconsistent results after fork(). Make copy_thread() match the behavior of __switch_to(). Reported-and-tested-by: Samuel Thibault Signed-off-by: H. Peter Anvin LKML-Reference: <4BD1E061.8030605@zytor.com> Cc: --- arch/x86/kernel/process_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index dc9690b..17cb329 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -276,12 +276,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, set_tsk_thread_flag(p, TIF_FORK); - p->thread.fs = me->thread.fs; - p->thread.gs = me->thread.gs; p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); + p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs; savesegment(fs, p->thread.fsindex); + p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); -- cgit v1.1 From 7a0fc404ae663776e96db43879a0fa24fec1fa3a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 13 Apr 2010 14:40:54 -0700 Subject: x86: Disable large pages on CPUs with Atom erratum AAE44 Atom erratum AAE44/AAF40/AAG38/AAH41: "If software clears the PS (page size) bit in a present PDE (page directory entry), that will cause linear addresses mapped through this PDE to use 4-KByte pages instead of using a large page after old TLB entries are invalidated. Due to this erratum, if a code fetch uses this PDE before the TLB entry for the large page is invalidated then it may fetch from a different physical address than specified by either the old large page translation or the new 4-KByte page translation. This erratum may also cause speculative code fetches from incorrect addresses." [http://download.intel.com/design/processor/specupdt/319536.pdf] Where as commit 211b3d03c7400f48a781977a50104c9d12f4e229 seems to workaround errata AAH41 (mixed 4K TLBs) it reduces the window of opportunity for the bug to occur and does not totally remove it. This patch disables mixed 4K/4MB page tables totally avoiding the page splitting and not tripping this processor issue. This is based on an original patch by Colin King. Originally-by: Colin Ian King Cc: Colin Ian King Cc: Ingo Molnar Signed-off-by: H. Peter Anvin LKML-Reference: <1269271251-19775-1-git-send-email-colin.king@canonical.com> Cc: --- arch/x86/kernel/cpu/intel.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7e1cca1..1366c7c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -47,6 +47,27 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + /* + * Atom erratum AAE44/AAF40/AAG38/AAH41: + * + * A race condition between speculative fetches and invalidating + * a large page. This is worked around in microcode, but we + * need the microcode to have already been loaded... so if it is + * not, recommend a BIOS update and disable large pages. + */ + if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) { + u32 ucode, junk; + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + sync_core(); + rdmsr(MSR_IA32_UCODE_REV, junk, ucode); + + if (ucode < 0x20e) { + printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); + clear_cpu_cap(c, X86_FEATURE_PSE); + } + } + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #else -- cgit v1.1 From 453dc65931915abc61f92e12bba1fc4747ff5542 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 23 Apr 2010 13:18:08 -0400 Subject: VMware Balloon driver This is a standalone version of VMware Balloon driver. Ballooning is a technique that allows hypervisor dynamically limit the amount of memory available to the guest (with guest cooperation). In the overcommit scenario, when hypervisor set detects that it needs to shuffle some memory, it instructs the driver to allocate certain number of pages, and the underlying memory gets returned to the hypervisor. Later hypervisor may return memory to the guest by reattaching memory to the pageframes and instructing the driver to "deflate" balloon. We are submitting a standalone driver because KVM maintainer (Avi Kivity) expressed opinion (rightly) that our transport does not fit well into virtqueue paradigm and thus it does not make much sense to integrate with virtio. There were also some concerns whether current ballooning technique is the right thing. If there appears a better framework to achieve this we are prepared to evaluate and switch to using it, but in the meantime we'd like to get this driver upstream. We want to get the driver accepted in distributions so that users do not have to deal with an out-of-tree module and many distributions have "upstream first" requirement. The driver has been shipping for a number of years and users running on VMware platform will have it installed as part of VMware Tools even if it will not come from a distribution, thus there should not be additional risk in pulling the driver into mainline. The driver will only activate if host is VMware so everyone else should not be affected at all. Signed-off-by: Dmitry Torokhov Cc: Avi Kivity Cc: Jeremy Fitzhardinge Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/vmware.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 1cbed97..dfdb4db 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -22,6 +22,7 @@ */ #include +#include #include #include #include @@ -101,6 +102,7 @@ int vmware_platform(void) return 0; } +EXPORT_SYMBOL(vmware_platform); /* * VMware hypervisor takes care of exporting a reliable TSC to the guest. -- cgit v1.1 From 55051feb57eba600b366006757304a0af3ada2bd Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 23 Apr 2010 17:05:24 -0600 Subject: x86/PCI: never allocate PCI MMIO resources below BIOS_END When we move a PCI device or assign resources to a device not configured by the BIOS, we want to avoid the BIOS region below 1MB. Note that if the BIOS places devices below 1MB, we leave them there. See https://bugzilla.kernel.org/show_bug.cgi?id=15744 and https://bugzilla.kernel.org/show_bug.cgi?id=15841 Tested-by: Andy Isaacson Tested-by: Andy Bailey Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 46fd43f..97da2ba 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -72,6 +72,9 @@ pcibios_align_resource(void *data, const struct resource *res, return start; if (start & 0x300) start = (start + 0x3ff) & ~0x3ff; + } else if (res->flags & IORESOURCE_MEM) { + if (start < BIOS_END) + start = BIOS_END; } return start; } -- cgit v1.1 From 48728e077480910df45baabc5f87b04276348c90 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 27 Apr 2010 14:45:43 -0600 Subject: x86/PCI: compute Address Space length rather than using _LEN ACPI _CRS Address Space Descriptors have _MIN, _MAX, and _LEN. Linux has been computing Address Spaces as [_MIN to _MIN + _LEN - 1]. Based on the tests in the bug reports below, Windows apparently uses [_MIN to _MAX]. Per spec (ACPI 4.0, Table 6-40), for _CRS fixed-size, fixed location descriptors, "_LEN must be (_MAX - _MIN + 1)", and when that's true, it doesn't matter which way we compute the end. But of course, there are BIOSes that don't follow this rule, and we're better off if Linux handles those exceptions the same way as Windows. This patch makes Linux use [_MIN to _MAX], as Windows seems to do. This effectively reverts d558b483d5 and 03db42adfe and replaces them with simpler code. https://bugzilla.kernel.org/show_bug.cgi?id=14337 (round) https://bugzilla.kernel.org/show_bug.cgi?id=15480 (truncate) Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 40 ++-------------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 44f83ce..31930fd 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -121,30 +121,6 @@ count_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } -static void -align_resource(struct acpi_device *bridge, struct resource *res) -{ - int align = (res->flags & IORESOURCE_MEM) ? 16 : 4; - - /* - * Host bridge windows are not BARs, but the decoders on the PCI side - * that claim this address space have starting alignment and length - * constraints, so fix any obvious BIOS goofs. - */ - if (!IS_ALIGNED(res->start, align)) { - dev_printk(KERN_DEBUG, &bridge->dev, - "host bridge window %pR invalid; " - "aligning start to %d-byte boundary\n", res, align); - res->start &= ~(align - 1); - } - if (!IS_ALIGNED(res->end + 1, align)) { - dev_printk(KERN_DEBUG, &bridge->dev, - "host bridge window %pR invalid; " - "aligning end to %d-byte boundary\n", res, align); - res->end = ALIGN(res->end, align) - 1; - } -} - static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) { @@ -154,7 +130,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data) acpi_status status; unsigned long flags; struct resource *root, *conflict; - u64 start, end, max_len; + u64 start, end; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) @@ -171,19 +147,8 @@ setup_resource(struct acpi_resource *acpi_res, void *data) } else return AE_OK; - max_len = addr.maximum - addr.minimum + 1; - if (addr.address_length > max_len) { - dev_printk(KERN_DEBUG, &info->bridge->dev, - "host bridge window length %#llx doesn't fit in " - "%#llx-%#llx, trimming\n", - (unsigned long long) addr.address_length, - (unsigned long long) addr.minimum, - (unsigned long long) addr.maximum); - addr.address_length = max_len; - } - start = addr.minimum + addr.translation_offset; - end = start + addr.address_length - 1; + end = addr.maximum + addr.translation_offset; res = &info->res[info->res_num]; res->name = info->name; @@ -191,7 +156,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->start = start; res->end = end; res->child = NULL; - align_resource(info->bridge, res); if (!pci_use_crs) { dev_printk(KERN_DEBUG, &info->bridge->dev, -- cgit v1.1