From e0c1e9bf81badc7ba59e120d6218101903d5d103 Mon Sep 17 00:00:00 2001
From: Kimball Murray
Date: Mon, 8 May 2006 15:17:16 +0200
Subject: [PATCH] x86_64: avoid IRQ0 ioapic pin collision

The patch addresses a problem with the ACPI SCI interrupt entry, which
gets re-used so that its IRQ is assigned to another, unrelated device.
The patch corrects the code so that the SCI IRQ is skipped and the
duplicate entry is avoided.

A second issue came up with the VIA chipset: the problem was caused by
the original patch assigning IRQs starting at 16 and up.  The VIA
chipset uses a 4-bit IRQ register for internal interrupt routing, and
therefore cannot handle the larger IRQ numbers assigned to its devices.
The patch corrects this problem by allowing PCI IRQs below 16.

Cc: len.brown@intel.com
Signed-off-by: Natalie Protasevich
Signed-off-by: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/io_apic.c |  5 +++++
 arch/x86_64/kernel/mpparse.c | 12 +++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 77b4c60..0de3ea9 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -1777,6 +1777,8 @@ static inline void unlock_ExtINT_logic(void)
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
+int timer_uses_ioapic_pin_0;
+
 /*
  * This code may look a bit paranoid, but it's supposed to cooperate with
  * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
@@ -1814,6 +1816,9 @@ static inline void check_timer(void)
 	pin2 = ioapic_i8259.pin;
 	apic2 = ioapic_i8259.apic;
 
+	if (pin1 == 0)
+		timer_uses_ioapic_pin_0 = 1;
+
 	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
 		vector, apic1, pin1, apic2, pin2);
 
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index b17cf3e..083da7e 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -968,7 +968,17 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
 		 */
 		int irq = gsi;
 		if (gsi < MAX_GSI_NUM) {
-			if (gsi > 15)
+			/*
+			 * Retain the VIA chipset work-around (gsi > 15), but
+			 * avoid a problem where the 8254 timer (IRQ0) is setup
+			 * via an override (so it's not on pin 0 of the ioapic),
+			 * and at the same time, the pin 0 interrupt is a PCI
+			 * type.  The gsi > 15 test could cause these two pins
+			 * to be shared as IRQ0, and they are not shareable.
+			 * So test for this condition, and if necessary, avoid
+			 * the pin collision.
+			 */
+			if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
 				gsi = pci_irq++;
 			/*
 			 * Don't assign IRQ used by ACPI SCI
-- cgit v1.1


From 5192d84e4c32cd335fd572e5ff0712041f45f7e7 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Mon, 8 May 2006 15:17:19 +0200
Subject: [PATCH] x86_64: Check for too many northbridges in IOMMU code

The IOMMU code can only deal with 8 northbridges.  Error out when more
are found.

Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/pci-gart.c | 8 ++++++++
 1 file changed, 8 insertions(+)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 9d3d76c..2480d3f 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -639,6 +639,14 @@ static int __init pci_iommu_init(void)
 		return -1;
 	}
 
+	i = 0;
+	for_all_nb(dev)
+		i++;
+	if (i > MAX_NB) {
+		printk(KERN_ERR "PCI-GART: Too many northbridges (%ld). Disabled\n", i);
+		return -1;
+	}
+
 	printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
 	aper_size = info.aper_size * 1024 * 1024;
 	iommu_size = check_iommu_size(info.aper_base, aper_size);
-- cgit v1.1

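The northbridge check above is a count-then-validate pattern: walk the
device list once before committing to initialization, and refuse to
continue when the hardware exceeds what the driver's fixed-size
per-northbridge tables can describe.  Below is a minimal standalone
sketch of the same shape; gart_like_init() and the hard-coded device
count are hypothetical stand-ins for the kernel's for_all_nb() iterator
and pci_iommu_init(), not real kernel API.

    #include <stdio.h>

    #define MAX_NB 8  /* the GART code keeps per-northbridge state in fixed arrays */

    /* Hypothetical probe result standing in for for_all_nb() iteration. */
    static const int detected_northbridges = 10;

    static int gart_like_init(void)
    {
        long i = 0;
        int nb;

        /* Count first; only initialize if the count fits the tables. */
        for (nb = 0; nb < detected_northbridges; nb++)
            i++;
        if (i > MAX_NB) {
            fprintf(stderr, "Too many northbridges (%ld). Disabled\n", i);
            return -1;
        }
        return 0;
    }

    int main(void)
    {
        return gart_like_init() ? 1 : 0;
    }
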
Disabled\n", i); + return -1; + } + printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); aper_size = info.aper_size * 1024 * 1024; iommu_size = check_iommu_size(info.aper_base, aper_size); -- cgit v1.1 From cdc60a4c8e71c4bcf67e83fac6c0cabd0ff19bfe Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Mon, 8 May 2006 15:17:22 +0200 Subject: [PATCH] x86_64: fix die_lock nesting I noticed this when poking around in this area. The oops_begin() function in x86_64 would only conditionally claim the die_lock if the call is nested, but oops_end() would always release the spinlock. This patch adds a nest count for the die lock so that the release of the lock is only done on the final oops_end(). Signed-off-by: Corey Minyard Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 2700b13..0ebb281 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -385,6 +385,7 @@ void out_of_line_bug(void) static DEFINE_SPINLOCK(die_lock); static int die_owner = -1; +static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { @@ -399,6 +400,7 @@ unsigned __kprobes long oops_begin(void) else spin_lock(&die_lock); } + die_nest_count++; die_owner = cpu; console_verbose(); bust_spinlocks(1); @@ -409,7 +411,13 @@ void __kprobes oops_end(unsigned long flags) { die_owner = -1; bust_spinlocks(0); - spin_unlock_irqrestore(&die_lock, flags); + die_nest_count--; + if (die_nest_count) + /* We still own the lock */ + local_irq_restore(flags); + else + /* Nest count reaches zero, release the lock. */ + spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) panic("Oops"); } -- cgit v1.1 From 8b1ffe9550e71224c43d8c754245bd76f4ea9bb8 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Mon, 8 May 2006 15:17:25 +0200 Subject: [PATCH] x86_64: add nmi_exit to die_nmi Playing with NMI watchdog on x86_64, I discovered that it didn't do what I expected. It always panic-ed, even when it didn't happen from interrupt context. This patch solves that problem for me. Also, in this case, do_exit() will be called with interrupts disabled, I believe. Would it be wise to also call local_irq_enable() after nmi_exit()? [Yes I added it -AK] Currently, on x86_64, any NMI watchdog timeout will cause a panic because the irq count will always be set to be in an interrupt when do_exit() is called from die_nmi(). If we add nmi_exit() to the die_nmi() call (since the nmi will never exit "normally") it seems to solve this problem. 
From ac71d12c990526b01ef6cfe50907ef8530a30331 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Mon, 8 May 2006 15:17:28 +0200
Subject: [PATCH] x86_64: Avoid EBDA area in early boot allocator

Based on analysis & patch from Robert Hentosch.

Observed on a Dell PE6850 with 16GB.

The problem occurs very early on, when the kernel allocates space for
the temporary memory map called bootmap.  The bootmap overlaps the
EBDA region, and the EBDA region is not historically reserved in the
e820 mapping.  When the bootmap is freed it marks the EBDA region as
usable.

If you notice, in setup.c there is already code to work around the
EBDA in reserve_ebda_region(); this check however occurs after the
bootmap is allocated and doesn't prevent the bootmap from using the
EBDA range.

AK: I redid the original patch.  Thanks also to Jan Beulich for
spotting some mistakes.

Cc: Robert_Hentosch@dell.com
Cc: jbeulich@novell.com
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/e820.c  |  6 ++++++
 arch/x86_64/kernel/setup.c | 30 ++++++++++++++++++++++--------
 2 files changed, 28 insertions(+), 8 deletions(-)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 62776c0..222b5b4 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -76,6 +76,12 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
 		*addrp = __pa_symbol(&_end);
 		return 1;
 	}
+
+	if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
+		*addrp = ebda_addr + ebda_size;
+		return 1;
+	}
+
 	/* XXX ramdisk image here? */
 	return 0;
 }
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index ebc3c33..f0870be 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -571,17 +571,28 @@ static inline void copy_edd(void)
 #endif
 
 #define EBDA_ADDR_POINTER 0x40E
-static void __init reserve_ebda_region(void)
+
+unsigned __initdata ebda_addr;
+unsigned __initdata ebda_size;
+
+static void discover_ebda(void)
 {
-	unsigned int addr;
-	/**
+	/*
 	 * there is a real-mode segmented pointer pointing to the
 	 * 4K EBDA area at 0x40E
 	 */
-	addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER);
-	addr <<= 4;
-	if (addr)
-		reserve_bootmem_generic(addr, PAGE_SIZE);
+	ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER;
+	ebda_addr <<= 4;
+
+	ebda_size = *(unsigned short *)(unsigned long)ebda_addr;
+
+	/* Round EBDA up to pages */
+	if (ebda_size == 0)
+		ebda_size = 1;
+	ebda_size <<= 10;
+	ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
+	if (ebda_size > 64*1024)
+		ebda_size = 64*1024;
 }
 
 void __init setup_arch(char **cmdline_p)
@@ -627,6 +638,8 @@ void __init setup_arch(char **cmdline_p)
 
 	check_efer();
 
+	discover_ebda();
+
 	init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
 
 	dmi_scan_machine();
@@ -669,7 +682,8 @@ void __init setup_arch(char **cmdline_p)
 	reserve_bootmem_generic(0, PAGE_SIZE);
 
 	/* reserve ebda region */
-	reserve_ebda_region();
+	if (ebda_addr)
+		reserve_bootmem_generic(ebda_addr, ebda_size);
 
 #ifdef CONFIG_SMP
 	/*
-- cgit v1.1

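The arithmetic in discover_ebda() is easy to check on paper: the BDA
word at 0x40E holds the EBDA's real-mode segment, the first word of the
EBDA holds its size in KB, and the reservation is rounded out to whole
pages and capped at the conventional 64KB maximum.  The standalone
sketch below replays that arithmetic on sample values instead of
reading physical memory (which only works in early boot); the segment
and size values are made up for illustration.

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    static unsigned long round_up_page(unsigned long x)
    {
        return (x + PAGE_SIZE - 1) & PAGE_MASK;
    }

    int main(void)
    {
        uint16_t seg = 0x9fc0;          /* example segment from the word at 0x40E */
        unsigned long ebda_addr = (unsigned long)seg << 4;   /* 0x9fc00 */
        unsigned long ebda_size = 1;    /* example: first EBDA word says 1 KB */

        ebda_size <<= 10;               /* KB -> bytes */
        /* cover the partial page the EBDA starts on, then round up */
        ebda_size = round_up_page(ebda_size + (ebda_addr & ~PAGE_MASK));
        if (ebda_size > 64 * 1024)      /* EBDA is at most 64KB by convention */
            ebda_size = 64 * 1024;

        printf("reserve %#lx..%#lx\n", ebda_addr, ebda_addr + ebda_size);
        return 0;
    }
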
From f0fdabf8bf187c9aafeb139a828c530ef45cf022 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Mon, 15 May 2006 18:19:38 +0200
Subject: [PATCH] x86_64: Don't warn for overflow in nommu case when dma_mask is < 32bit

This triggers for b44's 1GB DMA workaround, which tries to map first
and then bounces.  The 32bit heuristic is reasonable because the IOMMU
doesn't attempt to handle < 32bit masks anyway.

Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/pci-nommu.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index 44adcc2..1f6ecc6 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -12,9 +12,10 @@ static int
 check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
 {
 	if (hwdev && bus + size > *hwdev->dma_mask) {
-		printk(KERN_ERR
-		    "nommu_%s: overflow %Lx+%lu of device mask %Lx\n",
-			name, (long long)bus, size, (long long)*hwdev->dma_mask);
+		if (*hwdev->dma_mask >= 0xffffffffULL)
+			printk(KERN_ERR
+			    "nommu_%s: overflow %Lx+%lu of device mask %Lx\n",
+				name, (long long)bus, size, (long long)*hwdev->dma_mask);
 		return 0;
 	}
 	return 1;
-- cgit v1.1

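The logic is a simple bounds check: a mapping overflows when the end
of the bus range exceeds the device's DMA mask, and the patch merely
silences the warning for masks below 32 bits, where an overflow is
expected and the driver bounces.  A minimal sketch of the check, with
check_addr_like() as an illustrative stand-in for the kernel function
and made-up sample values:

    #include <stdint.h>
    #include <stdio.h>

    /* Returns 1 if the bus range fits under the device's DMA mask,
     * 0 if the caller must bounce or fail the mapping. */
    static int check_addr_like(uint64_t bus, size_t size, uint64_t dma_mask)
    {
        if (bus + size > dma_mask) {
            if (dma_mask >= 0xffffffffULL)   /* only warn for >= 32bit masks */
                fprintf(stderr, "overflow %llx+%zu of mask %llx\n",
                        (unsigned long long)bus, size,
                        (unsigned long long)dma_mask);
            return 0;
        }
        return 1;
    }

    int main(void)
    {
        /* b44-style case: 30bit mask, buffer above 1GB -> silent bounce */
        printf("%d\n", check_addr_like(0x60000000ULL, 4096, 0x3fffffffULL));
        return 0;
    }
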
From fad7906d16e8c4926aeb5b0f1756eb9f55b2837d Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Mon, 15 May 2006 18:19:44 +0200
Subject: [PATCH] x86_64: Fix memory hotadd heuristics

This fixes some boot failures on Dell and Unisys systems with memory
hotadd.

- Set hotadd_percent to 0 by default.  This means anybody using hotadd
  memory needs to specify the value on the command line.  That's
  because there are lots of Intel boxes which have a bogus hotplug
  area in their SRAT and they would waste a lot of memory before.
- Fix the calculation of how much memory to use when the hotplug area
  exceeds hotadd_percent.
- Fix the fallback if memory hotadd is not compiled in.

Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/mm/srat.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 15ae9fc..e151353 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -34,7 +34,10 @@ static nodemask_t nodes_found __initdata;
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
 static int found_add_area __initdata;
-int hotadd_percent __initdata = 10;
+int hotadd_percent __initdata = 0;
+#ifndef RESERVE_HOTADD
+#define hotadd_percent 0	/* Ignore all settings */
+#endif
 static u8 pxm2node[256] = { [0 ... 255] = 0xff };
 
 /* Too small nodes confuse the VM badly. Usually they result
@@ -103,6 +106,7 @@ static __init void bad_srat(void)
 	int i;
 	printk(KERN_ERR "SRAT: SRAT not used.\n");
 	acpi_numa = -1;
+	found_add_area = 0;
 	for (i = 0; i < MAX_LOCAL_APIC; i++)
 		apicid_to_node[i] = NUMA_NO_NODE;
 	for (i = 0; i < MAX_NUMNODES; i++)
@@ -154,7 +158,8 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
 	int pxm, node;
 	if (srat_disabled())
 		return;
-	if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {	bad_srat();
+	if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
+		bad_srat();
 		return;
 	}
 	if (pa->flags.enabled == 0)
@@ -191,15 +196,17 @@ static int hotadd_enough_memory(struct bootnode *nd)
 	allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
 	allowed = (allowed / 100) * hotadd_percent;
 	if (allocated + mem > allowed) {
+		unsigned long range;
 		/* Give them at least part of their hotadd memory upto hotadd_percent
 		   It would be better to spread the limit out
 		   over multiple hotplug areas, but that is too complicated
 		   right now */
 		if (allocated >= allowed)
 			return 0;
-		pages = (allowed - allocated + mem) / sizeof(struct page);
+		range = allowed - allocated;
+		pages = (range / PAGE_SIZE);
 		mem = pages * sizeof(struct page);
-		nd->end = nd->start + pages*PAGE_SIZE;
+		nd->end = nd->start + range;
 	}
 	/* Not completely fool proof, but a good sanity check */
 	addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
-- cgit v1.1

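The calculation fix above is worth spelling out: the old code divided
a byte budget by sizeof(struct page), conflating the hotplug budget
with the mem_map cost; the fix treats the leftover budget as the
usable address range and derives the mem_map cost from its page count.
The sketch below replays the corrected arithmetic with made-up numbers
(the 64-byte sizeof(struct page) is an assumption for illustration):

    #include <stdio.h>

    #define PAGE_SIZE        4096UL
    #define STRUCT_PAGE_SIZE 64UL    /* assumed sizeof(struct page) */

    int main(void)
    {
        unsigned long allowed   = 100UL << 20;  /* hotadd_percent budget */
        unsigned long allocated = 80UL  << 20;  /* spent on earlier areas */
        unsigned long range, pages, mem;

        range = allowed - allocated;      /* leftover budget = usable range */
        pages = range / PAGE_SIZE;        /* pages the clamped node covers */
        mem   = pages * STRUCT_PAGE_SIZE; /* mem_map cost for those pages */

        printf("clamp node to %lu MB (mem_map %lu KB)\n",
               range >> 20, mem >> 10);
        return 0;
    }
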
From: Andi Kleen
Date: Mon, 15 May 2006 18:19:47 +0200
Subject: [PATCH] x86_64: Don't schedule on exception stack on preemptive kernels

Extends an earlier patch from John Blackwood to more exception
handlers that also run on the exception stacks.

Expand the use of preempt_conditional_{sti,cli} to all cases where
interrupts are to be re-enabled during exception handling while
running on an IST stack.

Based on original patch from Jan Beulich.

Cc: John Blackwood
Cc: jbeulich@novell.com
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/traps.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 6b87268..cea335e 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -102,6 +102,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 {
 	if (regs->eflags & X86_EFLAGS_IF)
 		local_irq_disable();
+	/* Make sure to not schedule here because we could be running
+	   on an exception stack. */
 	preempt_enable_no_resched();
 }
 
@@ -483,8 +485,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
 {
 	struct task_struct *tsk = current;
 
-	conditional_sti(regs);
-
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_no = trapnr;
 
@@ -521,6 +521,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
 							== NOTIFY_STOP) \
 		return; \
+	conditional_sti(regs); \
 	do_trap(trapnr, signr, str, regs, error_code, NULL); \
 }
 
@@ -535,6 +536,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
 							== NOTIFY_STOP) \
 		return; \
+	conditional_sti(regs); \
 	do_trap(trapnr, signr, str, regs, error_code, &info); \
 }
 
@@ -548,7 +550,17 @@ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
 DO_ERROR(18, SIGSEGV, "reserved", reserved)
-DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
+
+/* Runs on IST stack */
+asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
+{
+	if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
+			12, SIGBUS) == NOTIFY_STOP)
+		return;
+	preempt_conditional_sti(regs);
+	do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
+	preempt_conditional_cli(regs);
+}
 
 asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
 {
@@ -682,8 +694,9 @@ asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
 	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
 		return;
 	}
+	preempt_conditional_sti(regs);
 	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
-	return;
+	preempt_conditional_cli(regs);
 }
 
 /* Help handler running on IST stack to switch back to user stack
-- cgit v1.1

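The pairing above matters: interrupts are re-enabled on entry only if
the trapped context had them enabled (X86_EFLAGS_IF), and the exit
path undoes that without offering a reschedule point, because
scheduling away while on a per-CPU IST stack would let another task
clobber that stack.  Below is a toy userspace model of the
flag-conditional symmetry; X86_EFLAGS_IF's bit value is real, but the
variables and function names are illustrative stand-ins for the kernel
primitives.

    #include <stdio.h>

    #define X86_EFLAGS_IF 0x200

    static int irqs_enabled;    /* stand-in for the CPU interrupt flag */

    static void conditional_sti_like(unsigned long eflags)
    {
        if (eflags & X86_EFLAGS_IF)
            irqs_enabled = 1;   /* models local_irq_enable() */
    }

    static void preempt_conditional_cli_like(unsigned long eflags)
    {
        if (eflags & X86_EFLAGS_IF)
            irqs_enabled = 0;   /* models local_irq_disable() */
        /* then preempt_enable_no_resched(): drop the preempt count but
           do NOT call into the scheduler, since we may still be running
           on the per-CPU exception (IST) stack */
    }

    int main(void)
    {
        unsigned long eflags = X86_EFLAGS_IF;  /* trapped with IRQs on */

        conditional_sti_like(eflags);
        printf("inside handler, irqs=%d\n", irqs_enabled);
        preempt_conditional_cli_like(eflags);
        printf("leaving handler, irqs=%d\n", irqs_enabled);
        return 0;
    }
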
*/ next_rip = regs->rip; *tos = orig_rip + (*tos - copy_rip); - } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */ - ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */ + } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ + ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ /* rip is correct. */ next_rip = regs->rip; } -- cgit v1.1