From a3ea14df0e383f44dcb2e61badb71180dbffe526 Mon Sep 17 00:00:00 2001 From: Paul Fox Date: Tue, 26 Jul 2011 16:42:26 +0100 Subject: x86, olpc: Wait for last byte of EC command to be accepted When executing EC commands, only waiting when there are still more bytes to write is usually fine. However, if the system suspends very quickly after a call to olpc_ec_cmd(), the last data byte may not yet be transferred to the EC, and the command will not complete. This solves a bug where the SCI wakeup mask was not correctly written when going into suspend. It means that sometimes, on XO-1.5 (but not XO-1), the devices that were marked as wakeup sources can't wake up the system. e.g. you ask for wifi wakeups, suspend, but then incoming wifi frames don't wake up the system as they should. Signed-off-by: Paul Fox Signed-off-by: Daniel Drake Acked-by: Andres Salomon Cc: Signed-off-by: Ingo Molnar --- arch/x86/platform/olpc/olpc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index 8b9940e..7cce722 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -161,13 +161,13 @@ restart: if (inbuf && inlen) { /* write data to EC */ for (i = 0; i < inlen; i++) { + pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); + outb(inbuf[i], 0x68); if (wait_on_ibf(0x6c, 0)) { printk(KERN_ERR "olpc-ec: timeout waiting for" " EC accept data!\n"); goto err; } - pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); - outb(inbuf[i], 0x68); } } if (outbuf && outlen) { -- cgit v1.1 From 05e33fc20ea5e493a2a1e7f1d04f43cdf89f83ed Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Fri, 5 Aug 2011 09:09:00 -0500 Subject: x86, UV: Remove UV delay in starting slave cpus Delete the 10 msec delay between the INIT and SIPI when starting slave cpus. I can find no requirement for this delay. BIOS also has similar code sequences without the delay. Removing the delay reduces boot time by 40 sec. Every bit helps. Signed-off-by: Jack Steiner Cc: Link: http://lkml.kernel.org/r/20110805140900.GA6774@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index adc66c3..34b1859 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -207,7 +207,6 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_INIT; uv_write_global_mmr64(pnode, UVH_IPI_INT, val); - mdelay(10); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | -- cgit v1.1 From 7ca0758cdb7c241cb4e0490a8d95f0eb5b861daf Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 22 Aug 2011 13:27:06 -0700 Subject: x86-32, vdso: On system call restart after SYSENTER, use int $0x80 When we enter a 32-bit system call via SYSENTER or SYSCALL, we shuffle the arguments to match the int $0x80 calling convention. This was probably a design mistake, but it's what it is now. This causes errors if the system call as to be restarted. For SYSENTER, we have to invoke the instruction from the vdso as the return address is hardcoded. Accordingly, we can simply replace the jump in the vdso with an int $0x80 instruction and use the slower entry point for a post-restart. Suggested-by: Linus Torvalds Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/CA%2B55aFztZ=r5wa0x26KJQxvZOaQq8s2v3u50wCyJcA-Sc4g8gQ@mail.gmail.com Cc: --- arch/x86/vdso/vdso32/sysenter.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso32/sysenter.S b/arch/x86/vdso/vdso32/sysenter.S index e2800af..e354bce 100644 --- a/arch/x86/vdso/vdso32/sysenter.S +++ b/arch/x86/vdso/vdso32/sysenter.S @@ -43,7 +43,7 @@ __kernel_vsyscall: .space 7,0x90 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ - jmp .Lenter_kernel + int $0x80 /* 16: System call normal return point is here! */ VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ pop %ebp -- cgit v1.1 From f1c39625d63c9f8eba8f036429c10a9cb9e32920 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 24 Aug 2011 09:54:24 -0700 Subject: xen: use non-tracing preempt in xen_clocksource_read() The tracing code used sched_clock() to get tracing timestamps, which ends up calling xen_clocksource_read(). xen_clocksource_read() must disable preemption, but if preemption tracing is enabled, this results in infinite recursion. I've only noticed this when boot-time tracing tests are enabled, but it seems like a generic bug. It looks like it would also affect kvm_clocksource_read(). Reported-by: Konrad Rzeszutek Wilk Signed-off-by: Jeremy Fitzhardinge Cc: Avi Kivity Cc: Marcelo Tosatti --- arch/x86/xen/time.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 5158c50..163b467 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -168,9 +168,10 @@ cycle_t xen_clocksource_read(void) struct pvclock_vcpu_time_info *src; cycle_t ret; - src = &get_cpu_var(xen_vcpu)->time; + preempt_disable_notrace(); + src = &__get_cpu_var(xen_vcpu)->time; ret = pvclock_clocksource_read(src); - put_cpu_var(xen_vcpu); + preempt_enable_notrace(); return ret; } -- cgit v1.1 From cbbfa38fcb95930babc5233cf6927ec430f38abc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 Aug 2011 19:46:56 +0200 Subject: mtrr: fix UP breakage caused during switch to stop_machine While removing custom rendezvous code and switching to stop_machine, commit 192d8857427d ("x86, mtrr: use stop_machine APIs for doing MTRR rendezvous") completely dropped mtrr setting code on !CONFIG_SMP breaking MTRR settting on UP. Fix it by removing the incorrect CONFIG_SMP. Signed-off-by: Tejun Heo Reported-by: Anders Eriksson Tested-and-acked-by: Suresh Siddha Acked-by: H. Peter Anvin Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mtrr/main.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 08119a3..6b96110 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -149,7 +149,6 @@ struct set_mtrr_data { */ static int mtrr_rendezvous_handler(void *info) { -#ifdef CONFIG_SMP struct set_mtrr_data *data = info; /* @@ -171,7 +170,6 @@ static int mtrr_rendezvous_handler(void *info) } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { mtrr_if->set_all(); } -#endif return 0; } -- cgit v1.1 From b4ca46e4e82a0a5976fe5eab85be585d75f8202f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Aug 2011 16:10:33 -0400 Subject: x86-32: Fix boot with CONFIG_X86_INVD_BUG entry_32.S contained a hardcoded alternative instruction entry, and the format changed in commit 59e97e4d6fbc ("x86: Make alternative instruction pointers relative"). Replace the hardcoded entry with the altinstruction_entry macro. This fixes the 32-bit boot with CONFIG_X86_INVD_BUG=y. Reported-and-tested-by: Arnaud Lacombe Signed-off-by: Andy Lutomirski Cc: Peter Anvin Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/x86/kernel/entry_32.S | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 5c1a9197..f3f6f53 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -54,6 +54,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -873,12 +874,7 @@ ENTRY(simd_coprocessor_error) 661: pushl_cfi $do_general_protection 662: .section .altinstructions,"a" - .balign 4 - .long 661b - .long 663f - .word X86_FEATURE_XMM - .byte 662b-661b - .byte 664f-663f + altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f .previous .section .altinstr_replacement,"ax" 663: pushl $do_simd_coprocessor_error -- cgit v1.1 From a94cc4e6c0a26a7c8f79a432ab2c89534aa674d5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 26 Aug 2011 12:20:59 +0100 Subject: sfi: table irq 0xFF means 'no interrupt' According to the SFI specification irq number 0xFF means device has no interrupt or interrupt attached via GPIO. Currently, we don't handle this special case and set irq field in *_board_info structs to 255. It leads to confusion in some drivers. Accelerometer driver tries to register interrupt 255, fails and prints "Cannot get IRQ" to dmesg. Signed-off-by: Kirill A. Shutemov Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- arch/x86/platform/mrst/mrst.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 7000e74b..58425ad 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -689,7 +689,9 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) irq_attr.trigger = 1; irq_attr.polarity = 1; io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr); - } + } else + pentry->irq = 0; /* No irq */ + switch (pentry->type) { case SFI_DEV_TYPE_IPC: /* ID as IRQ is a hack that will go away */ -- cgit v1.1 From f5b940997397229975ea073679b03967932a541b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 26 Aug 2011 18:03:11 -0400 Subject: All Arch: remove linkage for sys_nfsservctl system call The nfsservctl system call is now gone, so we should remove all linkage for it. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/include/asm/unistd_64.h | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a0e866d..54edb207 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -672,7 +672,7 @@ ia32_sys_call_table: .quad sys32_vm86_warning /* vm86 */ .quad quiet_ni_syscall /* query_module */ .quad sys_poll - .quad compat_sys_nfsservctl + .quad quiet_ni_syscall /* old nfsservctl */ .quad sys_setresgid16 /* 170 */ .quad sys_getresgid16 .quad sys_prctl diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index d92641c..2010405 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -414,7 +414,7 @@ __SYSCALL(__NR_query_module, sys_ni_syscall) __SYSCALL(__NR_quotactl, sys_quotactl) #define __NR_nfsservctl 180 -__SYSCALL(__NR_nfsservctl, sys_nfsservctl) +__SYSCALL(__NR_nfsservctl, sys_ni_syscall) /* reserved for LiS/STREAMS */ #define __NR_getpmsg 181 diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index fbb0a04..bc19be3 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -168,7 +168,7 @@ ENTRY(sys_call_table) .long ptregs_vm86 .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* Old nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl -- cgit v1.1 From 3b217116edaac634bf31e85c35708298059a8171 Mon Sep 17 00:00:00 2001 From: Duncan Sands Date: Tue, 30 Aug 2011 10:58:22 +0200 Subject: KVM: Fix instruction size issue in pvclock scaling Commit de2d1a524e94 ("KVM: Fix register corruption in pvclock_scale_delta") introduced a mul instruction that may have only a memory operand; the assembler therefore cannot select the correct size: pvclock.s:229: Error: no instruction mnemonic suffix given and no register operands; can't size instruction In this example the assembler is: #APP mul -48(%rbp) ; shrd $32, %rdx, %rax #NO_APP A simple solution is to use mulq. Signed-off-by: Duncan Sands Signed-off-by: Avi Kivity --- arch/x86/include/asm/pvclock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index a518c0a..c59cc97 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -44,7 +44,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); #elif defined(__x86_64__) __asm__ ( - "mul %[mul_frac] ; shrd $32, %[hi], %[lo]" + "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]" : [lo]"=a"(product), [hi]"=d"(tmp) : "0"(delta), -- cgit v1.1 From 20afc60f892d285fde179ead4b24e6a7938c2f1b Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 30 Aug 2011 12:32:36 +0400 Subject: x86, perf: Check that current->mm is alive before getting user callchain An event may occur when an mm is already released. I added an event in dequeue_entity() and caught a panic with the following backtrace: [ 434.421110] BUG: unable to handle kernel NULL pointer dereference at 0000000000000050 [ 434.421258] IP: [] __get_user_pages_fast+0x9c/0x120 ... [ 434.421258] Call Trace: [ 434.421258] [] copy_from_user_nmi+0x51/0xf0 [ 434.421258] [] ? sched_clock_local+0x25/0x90 [ 434.421258] [] perf_callchain_user+0x128/0x170 [ 434.421258] [] ? __perf_event_header__init_id+0xed/0x100 [ 434.421258] [] perf_prepare_sample+0x200/0x280 [ 434.421258] [] __perf_event_overflow+0x1b8/0x290 [ 434.421258] [] ? tg_shares_up+0x0/0x670 [ 434.421258] [] ? walk_tg_tree+0x6a/0xb0 [ 434.421258] [] perf_swevent_overflow+0xc4/0xf0 [ 434.421258] [] do_perf_sw_event+0x1e0/0x250 [ 434.421258] [] perf_tp_event+0x44/0x70 [ 434.421258] [] ftrace_profile_sched_block+0xdf/0x110 [ 434.421258] [] dequeue_entity+0x2ad/0x2d0 [ 434.421258] [] dequeue_task_fair+0x1c/0x60 [ 434.421258] [] dequeue_task+0x9a/0xb0 [ 434.421258] [] deactivate_task+0x42/0xe0 [ 434.421258] [] thread_return+0x191/0x808 [ 434.421258] [] ? switch_task_namespaces+0x24/0x60 [ 434.421258] [] do_exit+0x464/0x910 [ 434.421258] [] do_group_exit+0x58/0xd0 [ 434.421258] [] sys_exit_group+0x17/0x20 [ 434.421258] [] system_call_fastpath+0x16/0x1b Signed-off-by: Andrey Vagin Signed-off-by: Peter Zijlstra Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1314693156-24131-1-git-send-email-avagin@openvz.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4ee3abf..cfa62ec 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1900,6 +1900,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) perf_callchain_store(entry, regs->ip); + if (!current->mm) + return; + if (perf_callchain_user32(regs, entry)) return; -- cgit v1.1 From d312ae878b6aed3912e1acaaf5d0b2a9d08a4f11 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 19 Aug 2011 15:57:16 +0100 Subject: xen: use maximum reservation to limit amount of usable RAM Use the domain's maximum reservation to limit the amount of extra RAM for the memory balloon. This reduces the size of the pages tables and the amount of reserved low memory (which defaults to about 1/32 of the total RAM). On a system with 8 GiB of RAM with the domain limited to 1 GiB the kernel reports: Before: Memory: 627792k/4472000k available After: Memory: 549740k/11132224k available A increase of about 76 MiB (~1.5% of the unused 7 GiB). The reserved low memory is also reduced from 253 MiB to 32 MiB. The total additional usable RAM is 329 MiB. For dom0, this requires at patch to Xen ('x86: use 'dom0_mem' to limit the number of pages for dom0') (c/s 23790) CC: stable@kernel.org Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 02ffd9e..ff3dfa1 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -183,6 +183,19 @@ static unsigned long __init xen_set_identity(const struct e820entry *list, PFN_UP(start_pci), PFN_DOWN(last)); return identity; } + +static unsigned long __init xen_get_max_pages(void) +{ + unsigned long max_pages = MAX_DOMAIN_PAGES; + domid_t domid = DOMID_SELF; + int ret; + + ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); + if (ret > 0) + max_pages = ret; + return min(max_pages, MAX_DOMAIN_PAGES); +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ @@ -291,6 +304,12 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + extra_limit = xen_get_max_pages(); + if (extra_limit >= max_pfn) + extra_pages = extra_limit - max_pfn; + else + extra_pages = 0; + extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); /* -- cgit v1.1 From d198d499148a0c64a41b3aba9e7dd43772832b91 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 1 Sep 2011 13:46:55 +0200 Subject: xen: x86_32: do not enable iterrupts when returning from exception in interrupt context If vmalloc page_fault happens inside of interrupt handler with interrupts disabled then on exit path from exception handler when there is no pending interrupts, the following code (arch/x86/xen/xen-asm_32.S:112): cmpw $0x0001, XEN_vcpu_info_pending(%eax) sete XEN_vcpu_info_mask(%eax) will enable interrupts even if they has been previously disabled according to eflags from the bounce frame (arch/x86/xen/xen-asm_32.S:99) testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) setz XEN_vcpu_info_mask(%eax) Solution is in setting XEN_vcpu_info_mask only when it should be set according to cmpw $0x0001, XEN_vcpu_info_pending(%eax) but not clearing it if there isn't any pending events. Reproducer for bug is attached to RHBZ 707552 CC: stable@kernel.org Signed-off-by: Igor Mammedov Acked-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/xen-asm_32.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 22a2093..b040b0e 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -113,11 +113,13 @@ xen_iret_start_crit: /* * If there's something pending, mask events again so we can - * jump back into xen_hypervisor_callback + * jump back into xen_hypervisor_callback. Otherwise do not + * touch XEN_vcpu_info_mask. */ - sete XEN_vcpu_info_mask(%eax) + jne 1f + movb $1, XEN_vcpu_info_mask(%eax) - popl %eax +1: popl %eax /* * From this point on the registers are restored and the stack -- cgit v1.1 From ed467e69f16e6b480e2face7bc5963834d025f91 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 1 Sep 2011 09:48:27 -0400 Subject: xen/smp: Warn user why they keel over - nosmp or noapic and what to use instead. We have hit a couple of customer bugs where they would like to use those parameters to run an UP kernel - but both of those options turn of important sources of interrupt information so we end up not being able to boot. The correct way is to pass in 'dom0_max_vcpus=1' on the Xen hypervisor line and the kernel will patch itself to be a UP kernel. Fixes bug: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=637308 CC: stable@kernel.org Acked-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index e79dbb9..d4fc6d4 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -32,6 +32,7 @@ #include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -207,6 +208,15 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) unsigned cpu; unsigned int i; + if (skip_ioapic_setup) { + char *m = (max_cpus == 0) ? + "The nosmp parameter is incompatible with Xen; " \ + "use Xen dom0_max_vcpus=1 parameter" : + "The noapic parameter is incompatible with Xen"; + + xen_raw_printk(m); + panic(m); + } xen_init_lock_cpu(0); smp_store_cpu_info(0); -- cgit v1.1 From f10cd522c5fbfec9ae3cc01967868c9c2401ed23 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 6 Sep 2011 17:41:47 +0100 Subject: xen: disable PV spinlocks on HVM PV spinlocks cannot possibly work with the current code because they are enabled after pvops patching has already been done, and because PV spinlocks use a different data structure than native spinlocks so we cannot switch between them dynamically. A spinlock that has been taken once by the native code (__ticket_spin_lock) cannot be taken by __xen_spin_lock even after it has been released. Reported-and-Tested-by: Stefan Bader Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index d4fc6d4..041d4fe 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -532,7 +532,6 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) WARN_ON(xen_smp_intr_init(0)); xen_init_lock_cpu(0); - xen_init_spinlocks(); } static int __cpuinit xen_hvm_cpu_up(unsigned int cpu) -- cgit v1.1 From 5307f6d5fb12fd01f9f321bc4a8fd77e74858647 Mon Sep 17 00:00:00 2001 From: Shyam Iyer Date: Thu, 8 Sep 2011 16:41:17 -0500 Subject: Fix pointer dereference before call to pcie_bus_configure_settings Commit b03e7495a862 ("PCI: Set PCI-E Max Payload Size on fabric") introduced a potential NULL pointer dereference in calls to pcie_bus_configure_settings due to attempts to access pci_bus self variables when the self pointer is NULL. To correct this, verify that the self pointer in pci_bus is non-NULL before dereferencing it. Reported-by: Stanislaw Gruszka Signed-off-by: Shyam Iyer Signed-off-by: Jon Mason Acked-by: Jesse Barnes Signed-off-by: Linus Torvalds --- arch/x86/pci/acpi.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index c953302..039d913 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -365,8 +365,13 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) */ if (bus) { struct pci_bus *child; - list_for_each_entry(child, &bus->children, node) - pcie_bus_configure_settings(child, child->self->pcie_mpss); + list_for_each_entry(child, &bus->children, node) { + struct pci_dev *self = child->self; + if (!self) + continue; + + pcie_bus_configure_settings(child, self->pcie_mpss); + } } if (!bus) -- cgit v1.1 From e3b73c4a25e9a5705b4ef28b91676caf01f9bc9f Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 13 Sep 2011 10:17:32 -0400 Subject: xen/e820: if there is no dom0_mem=, don't tweak extra_pages. The patch "xen: use maximum reservation to limit amount of usable RAM" (d312ae878b6aed3912e1acaaf5d0b2a9d08a4f11) breaks machines that do not use 'dom0_mem=' argument with: reserve RAM buffer: 000000133f2e2000 - 000000133fffffff (XEN) mm.c:4976:d0 Global bit is set to kernel page fffff8117e (XEN) domain_crash_sync called from entry.S (XEN) Domain 0 (vcpu#0) crashed on cpu#0: ... The reason being that the last E820 entry is created using the 'extra_pages' (which is based on how many pages have been freed). The mentioned git commit sets the initial value of 'extra_pages' using a hypercall which returns the number of pages (if dom0_mem has been used) or -1 otherwise. If the later we return with MAX_DOMAIN_PAGES as basis for calculation: return min(max_pages, MAX_DOMAIN_PAGES); and use it: extra_limit = xen_get_max_pages(); if (extra_limit >= max_pfn) extra_pages = extra_limit - max_pfn; else extra_pages = 0; which means we end up with extra_pages = 128GB in PFNs (33554432) - 8GB in PFNs (2097152, on this specific box, can be larger or smaller), and then we add that value to the E820 making it: Xen: 00000000ff000000 - 0000000100000000 (reserved) Xen: 0000000100000000 - 000000133f2e2000 (usable) which is clearly wrong. It should look as so: Xen: 00000000ff000000 - 0000000100000000 (reserved) Xen: 0000000100000000 - 000000027fbda000 (usable) Naturally this problem does not present itself if dom0_mem=max:X is used. CC: stable@kernel.org Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ff3dfa1..09688eb 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -305,10 +305,12 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); extra_limit = xen_get_max_pages(); - if (extra_limit >= max_pfn) - extra_pages = extra_limit - max_pfn; - else - extra_pages = 0; + if (max_pfn + extra_pages > extra_limit) { + if (extra_limit > max_pfn) + extra_pages = extra_limit - max_pfn; + else + extra_pages = 0; + } extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); -- cgit v1.1 From 61cca2fab7ecba18f9b9680cd736ef5fa82ad3b1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 15 Sep 2011 08:52:40 +0100 Subject: xen/i386: follow-up to "replace order-based range checking of M2P table by linear one" The numbers obtained from the hypervisor really can't ever lead to an overflow here, only the original calculation going through the order of the range could have. This avoids the (as Jeremy points outs) somewhat ugly NULL-based calculation here. Signed-off-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 24abc1f..a3872f7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1721,10 +1721,8 @@ void __init xen_setup_machphys_mapping(void) machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; } #ifdef CONFIG_X86_32 - if ((machine_to_phys_mapping + machine_to_phys_nr) - < machine_to_phys_mapping) - machine_to_phys_nr = (unsigned long *)NULL - - machine_to_phys_mapping; + WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) + < machine_to_phys_mapping); #endif } -- cgit v1.1 From a7f934d4f16144cb9521b62e9b8c9ac0118097da Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 15 Sep 2011 13:28:33 -0700 Subject: asm alternatives: remove incorrect alignment notes On x86-64, they were just wasteful: with the explicitly added (now unnecessary) padding, the size of the alternatives structure was 16 bytes, and an alignment of 8 bytes didn't hurt much. However, it was still silly, since the natural size and alignment for the structure is actually just 12 bytes, 4-byte aligned since commit 59e97e4d6fbc ("x86: Make alternative instruction pointers relative"). So removing the padding, and removing the extra alignment is just a good idea. On x86-32, the alignment of 4 bytes was correct, but was incorrectly hardcoded as 8 bytes in . That header file had used to be an x86-64 only header file, but various unification efforts have made it be used for x86-32 too (ie the unification of rwlock and rwsem). That in turn caused x86-32 boot failures, because the extra alignment would result in random zero-filled words in the altinstructions section, causing oopses early at boot when doing alternative instruction replacement. So just remove all the alignment noise entirely. It's wrong, and it's unnecessary. The section itself is already properly aligned by the linker scripts, and all additions to the section had better be of the proper 12-byte format, keeping it aligned. So if the align directive were to ever make a difference, that would be an indication of a serious bug to begin with. Reported-by: Werner Landgraf Acked-by: Andrew Lutomirski Signed-off-by: Linus Torvalds --- arch/x86/include/asm/alternative-asm.h | 1 - arch/x86/include/asm/alternative.h | 4 ---- arch/x86/include/asm/cpufeature.h | 2 -- 3 files changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 4554cc6..091508b 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -16,7 +16,6 @@ #endif .macro altinstruction_entry orig alt feature orig_len alt_len - .align 8 .long \orig - . .long \alt - . .word \feature diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 23fb6d7..37ad100 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -48,9 +48,6 @@ struct alt_instr { u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ u8 replacementlen; /* length of new instruction, <= instrlen */ -#ifdef CONFIG_X86_64 - u32 pad2; -#endif }; extern void alternative_instructions(void); @@ -83,7 +80,6 @@ static inline int alternatives_text_reserved(void *start, void *end) \ "661:\n\t" oldinstr "\n662:\n" \ ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ " .long 661b - .\n" /* label */ \ " .long 663f - .\n" /* new instruction */ \ " .word " __stringify(feature) "\n" /* feature bit */ \ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 4258aac..88b23a4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -332,7 +332,6 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) asm goto("1: jmp %l[t_no]\n" "2:\n" ".section .altinstructions,\"a\"\n" - _ASM_ALIGN "\n" " .long 1b - .\n" " .long 0\n" /* no replacement */ " .word %P0\n" /* feature bit */ @@ -350,7 +349,6 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) asm volatile("1: movb $0,%0\n" "2:\n" ".section .altinstructions,\"a\"\n" - _ASM_ALIGN "\n" " .long 1b - .\n" " .long 3f - .\n" " .word %P1\n" /* feature bit */ -- cgit v1.1 From 47997d756aa2a84ab577e1b0383cc12d582fc69c Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 21 Sep 2011 16:08:03 +0200 Subject: x86/rtc: Don't recursively acquire rtc_lock A deadlock was introduced on x86 in commit ef68c8f87ed1 ("x86: Serialize EFI time accesses on rtc_lock") because efi_get_time() and friends can be called with rtc_lock already held by read_persistent_time(), e.g.: timekeeping_init() read_persistent_clock() <-- acquire rtc_lock efi_get_time() phys_efi_get_time() <-- acquire rtc_lock To fix this let's push the locking down into the get_wallclock() and set_wallclock() implementations. Only the clock implementations that access the x86 RTC directly need to acquire rtc_lock, so it makes sense to push the locking down into the rtc, vrtc and efi code. The virtualization implementations don't require rtc_lock to be held because they provide their own serialization. Signed-off-by: Matt Fleming Acked-by: Jan Beulich Acked-by: Avi Kivity [for the virtualization aspect] Cc: "H. Peter Anvin" Cc: Zhang Rui Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/rtc.c | 23 ++++++++++++----------- arch/x86/platform/mrst/vrtc.c | 9 +++++++++ 2 files changed, 21 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 3f2ad26..ccdbc16 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -42,8 +42,11 @@ int mach_set_rtc_mmss(unsigned long nowtime) { int real_seconds, real_minutes, cmos_minutes; unsigned char save_control, save_freq_select; + unsigned long flags; int retval = 0; + spin_lock_irqsave(&rtc_lock, flags); + /* tell the clock it's being set */ save_control = CMOS_READ(RTC_CONTROL); CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); @@ -93,12 +96,17 @@ int mach_set_rtc_mmss(unsigned long nowtime) CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + spin_unlock_irqrestore(&rtc_lock, flags); + return retval; } unsigned long mach_get_cmos_time(void) { unsigned int status, year, mon, day, hour, min, sec, century = 0; + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); /* * If UIP is clear, then we have >= 244 microseconds before @@ -125,6 +133,8 @@ unsigned long mach_get_cmos_time(void) status = CMOS_READ(RTC_CONTROL); WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); + spin_unlock_irqrestore(&rtc_lock, flags); + if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { sec = bcd2bin(sec); min = bcd2bin(min); @@ -169,24 +179,15 @@ EXPORT_SYMBOL(rtc_cmos_write); int update_persistent_clock(struct timespec now) { - unsigned long flags; - int retval; - - spin_lock_irqsave(&rtc_lock, flags); - retval = x86_platform.set_wallclock(now.tv_sec); - spin_unlock_irqrestore(&rtc_lock, flags); - - return retval; + return x86_platform.set_wallclock(now.tv_sec); } /* not static: needed by APM */ void read_persistent_clock(struct timespec *ts) { - unsigned long retval, flags; + unsigned long retval; - spin_lock_irqsave(&rtc_lock, flags); retval = x86_platform.get_wallclock(); - spin_unlock_irqrestore(&rtc_lock, flags); ts->tv_sec = retval; ts->tv_nsec = 0; diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index 73d70d6..6d5dbcd 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -58,8 +58,11 @@ EXPORT_SYMBOL_GPL(vrtc_cmos_write); unsigned long vrtc_get_time(void) { u8 sec, min, hour, mday, mon; + unsigned long flags; u32 year; + spin_lock_irqsave(&rtc_lock, flags); + while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP)) cpu_relax(); @@ -70,6 +73,8 @@ unsigned long vrtc_get_time(void) mon = vrtc_cmos_read(RTC_MONTH); year = vrtc_cmos_read(RTC_YEAR); + spin_unlock_irqrestore(&rtc_lock, flags); + /* vRTC YEAR reg contains the offset to 1960 */ year += 1960; @@ -83,8 +88,10 @@ unsigned long vrtc_get_time(void) int vrtc_set_mmss(unsigned long nowtime) { int real_sec, real_min; + unsigned long flags; int vrtc_min; + spin_lock_irqsave(&rtc_lock, flags); vrtc_min = vrtc_cmos_read(RTC_MINUTES); real_sec = nowtime % 60; @@ -95,6 +102,8 @@ int vrtc_set_mmss(unsigned long nowtime) vrtc_cmos_write(real_sec, RTC_SECONDS); vrtc_cmos_write(real_min, RTC_MINUTES); + spin_unlock_irqrestore(&rtc_lock, flags); + return 0; } -- cgit v1.1 From 41bc3186b3c92a4ca05e2aa14bb6272fb491e679 Mon Sep 17 00:00:00 2001 From: Zhao Jin Date: Mon, 19 Sep 2011 12:19:51 +0800 Subject: KVM: MMU: fix incorrect return of spte __update_clear_spte_slow should return original spte while the current code returns low half of original spte combined with high half of new spte. Signed-off-by: Zhao Jin Reviewed-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1c5b693..8e8da79 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -400,7 +400,8 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) /* xchg acts as a barrier before the setting of the high bits */ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); - orig.spte_high = ssptep->spte_high = sspte.spte_high; + orig.spte_high = ssptep->spte_high; + ssptep->spte_high = sspte.spte_high; count_spte_clear(sptep, spte); return orig.spte; -- cgit v1.1 From 9be3be1f153e90ea4e1e5b6ed1d72a73d44318d1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:38 +0300 Subject: KVM: x86 emulator: fix Src2CL decode Src2CL decode (used for double width shifts) erronously decodes only bit 3 of %rcx, instead of bits 7:0. Fix by decoding %cl in its entirety. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6f08bc9..8b4cc5f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3603,7 +3603,7 @@ done_prefixes: break; case Src2CL: ctxt->src2.bytes = 1; - ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0x8; + ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0xff; break; case Src2ImmByte: rc = decode_imm(ctxt, &ctxt->src2, 1, true); -- cgit v1.1