Diffstat (limited to 'arch/x86')
104 files changed, 1949 insertions, 967 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d1430ef..13ffa5d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,6 +24,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE + select HAVE_PERF_COUNTERS if (!M386 && !M486) select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB @@ -742,7 +743,6 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC - select HAVE_PERF_COUNTERS if (!M386 && !M486) config X86_IO_APIC def_bool y @@ -1913,25 +1913,26 @@ config DMAR_DEFAULT_ON recommended you say N here while the DMAR code remains experimental. -config DMAR_GFX_WA - def_bool y - prompt "Support for Graphics workaround" +config DMAR_BROKEN_GFX_WA + def_bool n + prompt "Workaround broken graphics drivers (going away soon)" depends on DMAR ---help--- Current Graphics drivers tend to use physical address for DMA and avoid using DMA APIs. Setting this config option permits the IOMMU driver to set a unity map for all the OS-visible memory. Hence the driver can continue - to use physical addresses for DMA. + to use physical addresses for DMA, at least until this + option is removed in the 2.6.32 kernel. config DMAR_FLOPPY_WA def_bool y depends on DMAR ---help--- - Floppy disk drivers are know to bypass DMA API calls + Floppy disk drivers are known to bypass DMA API calls thereby failing to work when IOMMU is enabled. This workaround will setup a 1:1 mapping for the first - 16M to make floppy (an ISA device) work. + 16MiB to make floppy (an ISA device) work. config INTR_REMAP bool "Support for Interrupt Remapping (EXPERIMENTAL)" diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index e2ff504..f8ed065 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,7 +4,7 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o +targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index d660be4..49e0c18 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -37,14 +37,13 @@ static int set_bios_mode(u8 mode) ireg.al = mode; /* AH=0x00 Set Video Mode */ intcall(0x10, &ireg, NULL); - ireg.ah = 0x0f; /* Get Current Video Mode */ intcall(0x10, &ireg, &oreg); do_restore = 1; /* Assume video contents were lost */ /* Not all BIOSes are clean with the top bit */ - new_mode = ireg.al & 0x7f; + new_mode = oreg.al & 0x7f; if (new_mode == mode) return 0; /* Mode change OK */ diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index c700147..275dd17 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -45,7 +45,7 @@ static int vesa_probe(void) ireg.di = (size_t)&vginfo; intcall(0x10, &ireg, &oreg); - if (ireg.ax != 0x004f || + if (oreg.ax != 0x004f || vginfo.signature != VESA_MAGIC || vginfo.version < 0x0102) return 0; /* Not present */ @@ -70,7 +70,7 @@ static int vesa_probe(void) ireg.di = (size_t)&vminfo; intcall(0x10, &ireg, &oreg); - if (ireg.ax != 0x004f) + if (oreg.ax != 0x004f) continue; if ((vminfo.mode_attr & 0x15) == 0x05) { diff --git a/arch/x86/include/asm/atomic_32.h 
b/arch/x86/include/asm/atomic_32.h index 2503d4e..dc5a667 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -19,7 +19,10 @@ * * Atomically reads the value of @v. */ -#define atomic_read(v) ((v)->counter) +static inline int atomic_read(const atomic_t *v) +{ + return v->counter; +} /** * atomic_set - set atomic variable @@ -28,7 +31,10 @@ * * Atomically sets the value of @v to @i. */ -#define atomic_set(v, i) (((v)->counter) = (i)) +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} /** * atomic_add - add integer to atomic variable @@ -200,8 +206,15 @@ static inline int atomic_sub_return(int i, atomic_t *v) return atomic_add_return(-i, v); } -#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) +static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline int atomic_xchg(atomic_t *v, int new) +{ + return xchg(&v->counter, new); +} /** * atomic_add_unless - add unless the number is already a given value @@ -250,45 +263,12 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) /* An 64bit atomic type */ typedef struct { - unsigned long long counter; + u64 __aligned(8) counter; } atomic64_t; #define ATOMIC64_INIT(val) { (val) } -/** - * atomic64_read - read atomic64 variable - * @ptr: pointer of type atomic64_t - * - * Atomically reads the value of @v. - * Doesn't imply a read memory barrier. - */ -#define __atomic64_read(ptr) ((ptr)->counter) - -static inline unsigned long long -cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new) -{ - asm volatile( - - LOCK_PREFIX "cmpxchg8b (%[ptr])\n" - - : "=A" (old) - - : [ptr] "D" (ptr), - "A" (old), - "b" (ll_low(new)), - "c" (ll_high(new)) - - : "memory"); - - return old; -} - -static inline unsigned long long -atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, - unsigned long long new_val) -{ - return cmpxchg8b(&ptr->counter, old_val, new_val); -} +extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); /** * atomic64_xchg - xchg atomic64 variable @@ -298,18 +278,7 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, * Atomically xchgs the value of @ptr to @new_val and returns * the old value. */ - -static inline unsigned long long -atomic64_xchg(atomic64_t *ptr, unsigned long long new_val) -{ - unsigned long long old_val; - - do { - old_val = atomic_read(ptr); - } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); - - return old_val; -} +extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val); /** * atomic64_set - set atomic64 variable @@ -318,10 +287,7 @@ atomic64_xchg(atomic64_t *ptr, unsigned long long new_val) * * Atomically sets the value of @ptr to @new_val. */ -static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) -{ - atomic64_xchg(ptr, new_val); -} +extern void atomic64_set(atomic64_t *ptr, u64 new_val); /** * atomic64_read - read atomic64 variable @@ -329,17 +295,30 @@ static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) * * Atomically reads the value of @ptr and returns it. 
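The atomic_read()/atomic_set() hunks above replace macros with static inline functions. A minimal stand-alone sketch of why that matters, using stand-in type names rather than the kernel's: the inline form type-checks its argument and accepts const pointers, while the old macro silently accepts any struct that happens to have a .counter field.

typedef struct { int counter; } atomic_t;
typedef struct { int counter; } other_t;      /* unrelated type with the same field */

#define atomic_read_macro(v)    ((v)->counter)

static inline int atomic_read_inline(const atomic_t *v)
{
        return v->counter;
}

int demo(const atomic_t *a, other_t *o)
{
        int ok  = atomic_read_inline(a);      /* type-checked, works on const pointers */
        int bad = atomic_read_macro(o);       /* silently accepted: the macro checks nothing */
        /* atomic_read_inline(o) would draw an incompatible-pointer-type diagnostic */
        return ok + bad;
}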
*/ -static inline unsigned long long atomic64_read(atomic64_t *ptr) +static inline u64 atomic64_read(atomic64_t *ptr) { - unsigned long long curr_val; - - do { - curr_val = __atomic64_read(ptr); - } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); - - return curr_val; + u64 res; + + /* + * Note, we inline this atomic64_t primitive because + * it only clobbers EAX/EDX and leaves the others + * untouched. We also (somewhat subtly) rely on the + * fact that cmpxchg8b returns the current 64-bit value + * of the memory location we are touching: + */ + asm volatile( + "mov %%ebx, %%eax\n\t" + "mov %%ecx, %%edx\n\t" + LOCK_PREFIX "cmpxchg8b %1\n" + : "=&A" (res) + : "m" (*ptr) + ); + + return res; } +extern u64 atomic64_read(atomic64_t *ptr); + /** * atomic64_add_return - add and return * @delta: integer value to add @@ -347,34 +326,14 @@ static inline unsigned long long atomic64_read(atomic64_t *ptr) * * Atomically adds @delta to @ptr and returns @delta + *@ptr */ -static inline unsigned long long -atomic64_add_return(unsigned long long delta, atomic64_t *ptr) -{ - unsigned long long old_val, new_val; - - do { - old_val = atomic_read(ptr); - new_val = old_val + delta; - - } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); - - return new_val; -} - -static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr) -{ - return atomic64_add_return(-delta, ptr); -} +extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr); -static inline long atomic64_inc_return(atomic64_t *ptr) -{ - return atomic64_add_return(1, ptr); -} - -static inline long atomic64_dec_return(atomic64_t *ptr) -{ - return atomic64_sub_return(1, ptr); -} +/* + * Other variants with different arithmetic operators: + */ +extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr); +extern u64 atomic64_inc_return(atomic64_t *ptr); +extern u64 atomic64_dec_return(atomic64_t *ptr); /** * atomic64_add - add integer to atomic64 variable @@ -383,10 +342,7 @@ static inline long atomic64_dec_return(atomic64_t *ptr) * * Atomically adds @delta to @ptr. */ -static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) -{ - atomic64_add_return(delta, ptr); -} +extern void atomic64_add(u64 delta, atomic64_t *ptr); /** * atomic64_sub - subtract the atomic64 variable @@ -395,10 +351,7 @@ static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) * * Atomically subtracts @delta from @ptr. */ -static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) -{ - atomic64_add(-delta, ptr); -} +extern void atomic64_sub(u64 delta, atomic64_t *ptr); /** * atomic64_sub_and_test - subtract value from variable and test result @@ -409,13 +362,7 @@ static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) * true if the result is zero, or false for all * other cases. */ -static inline int -atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr) -{ - unsigned long long old_val = atomic64_sub_return(delta, ptr); - - return old_val == 0; -} +extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr); /** * atomic64_inc - increment atomic64 variable @@ -423,10 +370,7 @@ atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr) * * Atomically increments @ptr by 1. */ -static inline void atomic64_inc(atomic64_t *ptr) -{ - atomic64_add(1, ptr); -} +extern void atomic64_inc(atomic64_t *ptr); /** * atomic64_dec - decrement atomic64 variable @@ -434,10 +378,7 @@ static inline void atomic64_inc(atomic64_t *ptr) * * Atomically decrements @ptr by 1. 
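The inline-asm atomic64_read() above relies on cmpxchg8b always leaving the current 64-bit memory value in EDX:EAX, whether or not the compare succeeds. A hedged user-space sketch of the same "read by compare-and-exchange" idea, expressed with the GCC __sync builtin instead of hand-written cmpxchg8b:

#include <stdint.h>

static inline uint64_t atomic64_read_sketch(volatile uint64_t *p)
{
        /*
         * Compare-and-swap with old == new == 0: if *p happens to be 0 it
         * stores 0 back (no visible change), otherwise it stores nothing;
         * either way the builtin returns the value that was actually in
         * memory, which is exactly the atomic 64-bit snapshot we want on
         * a 32-bit CPU.
         */
        return __sync_val_compare_and_swap(p, (uint64_t)0, (uint64_t)0);
}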
*/ -static inline void atomic64_dec(atomic64_t *ptr) -{ - atomic64_sub(1, ptr); -} +extern void atomic64_dec(atomic64_t *ptr); /** * atomic64_dec_and_test - decrement and test @@ -447,10 +388,7 @@ static inline void atomic64_dec(atomic64_t *ptr) * returns true if the result is 0, or false for all other * cases. */ -static inline int atomic64_dec_and_test(atomic64_t *ptr) -{ - return atomic64_sub_and_test(1, ptr); -} +extern int atomic64_dec_and_test(atomic64_t *ptr); /** * atomic64_inc_and_test - increment and test @@ -460,10 +398,7 @@ static inline int atomic64_dec_and_test(atomic64_t *ptr) * and returns true if the result is zero, or false for all * other cases. */ -static inline int atomic64_inc_and_test(atomic64_t *ptr) -{ - return atomic64_sub_and_test(-1, ptr); -} +extern int atomic64_inc_and_test(atomic64_t *ptr); /** * atomic64_add_negative - add and test if negative @@ -474,13 +409,7 @@ static inline int atomic64_inc_and_test(atomic64_t *ptr) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int -atomic64_add_negative(unsigned long long delta, atomic64_t *ptr) -{ - long long old_val = atomic64_add_return(delta, ptr); - - return old_val < 0; -} +extern int atomic64_add_negative(u64 delta, atomic64_t *ptr); #include <asm-generic/atomic-long.h> #endif /* _ASM_X86_ATOMIC_32_H */ diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h index 0d63602..d605dc2 100644 --- a/arch/x86/include/asm/atomic_64.h +++ b/arch/x86/include/asm/atomic_64.h @@ -18,7 +18,10 @@ * * Atomically reads the value of @v. */ -#define atomic_read(v) ((v)->counter) +static inline int atomic_read(const atomic_t *v) +{ + return v->counter; +} /** * atomic_set - set atomic variable @@ -27,7 +30,10 @@ * * Atomically sets the value of @v to @i. */ -#define atomic_set(v, i) (((v)->counter) = (i)) +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} /** * atomic_add - add integer to atomic variable @@ -192,7 +198,10 @@ static inline int atomic_sub_return(int i, atomic_t *v) * Atomically reads the value of @v. * Doesn't imply a read memory barrier. */ -#define atomic64_read(v) ((v)->counter) +static inline long atomic64_read(const atomic64_t *v) +{ + return v->counter; +} /** * atomic64_set - set atomic64 variable @@ -201,7 +210,10 @@ static inline int atomic_sub_return(int i, atomic_t *v) * * Atomically sets the value of @v to @i. 
*/ -#define atomic64_set(v, i) (((v)->counter) = (i)) +static inline void atomic64_set(atomic64_t *v, long i) +{ + v->counter = i; +} /** * atomic64_add - add integer to atomic64 variable @@ -355,11 +367,25 @@ static inline long atomic64_sub_return(long i, atomic64_t *v) #define atomic64_inc_return(v) (atomic64_add_return(1, (v))) #define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) -#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) +static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline long atomic64_xchg(atomic64_t *v, long new) +{ + return xchg(&v->counter, new); +} -#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) +static inline long atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline long atomic_xchg(atomic_t *v, int new) +{ + return xchg(&v->counter, new); +} /** * atomic_add_unless - add unless the number is a given value diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 418e632..7a10659 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -8,7 +8,7 @@ #ifdef __KERNEL__ -#include <asm/page_types.h> +#include <asm/pgtable_types.h> /* Physical address where kernel should be loaded. */ #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ @@ -16,10 +16,10 @@ & ~(CONFIG_PHYSICAL_ALIGN - 1)) /* Minimum kernel alignment, as a power of two */ -#ifdef CONFIG_x86_64 +#ifdef CONFIG_X86_64 #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else -#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1) +#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER) #endif #define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index edc90f2..8406ed7 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -33,7 +33,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ efi_call_virt(f, a1, a2, a3, a4, a5, a6) -#define efi_ioremap(addr, size) ioremap_cache(addr, size) +#define efi_ioremap(addr, size, type) ioremap_cache(addr, size) #else /* !CONFIG_X86_32 */ @@ -84,7 +84,8 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) -extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size); +extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, + u32 type); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 2d81af3..7b2d71d 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -111,12 +111,9 @@ enum fixed_addresses { #ifdef CONFIG_PARAVIRT FIX_PARAVIRT_BOOTMAP, #endif - FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ - FIX_TEXT_POKE1, + FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ + FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ __end_of_permanent_fixed_addresses, -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif /* * 256 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. 
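The FIX_TEXT_POKE reordering above works because fixmap slots are handed out downward: a higher enum index maps to a lower virtual address, so the page that must come first in memory is declared last. A small sketch of that relationship; FIXADDR_TOP and PAGE_SHIFT are placeholder values and fix_to_virt() is written out in its usual top-down form rather than taken verbatim from the kernel.

#include <stdio.h>

#define PAGE_SHIFT      12
#define FIXADDR_TOP     0xfffff000UL                       /* placeholder value */
#define fix_to_virt(x)  (FIXADDR_TOP - ((unsigned long)(x) << PAGE_SHIFT))

enum { FIX_TEXT_POKE1, FIX_TEXT_POKE0 };   /* POKE0 declared last => lower address */

int main(void)
{
        /* POKE0 lands one page below POKE1, so the pair is contiguous with
         * the "first" text_poke() page at the lower address. */
        printf("FIX_TEXT_POKE0 -> %#lx\n", fix_to_virt(FIX_TEXT_POKE0));
        printf("FIX_TEXT_POKE1 -> %#lx\n", fix_to_virt(FIX_TEXT_POKE1));
        return 0;
}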
@@ -129,6 +126,9 @@ enum fixed_addresses { FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - (__end_of_permanent_fixed_addresses & 255), FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index daf866e..330ee80 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -161,6 +161,7 @@ extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); +extern void ioapic_insert_resources(void); extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); @@ -180,6 +181,7 @@ extern void ioapic_write_entry(int apic, int pin, #define io_apic_assign_pci_irqs 0 static const int timer_through_8259 = 0; static inline void ioapic_init_mappings(void) { } +static inline void ioapic_insert_resources(void) { } static inline void probe_nr_irqs_gsi(void) { } #endif diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 2bdab21..c6ccbe7 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -12,9 +12,15 @@ static inline unsigned long native_save_fl(void) { unsigned long flags; + /* + * Note: this needs to be "=r" not "=rm", because we have the + * stack offset from what gcc expects at the time the "pop" is + * executed, and so a memory reference with respect to the stack + * would end up using the wrong address. + */ asm volatile("# __raw_save_flags\n\t" "pushf ; pop %0" - : "=g" (flags) + : "=r" (flags) : /* no input */ : "memory"); diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 313389c..5136dad 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -17,8 +17,7 @@ /* Pages for switcher itself, then two pages per cpu */ #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) -/* We map at -4M (-2M when PAE is activated) for ease of mapping - * into the guest (one PTE page). */ +/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ #ifdef CONFIG_X86_PAE #define SWITCHER_ADDR 0xFFE00000 #else diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index d31c4a6..ba0eed8 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -30,27 +30,27 @@ #include <asm/hw_irq.h> #include <asm/kvm_para.h> -/*G:031 But first, how does our Guest contact the Host to ask for privileged +/*G:030 + * But first, how does our Guest contact the Host to ask for privileged * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism. Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. + * We use the KVM hypercall mechanism, though completely different hypercall + * numbers. Seventeen hypercalls are available: the hypercall number is put in + * the %eax register, and the arguments (when required) are placed in %ebx, + * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. 
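The irqflags.h hunk above explains why native_save_fl() must use an "=r" output for the pushf/pop sequence. A stand-alone user-space version of the same pattern, for illustration only:

static inline unsigned long read_eflags(void)
{
        unsigned long flags;

        /*
         * "=r" forces a register destination. With "=g" (or "=rm") the
         * compiler may pick a stack slot addressed relative to the stack
         * pointer, but the pushf has already moved the stack pointer, so
         * the pop would store through the wrong address.
         */
        asm volatile("pushf ; pop %0"
                     : "=r" (flags)
                     : /* no input */
                     : "memory");
        return flags;
}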
* * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". */ -/*:*/ + * definition of a gentleman: "someone who is only rude intentionally". +:*/ /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) #define LHCALL_RING_SIZE 64 struct hcall_args { - /* These map directly onto eax, ebx, ecx, edx and esi - * in struct lguest_regs */ + /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ unsigned long arg0, arg1, arg2, arg3, arg4; }; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1692fb5..6be7fc2 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -246,10 +246,6 @@ #define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) -/* Intel Model 6 */ -#define MSR_P6_EVNTSEL0 0x00000186 -#define MSR_P6_EVNTSEL1 0x00000187 - /* P4/Xeon+ specific */ #define MSR_IA32_MCG_EAX 0x00000180 #define MSR_IA32_MCG_EBX 0x00000181 diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c972644..c86e5ed4 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -72,7 +72,6 @@ void lapic_watchdog_stop(void); int lapic_watchdog_init(unsigned nmi_hz); int lapic_wd_event(unsigned nmi_hz); unsigned lapic_adjust_nmi_hz(unsigned hz); -int lapic_watchdog_ok(void); void disable_lapic_nmi_watchdog(void); void enable_lapic_nmi_watchdog(void); void stop_nmi(void); diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 927958d..1ff685c 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -91,7 +91,7 @@ extern void pci_iommu_alloc(void); #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) -#if defined(CONFIG_X86_64) || defined(CONFIG_DMA_API_DEBUG) +#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG) #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ dma_addr_t ADDR_NAME; diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 02ecb30..103f1dd 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -42,6 +42,7 @@ #else /* ...!ASSEMBLY */ +#include <linux/kernel.h> #include <linux/stringify.h> #ifdef CONFIG_SMP @@ -155,6 +156,15 @@ do { \ /* We can use this directly for local CPU (faster). 
*/ DECLARE_PER_CPU(unsigned long, this_cpu_off); +#ifdef CONFIG_NEED_MULTIPLE_NODES +void *pcpu_lpage_remapped(void *kaddr); +#else +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 5fb33e1..fa64e40 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -87,6 +87,9 @@ union cpuid10_edx { #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(void); + +#define PERF_COUNTER_INDEX_OFFSET 0 + #else static inline void init_hw_perf_counters(void) { } static inline void perf_counters_lapic_init(void) { } diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index dd14c54..0e8c2a0 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -46,7 +46,13 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte) __free_page(pte); } -extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); +extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); + +static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, + unsigned long address) +{ + ___pte_free_tlb(tlb, pte); +} static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) @@ -78,7 +84,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); +extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); + +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long adddress) +{ + ___pmd_free_tlb(tlb, pmd); +} #ifdef CONFIG_X86_PAE extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); @@ -108,7 +120,14 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) free_page((unsigned long)pud); } -extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); +extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); + +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + ___pud_free_tlb(tlb, pud); +} + #endif /* PAGETABLE_LEVELS > 3 */ #endif /* PAGETABLE_LEVELS > 2 */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3cc06e3..1674807 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -2,6 +2,7 @@ #define _ASM_X86_PGTABLE_H #include <asm/page.h> +#include <asm/e820.h> #include <asm/pgtable_types.h> @@ -269,10 +270,17 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) -static inline int is_new_memtype_allowed(unsigned long flags, - unsigned long new_flags) +static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, + unsigned long flags, + unsigned long new_flags) { /* + * PAT type is always WB for ISA. So no need to check. 
+ */ + if (is_ISA_range(paddr, paddr + size - 1)) + return 1; + + /* * Certain new memtypes are not allowed with certain * requested memtype: * - request is uncached, return cannot be write-back diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 49fb3ec..621f56d 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -22,7 +22,14 @@ extern int reboot_force; long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); -#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1)) -#define round_down(x, y) ((x) & ~((y) - 1)) +/* + * This looks more complex than it should be. But we need to + * get the type for the ~ right in round_down (it needs to be + * as wide as the result!), and we want to evaluate the macro + * arguments just once each. + */ +#define __round_mask(x,y) ((__typeof__(x))((y)-1)) +#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1) +#define round_down(x,y) ((x) & ~__round_mask(x,y)) #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index b7e5db8..4e77853 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -302,4 +302,8 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() +/* The {read|write|spin}_lock() on x86 are full memory barriers. */ +static inline void smp_mb__after_lock(void) { } +#define ARCH_HAS_SMP_MB_AFTER_LOCK + #endif /* _ASM_X86_SPINLOCK_H */ diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f517944..cf86a5e 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -3,6 +3,8 @@ extern int kstack_depth_to_print; +int x86_is_stack_id(int id, char *name); + /* Generic stack tracer with callbacks */ struct stacktrace_ops { diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index b078352..fad7d40 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -49,7 +49,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 20e6a79..d2c6c93 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -212,9 +212,9 @@ extern int __get_user_bad(void); : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") #else #define __put_user_asm_u64(x, ptr, retval, errret) \ - __put_user_asm(x, ptr, retval, "q", "", "Zr", errret) + __put_user_asm(x, ptr, retval, "q", "", "er", errret) #define __put_user_asm_ex_u64(x, addr) \ - __put_user_asm_ex(x, addr, "q", "", "Zr") + __put_user_asm_ex(x, addr, "q", "", "er") #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) #endif diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 8cc6873..db24b21 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -88,11 +88,11 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) ret, "l", "k", "ir", 4); return ret; case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "ir", 8); + ret, "q", "", "er", 8); return ret; case 10: __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "ir", 10); + ret, "q", "", "er", 10); if 
(unlikely(ret)) return ret; asm("":::"memory"); @@ -101,12 +101,12 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) return ret; case 16: __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "ir", 16); + ret, "q", "", "er", 16); if (unlikely(ret)) return ret; asm("":::"memory"); __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, - ret, "q", "", "ir", 8); + ret, "q", "", "er", 8); return ret; default: return copy_user_generic((__force void *)dst, src, size); @@ -157,7 +157,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) ret, "q", "", "=r", 8); if (likely(!ret)) __put_user_asm(tmp, (u64 __user *)dst, - ret, "q", "", "ir", 8); + ret, "q", "", "er", 8); return ret; } default: diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index bddd44f..80e2984 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -133,7 +133,7 @@ struct bau_msg_payload { * see table 4.2.3.0.1 in broacast_assist spec. */ struct bau_msg_header { - unsigned int dest_subnodeid:6; /* must be zero */ + unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ /* bits 5:0 */ unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ /* bits 20:6 */ /* first bit in node_map */ diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 341070f..77a6850 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -175,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) #define UV_GLOBAL_MMR64_PNODE_BITS(p) \ - ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) + (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) #define UV_APIC_PNODE_SHIFT 6 @@ -327,6 +327,7 @@ struct uv_blade_info { unsigned short nr_possible_cpus; unsigned short nr_online_cpus; unsigned short pnode; + short memory_nid; }; extern struct uv_blade_info *uv_blade_info; extern short *uv_node_to_blade; @@ -363,6 +364,12 @@ static inline int uv_blade_to_pnode(int bid) return uv_blade_info[bid].pnode; } +/* Nid of memory node on blade. 
-1 if no blade-local memory */ +static inline int uv_blade_to_memory_nid(int bid) +{ + return uv_blade_info[bid].memory_nid; +} + /* Determine the number of possible cpus on a blade */ static inline int uv_blade_nr_possible_cpus(int bid) { diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 6c327b8..430d5b2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,6 +26,8 @@ CFLAGS_tsc.o := $(nostackp) CFLAGS_paravirt.o := $(nostackp) GCOV_PROFILE_vsyscall_64.o := n GCOV_PROFILE_hpet.o := n +GCOV_PROFILE_tsc.o := n +GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 9372f04..6c99f50 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1192,7 +1192,7 @@ out: return 0; } -struct notifier_block device_nb = { +static struct notifier_block device_nb = { .notifier_call = device_change_notifier, }; @@ -1763,7 +1763,7 @@ static void *alloc_coherent(struct device *dev, size_t size, flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) - return 0; + return NULL; paddr = virt_to_phys(virt_addr); diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 10b2acc..c1b17e9 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -472,6 +472,8 @@ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) if (iommu->evt_buf == NULL) return NULL; + iommu->evt_buf_size = EVT_BUFFER_SIZE; + return iommu->evt_buf; } @@ -691,6 +693,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, devid = e->devid; devid_to = e->ext >> 8; + set_dev_entry_from_acpi(iommu, devid , e->flags, 0); set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; @@ -749,11 +752,13 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { - if (alias) + if (alias) { amd_iommu_alias_table[dev_i] = devid_to; - set_dev_entry_from_acpi(iommu, - amd_iommu_alias_table[dev_i], - flags, ext_flags); + set_dev_entry_from_acpi(iommu, + devid_to, flags, ext_flags); + } + set_dev_entry_from_acpi(iommu, dev_i, + flags, ext_flags); } break; default: diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8c7c042..0a1c283 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -140,7 +140,6 @@ int x2apic_mode; #ifdef CONFIG_X86_X2APIC /* x2apic enabled before OS handover */ static int x2apic_preenabled; -static int disable_x2apic; static __init int setup_nox2apic(char *str) { if (x2apic_enabled()) { @@ -149,7 +148,6 @@ static __init int setup_nox2apic(char *str) return 0; } - disable_x2apic = 1; setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; } diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 69328ac..8952a58 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -652,7 +652,8 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem, return ret && es7000_apic_is_cluster(); } -struct apic apic_es7000_cluster = { +/* We've been warned by a false positive warning.Use __refdata to keep calm. 
*/ +struct apic __refdata apic_es7000_cluster = { .name = "es7000", .probe = probe_es7000, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4d0216f..d2ed6c5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1716,25 +1716,19 @@ __apicdebuginit(void) print_IO_APIC(void) return; } -__apicdebuginit(void) print_APIC_bitfield(int base) +__apicdebuginit(void) print_APIC_field(int base) { - unsigned int v; - int i, j; + int i; if (apic_verbosity == APIC_QUIET) return; - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1<<j)) - printk("1"); - else - printk("0"); - } - printk("\n"); - } + printk(KERN_DEBUG); + + for (i = 0; i < 8; i++) + printk(KERN_CONT "%08x", apic_read(base + i*0x10)); + + printk(KERN_CONT "\n"); } __apicdebuginit(void) print_local_APIC(void *dummy) @@ -1745,7 +1739,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy) if (apic_verbosity == APIC_QUIET) return; - printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", + printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); @@ -1786,11 +1780,11 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); printk(KERN_DEBUG "... APIC ISR field:\n"); - print_APIC_bitfield(APIC_ISR); + print_APIC_field(APIC_ISR); printk(KERN_DEBUG "... APIC TMR field:\n"); - print_APIC_bitfield(APIC_TMR); + print_APIC_field(APIC_TMR); printk(KERN_DEBUG "... APIC IRR field:\n"); - print_APIC_bitfield(APIC_IRR); + print_APIC_field(APIC_IRR); if (APIC_INTEGRATED(ver)) { /* !82489DX */ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ @@ -3799,6 +3793,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + return irq; } @@ -4187,28 +4184,20 @@ fake_ioapic_page: } } -static int __init ioapic_insert_resources(void) +void __init ioapic_insert_resources(void) { int i; struct resource *r = ioapic_resources; if (!r) { - if (nr_ioapics > 0) { + if (nr_ioapics > 0) printk(KERN_ERR "IO APIC resources couldn't be allocated.\n"); - return -1; - } - return 0; + return; } for (i = 0; i < nr_ioapics; i++) { insert_resource(&iomem_resource, r); r++; } - - return 0; } - -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. 
*/ -late_initcall(ioapic_insert_resources); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index dbf5445..6ef00ba 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -106,6 +106,9 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) unsigned long mask = cpumask_bits(cpumask)[0]; unsigned long flags; + if (WARN_ONCE(!mask, "empty IPI mask")) + return; + local_irq_save(flags); WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); __default_send_IPI_dest_field(mask, vector, apic->dest_logical); diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 533e59c..ca96e68 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -493,7 +493,8 @@ static void numaq_setup_portio_remap(void) (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); } -struct apic apic_numaq = { +/* Use __refdata to keep false positive warning calm. */ +struct apic __refdata apic_numaq = { .name = "NUMAQ", .probe = probe_numaq, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 8e4cbb2..a5371ec 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -17,11 +17,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return x2apic_enabled(); } -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - +/* + * need to use more than cpu 0, because we need more vectors when + * MSI-X are used. + */ static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } /* @@ -170,7 +172,7 @@ static unsigned long set_apic_id(unsigned int id) static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) { - return current_cpu_data.initial_apicid >> index_msb; + return initial_apicid >> index_msb; } static void x2apic_send_IPI_self(int vector) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a284359..a8989aa 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - +/* + * need to use more than cpu 0, because we need more vectors when + * MSI-X are used. 
+ */ static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) @@ -162,7 +164,7 @@ static unsigned long set_apic_id(unsigned int id) static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) { - return current_cpu_data.initial_apicid >> index_msb; + return initial_apicid >> index_msb; } static void x2apic_send_IPI_self(int vector) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 096d19a..6011593 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -46,7 +46,7 @@ static int early_get_nodeid(void) return node_id.s.node_id; } -static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strcmp(oem_id, "SGI")) { if (!strcmp(oem_table_id, "UVL")) @@ -253,7 +253,7 @@ static void uv_send_IPI_self(int vector) apic_write(APIC_SELF_IPI, vector); } -struct apic apic_x2apic_uv_x = { +struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", .probe = NULL, @@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = { .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 1, /* logical */ + .irq_dest_mode = 0, /* physical */ .target_cpus = uv_target_cpus, .disable_esr = 0, @@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) BUG(); } -static __init void map_low_mmrs(void) -{ - init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); - init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); -} - enum map_type {map_wb, map_uc}; static __init void map_high(char *id, unsigned long base, int shift, @@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode) map_high("GRU", gru.s.base, shift, max_pnode, map_wb); } -static __init void map_config_high(int max_pnode) -{ - union uvh_rh_gam_cfg_overlay_config_mmr_u cfg; - int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT; - - cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); - if (cfg.s.enable) - map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); -} - -static __init void map_mmr_high(int max_pnode) -{ - union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; - int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; - - mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); - if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); -} - static __init void map_mmioh_high(int max_pnode) { union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; @@ -566,8 +540,6 @@ void __init uv_system_init(void) unsigned long mmr_base, present, paddr; unsigned short pnode_mask; - map_low_mmrs(); - m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; @@ -591,6 +563,8 @@ void __init uv_system_init(void) bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = kmalloc(bytes, GFP_KERNEL); BUG_ON(!uv_blade_info); + for (blade = 0; blade < uv_num_possible_blades(); blade++) + uv_blade_info[blade].memory_nid = -1; get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); @@ -629,6 +603,9 @@ void __init uv_system_init(void) lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; + /* Any node on the blade, else will contain -1. 
*/ + uv_blade_info[blade].memory_nid = nid; + uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; @@ -662,11 +639,10 @@ void __init uv_system_init(void) pnode = (paddr >> m_val) & pnode_mask; blade = boot_pnode_to_blade(pnode); uv_node_to_blade[nid] = blade; + max_pnode = max(pnode, max_pnode); } map_gru_high(max_pnode); - map_mmr_high(max_pnode); - map_config_high(max_pnode); map_mmioh_high(max_pnode); uv_cpu_init(); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 79302e9..442b550 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -811,7 +811,7 @@ static int apm_do_idle(void) u8 ret = 0; int idled = 0; int polling; - int err; + int err = 0; polling = !!(current_thread_info()->status & TS_POLLING); if (polling) { diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3efcb2b..c1f253d 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -7,6 +7,10 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_common.o = -pg endif +# Make sure load_percpu_segment has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_common.o := $(nostackp) + obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e5b27d8..63fddcd0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -258,13 +258,15 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits; + int cpu = smp_processor_id(); bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; #endif } @@ -354,7 +356,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) /* check CPU config space for extended APIC ID */ - if (c->x86 >= 0xf) { + if (cpu_has_apic && c->x86 >= 0xf) { unsigned int val; val = read_pci_config(0, 24, 0, 0x68); if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) @@ -398,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) level = cpuid_eax(1); if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* + * Some BIOSes incorrectly force this feature, but only K8 + * revision D (model = 0x14) and later actually support it. + */ + if (c->x86_model < 0x14) + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); } if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6b26d4d..5ce60a8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } -static const struct cpu_dev *this_cpu __cpuinitdata; +static void __cpuinit default_init(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + display_cacheinfo(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. 
It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev __cpuinitconst default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu) static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_64 - display_cacheinfo(c); -#else - /* Not much we can do here... */ - /* Check if at least it has cpuid */ - if (c->cpuid_level == -1) { - /* No cpuid. It must be an ancient CPU */ - if (c->x86 == 4) - strcpy(c->x86_model_id, "486"); - else if (c->x86 == 3) - strcpy(c->x86_model_id, "386"); - } -#endif -} - -static const struct cpu_dev __cpuinitconst default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", - .c_x86_vendor = X86_VENDOR_UNKNOWN, -}; - static void __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; @@ -848,9 +848,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) numa_add_cpu(smp_processor_id()); #endif - - /* Cap the iomem address space to what is addressable on all CPUs */ - iomem_resource.end &= (1ULL << c->x86_phys_bits) - 1; } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 81cbe64..2a50ef8 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -299,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate) static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid) { - if (core_voltage_pre_transition(data, reqvid)) + if (core_voltage_pre_transition(data, reqvid, reqfid)) return 1; if (core_frequency_transition(data, reqfid)) @@ -327,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data, /* Phase 1 - core voltage transition ... 
setup voltage */ static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid) + u32 reqvid, u32 reqfid) { u32 rvosteps = data->rvo; u32 savefid = data->currfid; - u32 maxvid, lo; + u32 maxvid, lo, rvomult = 1; dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " "reqvid 0x%x, rvo 0x%x\n", smp_processor_id(), data->currfid, data->currvid, reqvid, data->rvo); + if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) + rvomult = 2; + rvosteps *= rvomult; rdmsr(MSR_FIDVID_STATUS, lo, maxvid); maxvid = 0x1f & (maxvid >> 16); dprintk("ph1 maxvid=0x%x\n", maxvid); @@ -351,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, return 1; } - while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { + while ((rvosteps > 0) && + ((rvomult * data->rvo + data->currvid) > reqvid)) { if (data->currvid == maxvid) { rvosteps = 0; } else { @@ -384,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) u32 vcoreqfid, vcocurrfid, vcofiddiff; u32 fid_interval, savevid = data->currvid; - if ((reqfid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX "ph2: illegal lo-lo transition " - "0x%x 0x%x\n", reqfid, data->currfid); - return 1; - } - if (data->currfid == reqfid) { printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid); @@ -407,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid : vcoreqfid - vcocurrfid; + if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) + vcofiddiff = 0; + while (vcofiddiff > 2) { (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); @@ -1081,14 +1081,6 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, return 0; } - if ((fid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX - "ignoring illegal change in lo freq table-%x to 0x%x\n", - data->currfid, fid); - return 1; - } - dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", smp_processor_id(), fid, vid); freqs.old = find_khz_freq_from_fid(data->currfid); @@ -1267,7 +1259,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { static const char ACPI_PSS_BIOS_BUG_MSG[] = KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" - KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; + FW_BUG PFX "Try again with latest BIOS.\n"; struct powernow_k8_data *data; struct init_on_cpu init_on_cpu; int rc; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index c9c1190..02ce824 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -215,7 +215,8 @@ struct pst_s { #define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) -static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); +static int core_voltage_pre_transition(struct powernow_k8_data *data, + u32 reqvid, u32 regfid); static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de..0121304 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -194,14 +194,14 @@ static void print_mce(struct mce *m) m->cs, m->ip); if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); - printk("\n"); + printk(KERN_CONT "\n"); } printk(KERN_EMERG "TSC %llx ", m->tsc); if (m->addr) - printk("ADDR %llx ", m->addr); + printk(KERN_CONT "ADDR %llx ", m->addr); if (m->misc) - printk("MISC %llx ", m->misc); - printk("\n"); + printk(KERN_CONT "MISC %llx ", m->misc); + printk(KERN_CONT "\n"); printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); @@ -209,13 +209,13 @@ static void print_mce(struct mce *m) static void print_mce_head(void) { - printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); + printk(KERN_EMERG "\nHARDWARE ERROR\n"); } static void print_mce_tail(void) { printk(KERN_EMERG "This is not a software problem!\n" - KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); + "Run through mcelog --ascii to decode and contact your hardware vendor\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -1117,7 +1117,7 @@ static void mcheck_timer(unsigned long data) *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); t->expires = jiffies + *n; - add_timer(t); + add_timer_on(t, smp_processor_id()); } static void mce_do_trigger(struct work_struct *work) @@ -1226,8 +1226,13 @@ static void mce_init(void) } /* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) +static int mce_cpu_quirks(struct cpuinfo_x86 *c) { + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); + return -EOPNOTSUPP; + } + /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { if (c->x86 == 15 && banks > 4) { @@ -1273,11 +1278,20 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && monarch_timeout < 0) monarch_timeout = USEC_PER_SEC; + + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) + mce_bootlog = 0; } if (monarch_timeout < 0) monarch_timeout = 0; if (mce_bootlog != 0) mce_panic_timeout = 30; + + return 0; } static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) @@ -1321,7 +1335,7 @@ static void mce_init_timer(void) return; setup_timer(t, mcheck_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); - add_timer(t); + add_timer_on(t, smp_processor_id()); } /* @@ -1338,11 +1352,10 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) if (!mce_available(c)) return; - if (mce_cap_init() < 0) { + if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { mce_disabled = 1; return; } - mce_cpu_quirks(c); machine_check_vector = do_machine_check; @@ -1692,17 +1705,15 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, const char *buf, size_t siz) { char *p; - 
int len; strncpy(mce_helper, buf, sizeof(mce_helper)); mce_helper[sizeof(mce_helper)-1] = 0; - len = strlen(mce_helper); p = strchr(mce_helper, '\n'); - if (*p) + if (p) *p = 0; - return len; + return strlen(mce_helper) + !!p; } static ssize_t set_ignore_ce(struct sys_device *s, diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd1..5957a93 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -36,6 +36,7 @@ static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); +static DEFINE_PER_CPU(bool, thermal_throttle_active); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -96,27 +97,33 @@ static int therm_throt_process(int curr) { unsigned int cpu = smp_processor_id(); __u64 tmp_jiffs = get_jiffies_64(); + bool was_throttled = __get_cpu_var(thermal_throttle_active); + bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; - if (curr) + if (is_throttled) __get_cpu_var(thermal_throttle_count)++; - if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) + if (!(was_throttled ^ is_throttled) && + time_before64(tmp_jiffs, __get_cpu_var(next_check))) return 0; __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; /* if we just entered the thermal event */ - if (curr) { + if (is_throttled) { printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled (total events = %lu)\n", cpu, - __get_cpu_var(thermal_throttle_count)); + "cpu clock throttled (total events = %lu)\n", + cpu, __get_cpu_var(thermal_throttle_count)); add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + return 1; + } + if (was_throttled) { + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); + return 1; } - return 1; + return 0; } #ifdef CONFIG_SYSFS diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 76dfef2..900332b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -55,6 +55,7 @@ struct x86_pmu { int num_counters_fixed; int counter_bits; u64 counter_mask; + int apic; u64 max_period; u64 intel_ctrl; }; @@ -66,6 +67,52 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { }; /* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int event) +{ + return p6_perfmon_event_map[event]; +} + +/* + * Counter setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_COUNTER 0x0000002EULL + +static u64 p6_pmu_raw_event(u64 event) +{ +#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL +#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL +#define P6_EVNTSEL_INV_MASK 0x00800000ULL +#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL + +#define P6_EVNTSEL_MASK \ + (P6_EVNTSEL_EVENT_MASK | \ + P6_EVNTSEL_UNIT_MASK | \ + P6_EVNTSEL_EDGE_MASK | \ + P6_EVNTSEL_INV_MASK | \ + P6_EVNTSEL_COUNTER_MASK) + + return event & P6_EVNTSEL_MASK; +} + + +/* * Intel PerfMon v3. Used on Core2 and later. 
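The p6_pmu_raw_event() masks above define which bits of a user-supplied raw config survive. A hedged stand-alone sketch that composes and filters a raw P6-style event select value; the helper names are illustrative, and 0x0f2e is the same L2_RQSTS-with-full-MESI-unit-mask encoding the p6 event map uses for cache references.

#include <stdint.h>

#define P6_EVNTSEL_EVENT_MASK   0x000000FFULL   /* event select code       */
#define P6_EVNTSEL_UNIT_MASK    0x0000FF00ULL   /* unit mask (e.g. MESI)   */
#define P6_EVNTSEL_EDGE_MASK    0x00040000ULL
#define P6_EVNTSEL_INV_MASK     0x00800000ULL
#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL

#define P6_EVNTSEL_MASK (P6_EVNTSEL_EVENT_MASK | P6_EVNTSEL_UNIT_MASK | \
                         P6_EVNTSEL_EDGE_MASK | P6_EVNTSEL_INV_MASK |   \
                         P6_EVNTSEL_COUNTER_MASK)

static uint64_t p6_raw_event_filter(uint64_t event)
{
        /* keep only the user-controllable fields, as the hunk above does */
        return event & P6_EVNTSEL_MASK;
}

int main(void)
{
        /* event 0x2E (L2_RQSTS) with unit mask 0x0F (all MESI states) */
        uint64_t raw = (0x0FULL << 8) | 0x2E;
        return p6_raw_event_filter(raw) == 0x0F2EULL ? 0 : 1;
}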
*/ static const u64 intel_perfmon_event_map[] = @@ -401,7 +448,7 @@ static const u64 amd_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0042, /* Data Cache Refills from L2 */ + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { @@ -567,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; if (nmi_watchdog == NMI_LOCAL_APIC) @@ -581,9 +629,11 @@ static bool reserve_pmc_hardware(void) if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } +#endif return true; +#ifdef CONFIG_X86_LOCAL_APIC eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); @@ -598,10 +648,12 @@ perfctr_fail: enable_lapic_nmi_watchdog(); return false; +#endif } static void release_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; for (i = 0; i < x86_pmu.num_counters; i++) { @@ -611,6 +663,7 @@ static void release_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); +#endif } static void hw_perf_counter_destroy(struct perf_counter *counter) @@ -666,6 +719,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_attr *attr = &counter->attr; struct hw_perf_counter *hwc = &counter->hw; + u64 config; int err; if (!x86_pmu_initialized()) @@ -701,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); + } else { + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * counters (user-space has to fall back and + * sample via a hrtimer based software counter): + */ + if (!x86_pmu.apic) + return -EOPNOTSUPP; } counter->destroy = hw_perf_counter_destroy; @@ -718,14 +781,40 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (attr->config >= x86_pmu.max_events) return -EINVAL; + /* * The generic map: */ - hwc->config |= x86_pmu.event_map(attr->config); + config = x86_pmu.event_map(attr->config); + + if (config == 0) + return -ENOENT; + + if (config == -1LL) + return -EINVAL; + + hwc->config |= config; return 0; } +static void p6_pmu_disable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + u64 val; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + static void intel_pmu_disable_all(void) { wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); @@ -767,6 +856,23 @@ void hw_perf_disable(void) return x86_pmu.disable_all(); } +static void p6_pmu_enable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long val; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + static void intel_pmu_enable_all(void) { wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); @@ -784,13 +890,13 @@ static void amd_pmu_enable_all(void) barrier(); for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_counter *counter = cpuc->counters[idx]; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - 
if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) - continue; + + val = counter->hw.config; val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } @@ -819,16 +925,13 @@ static inline void intel_pmu_ack_status(u64 ack) static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - int err; - err = checking_wrmsrl(hwc->config_base + idx, + (void)checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - int err; - err = checking_wrmsrl(hwc->config_base + idx, - hwc->config); + (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); } static inline void @@ -836,13 +939,24 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; - int err; mask = 0xfULL << (idx * 4); rdmsrl(hwc->config_base, ctrl_val); ctrl_val &= ~mask; - err = checking_wrmsrl(hwc->config_base, ctrl_val); + (void)checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static inline void +p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + u64 val = P6_NOP_COUNTER; + + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); } static inline void @@ -912,6 +1026,8 @@ x86_perf_counter_set_period(struct perf_counter *counter, err = checking_wrmsrl(hwc->counter_base + idx, (u64)(-left) & x86_pmu.counter_mask); + perf_counter_update_userpage(counter); + return ret; } @@ -941,6 +1057,19 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) err = checking_wrmsrl(hwc->config_base, ctrl_val); } +static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + u64 val; + + val = hwc->config; + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + + static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { @@ -957,8 +1086,6 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) if (cpuc->enabled) x86_pmu_enable_counter(hwc, idx); - else - x86_pmu_disable_counter(hwc, idx); } static int @@ -969,13 +1096,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) if (!x86_pmu.num_counters_fixed) return -1; - /* - * Quirk, IA32_FIXED_CTRs do not work on current Atom processors: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86_model == 28) - return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) @@ -1041,6 +1161,8 @@ try_generic: x86_perf_counter_set_period(counter, hwc, idx); x86_pmu.enable(hwc, idx); + perf_counter_update_userpage(counter); + return 0; } @@ -1133,6 +1255,8 @@ static void x86_pmu_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); cpuc->counters[idx] = NULL; clear_bit(idx, cpuc->used_mask); + + perf_counter_update_userpage(counter); } /* @@ -1177,6 +1301,49 @@ static void intel_pmu_reset(void) local_irq_restore(flags); } +static int p6_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_counters *cpuc; + struct perf_counter *counter; + struct hw_perf_counter *hwc; + int idx, handled = 0; + u64 val; + + data.regs = regs; + data.addr 
= 0; + + cpuc = &__get_cpu_var(cpu_hw_counters); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + counter = cpuc->counters[idx]; + hwc = &counter->hw; + + val = x86_perf_counter_update(counter, hwc, idx); + if (val & (1ULL << (x86_pmu.counter_bits - 1))) + continue; + + /* + * counter overflow + */ + handled = 1; + data.period = counter->hw.last_period; + + if (!x86_perf_counter_set_period(counter, hwc, idx)) + continue; + + if (perf_counter_overflow(counter, 1, &data)) + p6_pmu_disable_counter(hwc, idx); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} /* * This handler is triggered by the local APIC, so the APIC IRQ handling @@ -1186,14 +1353,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_counters *cpuc; - int bit, cpu, loops; + int bit, loops; u64 ack, status; data.regs = regs; data.addr = 0; - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + cpuc = &__get_cpu_var(cpu_hw_counters); perf_disable(); status = intel_pmu_get_status(); @@ -1250,14 +1416,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; - int cpu, idx, handled = 0; + int idx, handled = 0; u64 val; data.regs = regs; data.addr = 0; - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + cpuc = &__get_cpu_var(cpu_hw_counters); for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) @@ -1300,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) void set_perf_counter_pending(void) { +#ifdef CONFIG_X86_LOCAL_APIC apic->send_IPI_self(LOCAL_PENDING_VECTOR); +#endif } void perf_counters_lapic_init(void) { - if (!x86_pmu_initialized()) +#ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif } static int __kprobes @@ -1335,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; +#ifdef CONFIG_X86_LOCAL_APIC apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif /* * Can't rely on the handled return value to say it was our NMI, two * counters could trigger 'simultaneously' raising two back-to-back NMIs. @@ -1354,6 +1525,33 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; +static struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = p6_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_counter, + .disable = p6_pmu_disable_counter, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .raw_event = p6_pmu_raw_event, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_counters = 2, + /* + * Counters have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a counter for P6-like PMU is 32 bits only. 
+ * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .counter_bits = 32, + .counter_mask = (1ULL << 32) - 1, +}; + static struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -1366,6 +1564,7 @@ static struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -1389,10 +1588,43 @@ static struct x86_pmu amd_pmu = { .num_counters = 4, .counter_bits = 48, .counter_mask = (1ULL << 48) - 1, + .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, }; +static int p6_pmu_init(void) +{ + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + break; + case 9: + case 13: + /* Pentium M */ + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + if (!cpu_has_apic) { + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); + x86_pmu.apic = 0; + } + + return 0; +} + static int intel_pmu_init(void) { union cpuid10_edx edx; @@ -1401,8 +1633,14 @@ static int intel_pmu_init(void) unsigned int ebx; int version; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + /* check for P6 processor family */ + if (boot_cpu_data.x86 == 6) { + return p6_pmu_init(); + } else { return -ENODEV; + } + } /* * Check whether the Architectural PerfMon supports @@ -1428,8 +1666,6 @@ static int intel_pmu_init(void) */ x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); - /* * Install the hw-cache-events table: */ @@ -1499,21 +1735,22 @@ void __init init_hw_perf_counters(void) pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", x86_pmu.num_counters, X86_PMC_MAX_GENERIC); + x86_pmu.num_counters = X86_PMC_MAX_GENERIC; } perf_counter_mask = (1 << x86_pmu.num_counters) - 1; perf_max_counters = x86_pmu.num_counters; if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; } perf_counter_mask |= ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_counter_mask; perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); @@ -1563,6 +1800,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(int, in_nmi_frame); static void @@ -1578,7 +1816,9 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - /* Process all stacks: */ + per_cpu(in_nmi_frame, smp_processor_id()) = + x86_is_stack_id(NMI_STACK, name); + return 0; } @@ -1586,6 +1826,9 @@ static void backtrace_address(void *data, unsigned long addr, int 
reliable) { struct perf_callchain_entry *entry = data; + if (per_cpu(in_nmi_frame, smp_processor_id())) + return; + if (reliable) callchain_store(entry, addr); } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 5c481f6..e60ed74 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -803,8 +803,3 @@ int __kprobes lapic_wd_event(unsigned nmi_hz) wd_ops->rearm(wd, nmi_hz); return 1; } - -int lapic_watchdog_ok(void) -{ - return wd_ops != NULL; -} diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 95ea5fa..c8405718 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,6 +22,7 @@ #include "dumpstack.h" int panic_on_unrecovered_nmi; +int panic_on_io_nmi; unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index d593cd1..bca5fba 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -19,6 +19,12 @@ #include "dumpstack.h" +/* Just a stub for now */ +int x86_is_stack_id(int id, char *name) +{ + return 0; +} + void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index d35db59..54b0a32 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -19,10 +19,8 @@ #include "dumpstack.h" -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) -{ - static char ids[][8] = { + +static char x86_stack_ids[][8] = { [DEBUG_STACK - 1] = "#DB", [NMI_STACK - 1] = "NMI", [DOUBLEFAULT_STACK - 1] = "#DF", @@ -33,6 +31,15 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" #endif }; + +int x86_is_stack_id(int id, char *name) +{ + return x86_stack_ids[id - 1] == name; +} + +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, + unsigned *usedp, char **idp) +{ unsigned k; /* @@ -61,7 +68,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, if (*usedp & (1U << k)) break; *usedp |= 1U << k; - *idp = ids[k]; + *idp = x86_stack_ids[k]; return (unsigned long *)end; } /* @@ -81,12 +88,13 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, do { ++j; end -= EXCEPTION_STKSZ; - ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); + x86_stack_ids[j][4] = '1' + + (j - N_EXCEPTION_STACKS); } while (stack < end - EXCEPTION_STKSZ); if (*usedp & (1U << j)) break; *usedp |= 1U << j; - *idp = ids[j]; + *idp = x86_stack_ids[j]; return (unsigned long *)end; } #endif diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7271fa3..5cb5725 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -627,10 +627,9 @@ __init void e820_setup_gap(void) #ifdef CONFIG_X86_64 if (!found) { gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " - "address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource " - "registers may break!\n"); + printk(KERN_ERR + "PCI: Warning: Cannot find a gap in the 32bit address range\n" + "PCI: Unassigned devices with 32bit resource registers may break!\n"); } #endif @@ -1383,6 +1382,8 @@ static unsigned long 
ram_alignment(resource_size_t pos) return 32*1024*1024; } +#define MAX_RESOURCE_SIZE ((resource_size_t)-1) + void __init e820_reserve_resources_late(void) { int i; @@ -1400,17 +1401,19 @@ void __init e820_reserve_resources_late(void) * avoid stolen RAM: */ for (i = 0; i < e820.nr_map; i++) { - struct e820entry *entry = &e820_saved.map[i]; - resource_size_t start, end; + struct e820entry *entry = &e820.map[i]; + u64 start, end; if (entry->type != E820_RAM) continue; start = entry->addr + entry->size; - end = round_up(start, ram_alignment(start)); - if (start == end) + end = round_up(start, ram_alignment(start)) - 1; + if (end > MAX_RESOURCE_SIZE) + end = MAX_RESOURCE_SIZE; + if (start >= end) continue; - reserve_region_with_split(&iomem_resource, start, - end - 1, "RAM buffer"); + reserve_region_with_split(&iomem_resource, start, end, + "RAM buffer"); } } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 96f7ac0..fe26ba3 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -354,7 +354,7 @@ void __init efi_init(void) */ c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); if (c16) { - for (i = 0; i < sizeof(vendor) && *c16; ++i) + for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) vendor[i] = *c16++; vendor[i] = '\0'; } else @@ -512,7 +512,7 @@ void __init efi_enter_virtual_mode(void) && end_pfn <= max_pfn_mapped)) va = __va(md->phys_addr); else - va = efi_ioremap(md->phys_addr, size); + va = efi_ioremap(md->phys_addr, size, md->type); md->virt_addr = (u64) (unsigned long) va; diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 22c3b78..ac0621a 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void) early_runtime_code_mapping_set_exec(0); } -void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) +void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, + u32 type) { unsigned long last_map_pfn; + if (type == EFI_MEMORY_MAPPED_IO) + return ioremap(phys_addr, size); + last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) return NULL; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 8663afb..cc827ac 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -261,9 +261,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); * which will be freed later */ -#ifndef CONFIG_HOTPLUG_CPU -.section .init.text,"ax",@progbits -#endif +__CPUINIT #ifdef CONFIG_SMP ENTRY(startup_32_smp) @@ -602,7 +600,7 @@ ignore_int: #endif iret -.section .cpuinit.data,"wa" + __REFDATA .align 4 ENTRY(initial_code) .long i386_start_kernel diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 696f0e4..92b7703 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -187,7 +187,7 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_THERMAL_VECTOR alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif -#ifdef CONFIG_X86_THRESHOLD +#ifdef CONFIG_X86_MCE_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif #if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a78ecad..c664d51 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -200,7 +200,7 @@ static void kvm_leave_lazy_mmu(void) state->mode = paravirt_get_lazy_mode(); } -static void paravirt_ops_setup(void) +static void __init paravirt_ops_setup(void) { 
pv_info.name = "KVM"; pv_info.paravirt_enabled = 1; diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 846510b..2a62d84 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -347,7 +347,7 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id) static struct irqaction mfgptirq = { .handler = mfgpt_tick, - .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, .name = "mfgpt-timer" }; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 4763047..1a041bc 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -211,11 +211,11 @@ static __init int iommu_setup(char *p) #ifdef CONFIG_SWIOTLB if (!strncmp(p, "soft", 4)) swiotlb = 1; +#endif if (!strncmp(p, "pt", 2)) { iommu_pass_through = 1; return 1; } -#endif gart_parse_options(p); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index cfd9f90..d2e56b8 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -675,7 +675,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" - KERN_WARNING "falling back to iommu=soft.\n"); + "falling back to iommu=soft.\n"); return -1; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 994dd6a..071166a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -519,16 +519,12 @@ static void c1e_idle(void) if (!cpumask_test_cpu(cpu, c1e_mask)) { cpumask_set_cpu(cpu, c1e_mask); /* - * Force broadcast so ACPI can not interfere. Needs - * to run with interrupts enabled as it uses - * smp_function_call. + * Force broadcast so ACPI can not interfere. 
 			 */
-			local_irq_enable();
 			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu);
 			printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", cpu);
-			local_irq_disable();
 		}
 		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 4f9c55f..03801f2 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -60,7 +60,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
 		"adc %5,%%edx ; "
 		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
 		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif __x86_64__
+#elif defined(__x86_64__)
 	__asm__ (
 		"mul %%rdx ; shrd $32,%%rdx,%%rax"
 		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d2d1ce8..a06e8d1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -3,6 +3,7 @@
 #include <linux/init.h>
 #include <linux/pm.h>
 #include <linux/efi.h>
+#include <linux/dmi.h>
 #include <acpi/reboot.h>
 #include <asm/io.h>
 #include <asm/apic.h>
@@ -17,7 +18,6 @@
 #include <asm/cpu.h>

 #ifdef CONFIG_X86_32
-# include <linux/dmi.h>
 # include <linux/ctype.h>
 # include <linux/mc146818rtc.h>
 #else
@@ -249,6 +249,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
 		},
 	},
+	{	/* Handle problems with rebooting on CompuLab SBC-FITPC2 */
+		.callback = set_bios_reboot,
+		.ident = "CompuLab SBC-FITPC2",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
+		},
+	},
 	{ }
 };

@@ -396,6 +404,46 @@ EXPORT_SYMBOL(machine_real_restart);

 #endif /* CONFIG_X86_32 */

+/*
+ * Some Apple MacBook and MacBookPro models need reboot=p to be able to reboot
+ */
+static int __init set_pci_reboot(const struct dmi_system_id *d)
+{
+	if (reboot_type != BOOT_CF9) {
+		reboot_type = BOOT_CF9;
+		printk(KERN_INFO "%s series board detected. "
+		       "Selecting PCI-method for reboots.\n", d->ident);
+	}
+	return 0;
+}
+
+static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
+	{	/* Handle problems with rebooting on Apple MacBook5 */
+		.callback = set_pci_reboot,
+		.ident = "Apple MacBook5",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
+		},
+	},
+	{	/* Handle problems with rebooting on Apple MacBookPro5 */
+		.callback = set_pci_reboot,
+		.ident = "Apple MacBookPro5",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
+		},
+	},
+	{ }
+};
+
+static int __init pci_reboot_init(void)
+{
+	dmi_check_system(pci_reboot_dmi_table);
+	return 0;
+}
+core_initcall(pci_reboot_init);
+
 static inline void kb_wait(void)
 {
 	int i;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index be5ae80..63f32d2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -289,6 +289,20 @@ void * __init extend_brk(size_t size, size_t align)
 	return ret;
 }

+#ifdef CONFIG_X86_64
+static void __init init_gbpages(void)
+{
+	if (direct_gbpages && cpu_has_gbpages)
+		printk(KERN_INFO "Using GB pages for direct mapping\n");
+	else
+		direct_gbpages = 0;
+}
+#else
+static inline void init_gbpages(void)
+{
+}
+#endif
+
 static void __init reserve_brk(void)
 {
 	if (_brk_end > _brk_start)
@@ -658,6 +672,19 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
 			DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
 		},
 	},
+	{
+	/*
+	 * AMI BIOS with low memory corruption was found on Intel DG45ID board.
+	 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
+	 * match only DMI_BOARD_NAME and see if there are more bad products
+	 * with this vendor.
+	 */
+		.callback = dmi_low_memory_corruption,
+		.ident = "AMI BIOS",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
+		},
+	},
 #endif
 	{}
 };
@@ -871,6 +898,8 @@ void __init setup_arch(char **cmdline_p)

 	reserve_brk();

+	init_gbpages();
+
 	/* max_pfn_mapped is updated here */
 	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
 	max_pfn_mapped = max_low_pfn_mapped;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 9c3f082..07d8191 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -124,7 +124,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
 }

 /*
- * Remap allocator
+ * Large page remap allocator
  *
  * This allocator uses PMD page as unit. A PMD page is allocated for
  * each cpu and each is remapped into vmalloc area using PMD mapping.
@@ -137,105 +137,185 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
  * better than only using 4k mappings while still being NUMA friendly.
  */
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-static size_t pcpur_size __initdata;
-static void **pcpur_ptrs __initdata;
+struct pcpul_ent {
+	unsigned int cpu;
+	void *ptr;
+};
+
+static size_t pcpul_size;
+static struct pcpul_ent *pcpul_map;
+static struct vm_struct pcpul_vm;

-static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
+static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
 {
 	size_t off = (size_t)pageno << PAGE_SHIFT;

-	if (off >= pcpur_size)
+	if (off >= pcpul_size)
 		return NULL;

-	return virt_to_page(pcpur_ptrs[cpu] + off);
+	return virt_to_page(pcpul_map[cpu].ptr + off);
 }

-static ssize_t __init setup_pcpu_remap(size_t static_size)
+static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 {
-	static struct vm_struct vm;
-	size_t ptrs_size, dyn_size;
+	size_t map_size, dyn_size;
 	unsigned int cpu;
+	int i, j;
 	ssize_t ret;

-	/*
-	 * If large page isn't supported, there's no benefit in doing
-	 * this. Also, on non-NUMA, embedding is better.
-	 *
-	 * NOTE: disabled for now.
-	 */
-	if (true || !cpu_has_pse || !pcpu_need_numa())
+	if (!chosen) {
+		size_t vm_size = VMALLOC_END - VMALLOC_START;
+		size_t tot_size = nr_cpu_ids * PMD_SIZE;
+
+		/* on non-NUMA, embedding is better */
+		if (!pcpu_need_numa())
+			return -EINVAL;
+
+		/* don't consume more than 20% of vmalloc area */
+		if (tot_size > vm_size / 5) {
+			pr_info("PERCPU: too large chunk size %zuMB for "
+				"large page remap\n", tot_size >> 20);
+			return -EINVAL;
+		}
+	}
+
+	/* need PSE */
+	if (!cpu_has_pse) {
+		pr_warning("PERCPU: lpage allocator requires PSE\n");
 		return -EINVAL;
+	}

 	/*
 	 * Currently supports only single page. Supporting multiple
 	 * pages won't be too difficult if it ever becomes necessary.
*/ - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE); - if (pcpur_size > PMD_SIZE) { + if (pcpul_size > PMD_SIZE) { pr_warning("PERCPU: static data is larger than large page, " "can't use large page\n"); return -EINVAL; } - dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; + dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); + map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0])); + pcpul_map = alloc_bootmem(map_size); for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpur_ptrs[cpu]) + pcpul_map[cpu].cpu = cpu; + pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, + PMD_SIZE); + if (!pcpul_map[cpu].ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); goto enomem; + } /* - * Only use pcpur_size bytes and give back the rest. + * Only use pcpul_size bytes and give back the rest. * * Ingo: The 2MB up-rounding bootmem is needed to make * sure the partial 2MB page is still fully RAM - it's * not well-specified to have a PAT-incompatible area * (unmapped RAM, device memory, etc.) in that hole. */ - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PMD_SIZE - pcpur_size); + free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), + PMD_SIZE - pcpul_size); - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); } /* allocate address and map */ - vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&vm, PMD_SIZE); + pcpul_vm.flags = VM_ALLOC; + pcpul_vm.size = nr_cpu_ids * PMD_SIZE; + vm_area_register_early(&pcpul_vm, PMD_SIZE); for_each_possible_cpu(cpu) { - pmd_t *pmd; + pmd_t *pmd, pmd_v; - pmd = populate_extra_pmd((unsigned long)vm.addr - + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), - PAGE_KERNEL_LARGE)); + pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + + cpu * PMD_SIZE); + pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), + PAGE_KERNEL_LARGE); + set_pmd(pmd, pmd_v); } /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", vm.addr, static_size); + "%zu bytes\n", pcpul_vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, + ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, vm.addr, NULL); - goto out_free_ar; + PMD_SIZE, pcpul_vm.addr, NULL); + + /* sort pcpul_map array for pcpu_lpage_remapped() */ + for (i = 0; i < nr_cpu_ids - 1; i++) + for (j = i + 1; j < nr_cpu_ids; j++) + if (pcpul_map[i].ptr > pcpul_map[j].ptr) { + struct pcpul_ent tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + return ret; enomem: for_each_possible_cpu(cpu) - if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpur_ptrs), ptrs_size); - return ret; + if (pcpul_map[cpu].ptr) + free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); + free_bootmem(__pa(pcpul_map), map_size); + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the 
pcpul recycled area. This is + * used by pageattr to detect VM aliases and break up the pcpu PMD + * mapping such that the same physical page is not mapped under + * different attributes. + * + * The recycled area is always at the tail of a partially used PMD + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. + */ +void *pcpu_lpage_remapped(void *kaddr) +{ + void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); + unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; + int left = 0, right = nr_cpu_ids - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < pmd_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > pmd_addr) + right = pos - 1; + else { + /* it shouldn't be in the area for the first chunk */ + WARN_ON(offset < pcpul_size); + + return pcpul_vm.addr + + pcpul_map[pos].cpu * PMD_SIZE + offset; + } + } + + return NULL; } #else -static ssize_t __init setup_pcpu_remap(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { return -EINVAL; } @@ -249,7 +329,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * mapping so that it can use PMD mapping without additional TLB * pressure. */ -static ssize_t __init setup_pcpu_embed(size_t static_size) +static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -258,7 +338,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) * this. Also, embedding allocation doesn't play well with * NUMA. */ - if (!cpu_has_pse || pcpu_need_numa()) + if (!chosen && (!cpu_has_pse || pcpu_need_numa())) return -EINVAL; return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, @@ -297,7 +377,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pcpu4k_nr_static_pages = PFN_UP(static_size); /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids * sizeof(pcpu4k_pages[0])); pcpu4k_pages = alloc_bootmem(pages_size); @@ -308,8 +388,11 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) void *ptr; ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); goto enomem; + } memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); pcpu4k_pages[j++] = virt_to_page(ptr); @@ -333,6 +416,16 @@ out_free_ar: return ret; } +/* for explicit first chunk allocator selection */ +static char pcpu_chosen_alloc[16] __initdata; + +static int __init percpu_alloc_setup(char *str) +{ + strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -346,11 +439,6 @@ static inline void setup_percpu_segment(int cpu) #endif } -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. - * Always point %gs to its beginning - */ void __init setup_per_cpu_areas(void) { size_t static_size = __per_cpu_end - __per_cpu_start; @@ -367,9 +455,26 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. 
*/ - ret = setup_pcpu_remap(static_size); - if (ret < 0) - ret = setup_pcpu_embed(static_size); + ret = -EINVAL; + if (strlen(pcpu_chosen_alloc)) { + if (strcmp(pcpu_chosen_alloc, "4k")) { + if (!strcmp(pcpu_chosen_alloc, "lpage")) + ret = setup_pcpu_lpage(static_size, true); + else if (!strcmp(pcpu_chosen_alloc, "embed")) + ret = setup_pcpu_embed(static_size, true); + else + pr_warning("PERCPU: unknown allocator %s " + "specified\n", pcpu_chosen_alloc); + if (ret < 0) + pr_warning("PERCPU: %s allocator failed (%zd), " + "falling back to 4k\n", + pcpu_chosen_alloc, ret); + } + } else { + ret = setup_pcpu_lpage(static_size, false); + if (ret < 0) + ret = setup_pcpu_embed(static_size, false); + } if (ret < 0) ret = setup_pcpu_4k(static_size); if (ret < 0) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 124d40c..77b9689 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -711,7 +711,6 @@ uv_activation_descriptor_init(int node, int pnode) unsigned long pa; unsigned long m; unsigned long n; - unsigned long mmr_image; struct bau_desc *adp; struct bau_desc *ad2; @@ -727,12 +726,8 @@ uv_activation_descriptor_init(int node, int pnode) n = pa >> uv_nshift; m = pa & uv_mmask; - mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); - if (mmr_image) { - uv_write_global_mmr64(pnode, (unsigned long) - UVH_LB_BAU_SB_DESCRIPTOR_BASE, - (n << UV_DESC_BASE_PNODE_SHIFT | m)); - } + uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, + (n << UV_DESC_BASE_PNODE_SHIFT | m)); /* * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each @@ -749,6 +744,7 @@ uv_activation_descriptor_init(int node, int pnode) * note that base_dest_nodeid is actually a nasid. */ ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; + ad2->header.dest_subnodeid = 0x10; /* the LB */ ad2->header.command = UV_NET_ENDPOINT_INTD; ad2->header.int_both = 1; /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a0f48f5..5204332 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -346,6 +346,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); + if (panic_on_io_nmi) + panic("NMI IOCK error: Not continuing"); + /* Re-enable the IOCK line, wait for a few seconds */ reason = (reason & 0xf) | 8; outb(reason, 0x61); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6e1a368..71f4368 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -275,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) * use the TSC value at the transitions to calculate a pretty * good value for the TSC frequencty. */ +static inline int pit_verify_msb(unsigned char val) +{ + /* Ignore LSB */ + inb(0x42); + return inb(0x42) == val; +} + static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; u64 tsc = 0; for (count = 0; count < 50000; count++) { - /* Ignore LSB */ - inb(0x42); - if (inb(0x42) != val) + if (!pit_verify_msb(val)) break; tsc = get_cycles(); } @@ -336,8 +341,7 @@ static unsigned long quick_pit_calibrate(void) * to do that is to just read back the 16-bit counter * once from the PIT. 
*/ - inb(0x42); - inb(0x42); + pit_verify_msb(0); if (pit_expect_msb(0xff, &tsc, &d1)) { for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { @@ -348,8 +352,19 @@ static unsigned long quick_pit_calibrate(void) * Iterate until the error is less than 500 ppm */ delta -= tsc; - if (d1+d2 < delta >> 11) - goto success; + if (d1+d2 >= delta >> 11) + continue; + + /* + * Check the PIT one more time to verify that + * all TSC reads were stable wrt the PIT. + * + * This also guarantees serialization of the + * last cycle read ('d2') in pit_expect_msb. + */ + if (!pit_verify_msb(0xfe - i)) + break; + goto success; } } printk("Fast TSC calibration failed\n"); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b263423..95a7289 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, ap.ds = __USER_DS; ap.es = __USER_DS; ap.fs = __KERNEL_PERCPU; - ap.gs = 0; + ap.gs = __KERNEL_STACK_CANARY; ap.eflags = 0; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 367e878..9fc1782 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -46,11 +46,10 @@ PHDRS { data PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_X86_64 user PT_LOAD FLAGS(7); /* RWE */ - data.init PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_SMP percpu PT_LOAD FLAGS(7); /* RWE */ #endif - data.init2 PT_LOAD FLAGS(7); /* RWE */ + init PT_LOAD FLAGS(7); /* RWE */ #endif note PT_NOTE FLAGS(0); /* ___ */ } @@ -103,72 +102,43 @@ SECTIONS __stop___ex_table = .; } :text = 0x9090 - RODATA + RO_DATA(PAGE_SIZE) /* Data */ - . = ALIGN(PAGE_SIZE); .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Start of data section */ _sdata = .; - DATA_DATA - CONSTRUCTORS -#ifdef CONFIG_X86_64 - /* End of data section */ - _edata = .; -#endif - } :data + /* init_task */ + INIT_TASK_DATA(THREAD_SIZE) #ifdef CONFIG_X86_32 - /* 32 bit has nosave before _edata */ - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } + /* 32 bit has nosave before _edata */ + NOSAVE_DATA #endif - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) + PAGE_ALIGNED_DATA(PAGE_SIZE) *(.data.idt) - } -#ifdef CONFIG_X86_32 - . = ALIGN(32); -#else - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); -#endif - .data.cacheline_aligned : - AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } + CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) - /* rarely changed data like cpu maps */ -#ifdef CONFIG_X86_32 - . = ALIGN(32); -#else - . 
= ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); -#endif - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) + DATA_DATA + CONSTRUCTORS + + /* rarely changed data like cpu maps */ + READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) -#ifdef CONFIG_X86_32 /* End of data section */ _edata = .; -#endif - } + } :data #ifdef CONFIG_X86_64 #define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \ + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \ + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) @@ -234,35 +204,29 @@ SECTIONS #endif /* CONFIG_X86_64 */ - /* init_task */ - . = ALIGN(THREAD_SIZE); - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ } -#ifdef CONFIG_X86_64 - :data.init -#endif +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* - * smp_locks might be freed after init - * start/end must be page aligned + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - .init.text - should + * start another segment - init. */ - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - } + PERCPU_VADDR(0, :percpu) +#endif - /* Init code and data - will be freed after init */ - . = ALIGN(PAGE_SIZE); .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; /* paired with __init_end */ _sinittext = .; INIT_TEXT _einittext = .; } +#ifdef CONFIG_X86_64 + :init +#endif .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { INIT_DATA @@ -333,17 +297,7 @@ SECTIONS } #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) -#else +#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) PERCPU(PAGE_SIZE) #endif @@ -354,15 +308,22 @@ SECTIONS __init_end = .; } + /* + * smp_locks might be freed after init + * start/end must be page aligned + */ + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + } + #ifdef CONFIG_X86_64 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 - /* use another section data.init2, see PERCPU_VADDR() above */ + NOSAVE_DATA + } #endif /* BSS */ @@ -400,8 +361,8 @@ SECTIONS #ifdef CONFIG_X86_32 -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") +. 
= ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #else /* * Per-cpu symbols which need to be offset from __per_cpu_load @@ -414,12 +375,12 @@ INIT_PER_CPU(irq_stack_union); /* * Build-time check on the image size: */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") +. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +. = ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ @@ -427,7 +388,7 @@ ASSERT((per_cpu__irq_stack_union == 0), #ifdef CONFIG_KEXEC #include <asm/kexec.h> -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") +. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big"); #endif diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4d6f0d2..21f68e0 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -104,6 +104,9 @@ static s64 __kpit_elapsed(struct kvm *kvm) ktime_t remaining; struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + if (!ps->pit_timer.period) + return 0; + /* * The Counter does not stop when it reaches zero. In * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5c3d6e8..0ef5bb2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -489,16 +489,20 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) * * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc * containing more mappings. + * + * Returns the number of rmap entries before the spte was added or zero if + * the spte was not added. 
+ * */ -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) +static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) { struct kvm_mmu_page *sp; struct kvm_rmap_desc *desc; unsigned long *rmapp; - int i; + int i, count = 0; if (!is_rmap_pte(*spte)) - return; + return count; gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); sp->gfns[spte - sp->spt] = gfn; @@ -515,8 +519,10 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) + while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { desc = desc->more; + count += RMAP_EXT; + } if (desc->shadow_ptes[RMAP_EXT-1]) { desc->more = mmu_alloc_rmap_desc(vcpu); desc = desc->more; @@ -525,6 +531,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) ; desc->shadow_ptes[i] = spte; } + return count; } static void rmap_desc_remove_entry(unsigned long *rmapp, @@ -754,6 +761,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) return young; } +#define RMAP_RECYCLE_THRESHOLD 1000 + +static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) +{ + unsigned long *rmapp; + + gfn = unalias_gfn(vcpu->kvm, gfn); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); + + kvm_unmap_rmapp(vcpu->kvm, rmapp); + kvm_flush_remote_tlbs(vcpu->kvm); +} + int kvm_age_hva(struct kvm *kvm, unsigned long hva) { return kvm_handle_hva(kvm, hva, kvm_age_rmapp); @@ -1407,24 +1427,25 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) { + int used_pages; + + used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; + used_pages = max(0, used_pages); + /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we * change the value */ - if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > - kvm_nr_mmu_pages) { - int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages - - kvm->arch.n_free_mmu_pages; - - while (n_used_mmu_pages > kvm_nr_mmu_pages) { + if (used_pages > kvm_nr_mmu_pages) { + while (used_pages > kvm_nr_mmu_pages) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); kvm_mmu_zap_page(kvm, page); - n_used_mmu_pages--; + used_pages--; } kvm->arch.n_free_mmu_pages = 0; } @@ -1740,6 +1761,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, { int was_rmapped = 0; int was_writeble = is_writeble_pte(*shadow_pte); + int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" " user_fault %d gfn %lx\n", @@ -1781,9 +1803,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, page_header_update_slot(vcpu->kvm, shadow_pte, gfn); if (!was_rmapped) { - rmap_add(vcpu, shadow_pte, gfn, largepage); + rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); if (!is_rmap_pte(*shadow_pte)) kvm_release_pfn_clean(pfn); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) + rmap_recycle(vcpu, gfn, largepage); } else { if (was_writeble) kvm_release_pfn_dirty(pfn); @@ -2157,7 +2181,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) else /* 32 bits PSE 4MB page */ context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); - context->rsvd_bits_mask[1][0] = ~0ull; + context->rsvd_bits_mask[1][0] = 
context->rsvd_bits_mask[0][0];
 		break;
 	case PT32E_ROOT_LEVEL:
 		context->rsvd_bits_mask[0][2] =
@@ -2170,7 +2194,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 62) |
 			rsvd_bits(13, 20);		/* large page */
-		context->rsvd_bits_mask[1][0] = ~0ull;
+		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
 		break;
 	case PT64_ROOT_LEVEL:
 		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2186,7 +2210,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 51) |
 			rsvd_bits(13, 20);		/* large page */
-		context->rsvd_bits_mask[1][0] = ~0ull;
+		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
 		break;
 	}
 }
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 258e459..67785f6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -281,7 +281,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 {
 	unsigned access = gw->pt_access;
 	struct kvm_mmu_page *shadow_page;
-	u64 spte, *sptep;
+	u64 spte, *sptep = NULL;
 	int direct;
 	gfn_t table_gfn;
 	int r;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 71510e0..b1f658a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -711,6 +711,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 			svm->vmcb->control.tsc_offset += delta;
 		vcpu->cpu = cpu;
 		kvm_migrate_timers(vcpu);
+		svm->asid_generation = 0;
 	}

 	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
@@ -1031,7 +1032,6 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
 		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
 	}

-	svm->vcpu.cpu = svm_data->cpu;
 	svm->asid_generation = svm_data->asid_generation;
 	svm->vmcb->control.asid = svm_data->next_asid++;
 }
@@ -2300,8 +2300,8 @@ static void pre_svm_run(struct vcpu_svm *svm)
 	struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);

 	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
-	if (svm->vcpu.cpu != cpu ||
-	    svm->asid_generation != svm_data->asid_generation)
+	/* FIXME: handle wraparound of asid_generation */
+	if (svm->asid_generation != svm_data->asid_generation)
 		new_asid(svm, svm_data);
 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e770bf3..29f9129 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3012,6 +3012,12 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 1;
 }

+static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	kvm_queue_exception(vcpu, UD_VECTOR);
+	return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3151,8 +3157,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	enum emulation_result err = EMULATE_DONE;

-	preempt_enable();
 	local_irq_enable();
+	preempt_enable();

 	while (!guest_state_valid(vcpu)) {
 		err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3162,7 +3168,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,

 		if (err != EMULATE_DONE) {
 			kvm_report_emulation_failure(vcpu, "emulation failure");
-			return;
+			break;
 		}

 		if (signal_pending(current))
@@ -3171,8 +3177,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
 		schedule();
 	}

-	local_irq_disable();
 	preempt_disable();
+	local_irq_disable();

 	vmx->invalid_state_emulation_result = err;
 }
@@ -3198,6 +3204,15 @@ static int
(*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_HLT] = handle_halt, [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_VMCALL] = handle_vmcall, + [EXIT_REASON_VMCLEAR] = handle_vmx_insn, + [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, + [EXIT_REASON_VMPTRLD] = handle_vmx_insn, + [EXIT_REASON_VMPTRST] = handle_vmx_insn, + [EXIT_REASON_VMREAD] = handle_vmx_insn, + [EXIT_REASON_VMRESUME] = handle_vmx_insn, + [EXIT_REASON_VMWRITE] = handle_vmx_insn, + [EXIT_REASON_VMOFF] = handle_vmx_insn, + [EXIT_REASON_VMON] = handle_vmx_insn, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 249540f..3d45290 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -704,11 +704,48 @@ static bool msr_mtrr_valid(unsigned msr) return false; } +static bool valid_pat_type(unsigned t) +{ + return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ +} + +static bool valid_mtrr_type(unsigned t) +{ + return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ +} + +static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int i; + + if (!msr_mtrr_valid(msr)) + return false; + + if (msr == MSR_IA32_CR_PAT) { + for (i = 0; i < 8; i++) + if (!valid_pat_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } else if (msr == MSR_MTRRdefType) { + if (data & ~0xcff) + return false; + return valid_mtrr_type(data & 0xff); + } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { + for (i = 0; i < 8 ; i++) + if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } + + /* variable MTRRs */ + return valid_mtrr_type(data & 0xff); +} + static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - if (!msr_mtrr_valid(msr)) + if (!mtrr_valid(vcpu, msr, data)) return 1; if (msr == MSR_MTRRdefType) { @@ -898,6 +935,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_VM_HSAVE_PA: case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: data = 0; break; case MSR_MTRRcap: @@ -1078,14 +1116,13 @@ long kvm_arch_dev_ioctl(struct file *filp, if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) goto out; r = -E2BIG; - if (n < num_msrs_to_save) + if (n < msr_list.nmsrs) goto out; r = -EFAULT; if (copy_to_user(user_msr_list->indices, &msrs_to_save, num_msrs_to_save * sizeof(u32))) goto out; - if (copy_to_user(user_msr_list->indices - + num_msrs_to_save * sizeof(u32), + if (copy_to_user(user_msr_list->indices + num_msrs_to_save, &emulated_msrs, ARRAY_SIZE(emulated_msrs) * sizeof(u32))) goto out; diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index c1b6c23..616de46 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1361,7 +1361,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, return 0; } -void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) +static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) { u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); /* diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 7bc65f0..d677fa9 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -22,7 +22,8 @@ * * So how does the kernel know it's a Guest? 
We'll see that later, but let's * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. :*/ + * "paravirt" structures with our Guest versions, then boot like normal. +:*/ /* * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. @@ -74,7 +75,8 @@ * * The Guest in our tale is a simple creature: identical to the Host but * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). :*/ + * same kernel as the Host (or at least, built from the same source code). +:*/ struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, @@ -85,7 +87,8 @@ struct lguest_data lguest_data = { .syscall_vec = SYSCALL_VECTOR, }; -/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a +/*G:037 + * async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run though next time we * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall * arguments, and a "hcall_status" word which is 0 if the call is ready to go, @@ -94,7 +97,8 @@ struct lguest_data lguest_data = { * If we come around to a slot which hasn't been finished, then the table is * full and we just make the hypercall directly. This has the nice side * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! */ + * which empties it for next time! + */ static void async_hcall(unsigned long call, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4) @@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1, static unsigned int next_call; unsigned long flags; - /* Disable interrupts if not already disabled: we don't want an + /* + * Disable interrupts if not already disabled: we don't want an * interrupt handler making a hypercall while we're already doing - * one! */ + * one! + */ local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ @@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1, local_irq_restore(flags); } -/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first - * real optimization trick! +/*G:035 + * Notice the lazy_hcall() above, rather than hcall(). This is our first real + * optimization trick! * * When lazy_mode is set, it means we're allowed to defer all hypercalls and do * them as a batch when lazy_mode is eventually turned off. Because hypercalls @@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1, * lguest_leave_lazy_mode(). * * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: */ + * future processing: + */ static void lazy_hcall1(unsigned long call, unsigned long arg1) { @@ -146,6 +154,7 @@ static void lazy_hcall1(unsigned long call, async_hcall(call, arg1, 0, 0, 0); } +/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ static void lazy_hcall2(unsigned long call, unsigned long arg1, unsigned long arg2) @@ -181,8 +190,10 @@ static void lazy_hcall4(unsigned long call, } #endif -/* When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue the do-nothing hypercall to flush any stored calls. 
*/ +/*G:036 + * When lazy mode is turned off reset the per-cpu lazy mode variable and then + * issue the do-nothing hypercall to flush any stored calls. +:*/ static void lguest_leave_lazy_mmu_mode(void) { kvm_hypercall0(LHCALL_FLUSH_ASYNC); @@ -208,9 +219,11 @@ static void lguest_end_context_switch(struct task_struct *next) * check there before it tries to deliver an interrupt. */ -/* save_flags() is expected to return the processor state (ie. "flags"). The +/* + * save_flags() is expected to return the processor state (ie. "flags"). The * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. */ + * about the interrupt flag. Our "save_flags()" just returns that. + */ static unsigned long save_fl(void) { return lguest_data.irq_enabled; @@ -222,13 +235,15 @@ static void irq_disable(void) lguest_data.irq_enabled = 0; } -/* Let's pause a moment. Remember how I said these are called so often? +/* + * Let's pause a moment. Remember how I said these are called so often? * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to * break some rules. In particular, these functions are assumed to save their * own registers if they need to: normal C functions assume they can trash the * eax register. To use normal C functions, we use * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. */ + * C function, then restores it. + */ PV_CALLEE_SAVE_REGS_THUNK(save_fl); PV_CALLEE_SAVE_REGS_THUNK(irq_disable); /*:*/ @@ -237,18 +252,18 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable); extern void lg_irq_enable(void); extern void lg_restore_fl(unsigned long flags); -/*M:003 Note that we don't check for outstanding interrupts when we re-enable - * them (or when we unmask an interrupt). This seems to work for the moment, - * since interrupts are rare and we'll just get the interrupt on the next timer - * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way - * would be to put the "irq_enabled" field in a page by itself, and have the - * Host write-protect it when an interrupt comes in when irqs are disabled. - * There will then be a page fault as soon as interrupts are re-enabled. +/*M:003 + * We could be more efficient in our checking of outstanding interrupts, rather + * than using a branch. One way would be to put the "irq_enabled" field in a + * page by itself, and have the Host write-protect it when an interrupt comes + * in when irqs are disabled. There will then be a page fault as soon as + * interrupts are re-enabled. * * A better method is to implement soft interrupt disable generally for x86: * instead of disabling interrupts, we set a flag. If an interrupt does come * in, we then disable them for real. This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. :*/ + * a hypercall for interrupt control and not worry about efficiency. +:*/ /*G:034 * The Interrupt Descriptor Table (IDT). @@ -261,10 +276,12 @@ extern void lg_restore_fl(unsigned long flags); static void lguest_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) { - /* The gate_desc structure is 8 bytes long: we hand it to the Host in + /* + * The gate_desc structure is 8 bytes long: we hand it to the Host in * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors * around like this; typesafety wasn't a big concern in Linux's early - * years. */ + * years. 
+ */ u32 *desc = (u32 *)g; /* Keep the local copy up to date. */ native_write_idt_entry(dt, entrynum, g); @@ -272,9 +289,11 @@ static void lguest_write_idt_entry(gate_desc *dt, kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); } -/* Changing to a different IDT is very rare: we keep the IDT up-to-date every +/* + * Changing to a different IDT is very rare: we keep the IDT up-to-date every * time it is written, so we can simply loop through all entries and tell the - * Host about them. */ + * Host about them. + */ static void lguest_load_idt(const struct desc_ptr *desc) { unsigned int i; @@ -305,9 +324,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc) kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); } -/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, +/* + * For a single GDT entry which changes, we do the lazy thing: alter our GDT, * then tell the Host to reload the entire thing. This operation is so rare - * that this naive implementation is reasonable. */ + * that this naive implementation is reasonable. + */ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, const void *desc, int type) { @@ -317,29 +338,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, dt[entrynum].a, dt[entrynum].b); } -/* OK, I lied. There are three "thread local storage" GDT entries which change +/* + * OK, I lied. There are three "thread local storage" GDT entries which change * on every context switch (these three entries are how glibc implements - * __thread variables). So we have a hypercall specifically for this case. */ + * __thread variables). So we have a hypercall specifically for this case. + */ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) { - /* There's one problem which normal hardware doesn't have: the Host + /* + * There's one problem which normal hardware doesn't have: the Host * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. */ + * the GS register here: if it's needed it'll be reloaded anyway. + */ lazy_load_gs(0); lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); } -/*G:038 That's enough excitement for now, back to ploughing through each of - * the different pv_ops structures (we're about 1/3 of the way through). +/*G:038 + * That's enough excitement for now, back to ploughing through each of the + * different pv_ops structures (we're about 1/3 of the way through). * * This is the Local Descriptor Table, another weird Intel thingy. Linux only * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. */ + * here, so they'll get an informative and friendly Segmentation Fault. + */ static void lguest_set_ldt(const void *addr, unsigned entries) { } -/* This loads a GDT entry into the "Task Register": that entry points to a +/* + * This loads a GDT entry into the "Task Register": that entry points to a * structure called the Task State Segment. Some comments scattered though the * kernel code indicate that this used for task switching in ages past, along * with blood sacrifice and astrology. @@ -347,19 +375,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries) * Now there's nothing interesting in here that we don't get told elsewhere. 
* But the native version uses the "ltr" instruction, which makes the Host * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. */ + * override the native version with a do-nothing version. + */ static void lguest_load_tr_desc(void) { } -/* The "cpuid" instruction is a way of querying both the CPU identity +/* + * The "cpuid" instruction is a way of querying both the CPU identity * (manufacturer, model, etc) and its features. It was introduced before the * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. * As you might imagine, after a decade and a half this treatment, it is now a * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 4 languages. I am not making this up! + * has been translated into 5 languages. I am not making this up! * * We could get funky here and identify ourselves as "GenuineLguest", but * instead we just use the real "cpuid" instruction. Then I pretty much turned @@ -371,7 +401,8 @@ static void lguest_load_tr_desc(void) * Replacing the cpuid so we can turn features off is great for the kernel, but * anyone (including userspace) can just use the raw "cpuid" instruction and * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. */ + * too worked up about it. + */ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { @@ -379,38 +410,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, native_cpuid(ax, bx, cx, dx); switch (function) { - case 1: /* Basic feature request. */ - /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ + /* + * CPUID 0 gives the highest legal CPUID number (and the ID string). + * We futureproof our code a little by sticking to known CPUID values. + */ + case 0: + if (*ax > 5) + *ax = 5; + break; + + /* + * CPUID 1 is a basic feature request. + * + * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 + * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. + */ + case 1: *cx &= 0x00002201; - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ *dx &= 0x07808151; - /* The Host can do a nice optimization if it knows that the + /* + * The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. */ + * the Page Global Enable (PGE) feature bit is set. + */ *dx |= 0x00002000; - /* We also lie, and say we're family id 5. 6 or greater + /* + * We also lie, and say we're family id 5. 6 or greater * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. */ + * Family ID is returned as bits 8-12 in ax. + */ *ax &= 0xFFFFF0FF; *ax |= 0x00000500; break; + /* + * 0x80000000 returns the highest Extended Function, so we futureproof + * like we do above by limiting it to known fields. + */ case 0x80000000: - /* Futureproof this a little: if they ask how much extended - * processor information there is, limit it to known fields. */ if (*ax > 0x80000008) *ax = 0x80000008; break; + + /* + * PAE systems can mark pages as non-executable. Linux calls this the + * NX bit. 
Intel calls it XD (eXecute Disable), AMD EVP (Enhanced + * Virus Protection). We just switch turn if off here, since we don't + * support it. + */ case 0x80000001: - /* Here we should fix nx cap depending on host. */ - /* For this version of PAE, we just clear NX bit. */ *dx &= ~(1 << 20); break; } } -/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. +/* + * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother * it. The Host needs to know when the Guest wants to change them, so we have * a whole series of functions like read_cr0() and write_cr0(). @@ -425,7 +481,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, * name like "FPUTRAP bit" be a little less cryptic? * * We store cr0 locally because the Host never changes it. The Guest sometimes - * wants to read it and we'd prefer not to bother the Host unnecessarily. */ + * wants to read it and we'd prefer not to bother the Host unnecessarily. + */ static unsigned long current_cr0; static void lguest_write_cr0(unsigned long val) { @@ -438,18 +495,22 @@ static unsigned long lguest_read_cr0(void) return current_cr0; } -/* Intel provided a special instruction to clear the TS bit for people too cool +/* + * Intel provided a special instruction to clear the TS bit for people too cool * to use write_cr0() to do it. This "clts" instruction is faster, because all - * the vowels have been optimized out. */ + * the vowels have been optimized out. + */ static void lguest_clts(void) { lazy_hcall1(LHCALL_TS, 0); current_cr0 &= ~X86_CR0_TS; } -/* cr2 is the virtual address of the last page fault, which the Guest only ever +/* + * cr2 is the virtual address of the last page fault, which the Guest only ever * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. */ + * just read it out of there. + */ static unsigned long lguest_read_cr2(void) { return lguest_data.cr2; @@ -458,10 +519,12 @@ static unsigned long lguest_read_cr2(void) /* See lguest_set_pte() below. */ static bool cr3_changed = false; -/* cr3 is the current toplevel pagetable page: the principle is the same as +/* + * cr3 is the current toplevel pagetable page: the principle is the same as * cr0. Keep a local copy, and tell the Host when it changes. The only * difference is that our local copy is in lguest_data because the Host needs - * to set it upon our initial hypercall. */ + * to set it upon our initial hypercall. + */ static void lguest_write_cr3(unsigned long cr3) { lguest_data.pgdir = cr3; @@ -506,7 +569,7 @@ static void lguest_write_cr4(unsigned long val) * cr3 ---> +---------+ * | --------->+---------+ * | | | PADDR1 | - * Top-level | | PADDR2 | + * Mid-level | | PADDR2 | * (PMD) page | | | * | | Lower-level | * | | (PTE) page | @@ -526,21 +589,62 @@ static void lguest_write_cr4(unsigned long val) * Index into top Index into second Offset within page * page directory page pagetable page * - * The kernel spends a lot of time changing both the top-level page directory - * and lower-level pagetable pages. The Guest doesn't know physical addresses, - * so while it maintains these page tables exactly like normal, it also needs - * to keep the Host informed whenever it makes a change: the Host will create - * the real page tables based on the Guests'. 
+ * Now, unfortunately, this isn't the whole story: Intel added Physical Address + * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). + * These are held in 64-bit page table entries, so we can now only fit 512 + * entries in a page, and the neat three-level tree breaks down. + * + * The result is a four level page table: + * + * cr3 --> [ 4 Upper ] + * [ Level ] + * [ Entries ] + * [(PUD Page)]---> +---------+ + * | --------->+---------+ + * | | | PADDR1 | + * Mid-level | | PADDR2 | + * (PMD) page | | | + * | | Lower-level | + * | | (PTE) page | + * | | | | + * .... .... + * + * + * And the virtual address is decoded as: + * + * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| + * Index into Index into mid Index into lower Offset within page + * top entries directory page pagetable page + * + * It's too hard to switch between these two formats at runtime, so Linux only + * supports one or the other depending on whether CONFIG_X86_PAE is set. Many + * distributions turn it on, and not just for people with silly amounts of + * memory: the larger PTE entries allow room for the NX bit, which lets the + * kernel disable execution of pages and increase security. + * + * This was a problem for lguest, which couldn't run on these distributions; + * then Matias Zabaljauregui figured it all out and implemented it, and only a + * handful of puppies were crushed in the process! + * + * Back to our point: the kernel spends a lot of time changing both the + * top-level page directory and lower-level pagetable pages. The Guest doesn't + * know physical addresses, so while it maintains these page tables exactly + * like normal, it also needs to keep the Host informed whenever it makes a + * change: the Host will create the real page tables based on the Guests'. */ -/* The Guest calls this to set a second-level entry (pte), ie. to map a page - * into a process' address space. We set the entry then tell the Host the - * toplevel and address this corresponds to. The Guest uses one pagetable per - * process, so we need to tell the Host which one we're changing (mm->pgd). */ +/* + * The Guest calls this after it has set a second-level entry (pte), ie. to map + * a page into a process' address space. Wetell the Host the toplevel and + * address this corresponds to. The Guest uses one pagetable per process, so + * we need to tell the Host which one we're changing (mm->pgd). + */ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { #ifdef CONFIG_X86_PAE + /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low, ptep->pte_high); #else @@ -548,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, #endif } +/* This is the "set and update" combo-meal-deal version. */ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { @@ -555,10 +660,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, lguest_pte_update(mm, addr, ptep); } -/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd +/* + * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd * to set a middle-level entry when PAE is activated. + * * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. */ + * and the index of the entry we changed. 
+ */ #ifdef CONFIG_X86_PAE static void lguest_set_pud(pud_t *pudp, pud_t pudval) { @@ -577,8 +685,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) } #else -/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not - * activated. */ +/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { native_set_pmd(pmdp, pmdval); @@ -587,7 +694,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) } #endif -/* There are a couple of legacy places where the kernel sets a PTE, but we +/* + * There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more. This is useless for us, since we don't * know which pagetable is changing or what address, so we just tell the Host * to forget all of them. Fortunately, this is very rare. @@ -595,7 +703,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) * ... except in early boot when the kernel sets up the initial pagetables, * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell * the Host anything changed until we've done the first page table switch, - * which brings boot back to 0.25 seconds. */ + * which brings boot back to 0.25 seconds. + */ static void lguest_set_pte(pte_t *ptep, pte_t pteval) { native_set_pte(ptep, pteval); @@ -604,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) } #ifdef CONFIG_X86_PAE +/* + * With 64-bit PTE values, we need to be careful setting them: if we set 32 + * bits at a time, the hardware could see a weird half-set entry. These + * versions ensure we update all 64 bits at once. + */ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) { native_set_pte_atomic(ptep, pte); @@ -611,19 +725,21 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) lazy_hcall1(LHCALL_FLUSH_TLB, 1); } -void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { native_pte_clear(mm, addr, ptep); lguest_pte_update(mm, addr, ptep); } -void lguest_pmd_clear(pmd_t *pmdp) +static void lguest_pmd_clear(pmd_t *pmdp) { lguest_set_pmd(pmdp, __pmd(0)); } #endif -/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on +/* + * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations. On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). @@ -632,24 +748,29 @@ void lguest_pmd_clear(pmd_t *pmdp) * called when a valid entry is written, not when it's removed (ie. marked not * present). Instead, this is where we come when the Guest wants to remove a * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). */ + * bit is zero). + */ static void lguest_flush_tlb_single(unsigned long addr) { /* Simply set it to zero: if it was not, it will fault back in. */ lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); } -/* This is what happens after the Guest has removed a large number of entries. +/* + * This is what happens after the Guest has removed a large number of entries. * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. */ + * have changed, ie. virtual addresses below PAGE_OFFSET. 
+ */ static void lguest_flush_tlb_user(void) { lazy_hcall1(LHCALL_FLUSH_TLB, 0); } -/* This is called when the kernel page tables have changed. That's not very +/* + * This is called when the kernel page tables have changed. That's not very * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. */ + * slow), so it's worth separating this from the user flushing above. + */ static void lguest_flush_tlb_kernel(void) { lazy_hcall1(LHCALL_FLUSH_TLB, 1); @@ -686,26 +807,38 @@ static struct irq_chip lguest_irq_controller = { .unmask = enable_lguest_irq, }; -/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware +/* + * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware * interrupt (except 128, which is used for system calls), and then tells the * Linux infrastructure that each interrupt is controlled by our level-based - * lguest interrupt controller. */ + * lguest interrupt controller. + */ static void __init lguest_init_IRQ(void) { unsigned int i; for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* Some systems map "vectors" to interrupts weirdly. Lguest has - * a straightforward 1 to 1 mapping, so force that here. */ + /* Some systems map "vectors" to interrupts weirdly. Not us! */ __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; if (i != SYSCALL_VECTOR) set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } - /* This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. */ + + /* + * This call is required to set up for 4k stacks, where we have + * separate stacks for hard and soft interrupts. + */ irq_ctx_init(smp_processor_id()); } +/* + * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so + * rather than set them in lguest_init_IRQ we are called here every time an + * lguest device needs an interrupt. + * + * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should + * pass that up! + */ void lguest_setup_irq(unsigned int irq) { irq_to_desc_alloc_node(irq, 0); @@ -724,31 +857,39 @@ static unsigned long lguest_get_wallclock(void) return lguest_data.time.tv_sec; } -/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us +/* + * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us * what speed it runs at, or 0 if it's unusable as a reliable clock source. * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. */ + * TSC clock will give up and not register itself. + */ static unsigned long lguest_tsc_khz(void) { return lguest_data.tsc_khz; } -/* If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. */ +/* + * If we can't use the TSC, the kernel falls back to our lower-priority + * "lguest_clock", where we read the time value given to us by the Host. + */ static cycle_t lguest_clock_read(struct clocksource *cs) { unsigned long sec, nsec; - /* Since the time is in two parts (seconds and nanoseconds), we risk + /* + * Since the time is in two parts (seconds and nanoseconds), we risk * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: */ + * of time travel, we must be careful: + */ do { /* First we read the seconds part. 
*/ sec = lguest_data.time.tv_sec; - /* This read memory barrier tells the compiler and the CPU that + /* + * This read memory barrier tells the compiler and the CPU that * this can't be reordered: we have to complete the above - * before going on. */ + * before going on. + */ rmb(); /* Now we read the nanoseconds part. */ nsec = lguest_data.time.tv_nsec; @@ -772,9 +913,11 @@ static struct clocksource lguest_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -/* We also need a "struct clock_event_device": Linux asks us to set it to go +/* + * We also need a "struct clock_event_device": Linux asks us to set it to go * off some time in the future. Actually, James Morris figured all this out, I - * just applied the patch. */ + * just applied the patch. + */ static int lguest_clockevent_set_next_event(unsigned long delta, struct clock_event_device *evt) { @@ -824,8 +967,10 @@ static struct clock_event_device lguest_clockevent = { .max_delta_ns = LG_CLOCK_MAX_DELTA, }; -/* This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. */ +/* + * This is the Guest timer interrupt handler (hardware interrupt 0). We just + * call the clockevent infrastructure and it does whatever needs doing. + */ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) { unsigned long flags; @@ -836,10 +981,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) local_irq_restore(flags); } -/* At some point in the boot process, we get asked to set up our timing +/* + * At some point in the boot process, we get asked to set up our timing * infrastructure. The kernel doesn't expect timer interrupts before this, but * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. */ + * lguest_data" so that timer interrupts were blocked until now. + */ static void lguest_time_init(void) { /* Set up the timer interrupt (0) to go to our simple timer routine */ @@ -863,14 +1010,16 @@ static void lguest_time_init(void) * to work. They're pretty simple. */ -/* The Guest needs to tell the Host what stack it expects traps to use. For +/* + * The Guest needs to tell the Host what stack it expects traps to use. For * native hardware, this is part of the Task State Segment mentioned above in * lguest_load_tr_desc(), but to help hypervisors there's this special call. * * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data * segment), the privilege level (we're privilege level 1, the Host is 0 and * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. */ + * of pages in the stack. + */ static void lguest_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -884,7 +1033,8 @@ static void lguest_set_debugreg(int regno, unsigned long value) /* FIXME: Implement */ } -/* There are times when the kernel wants to make sure that no memory writes are +/* + * There are times when the kernel wants to make sure that no memory writes are * caught in the cache (that they've all reached real hardware devices). This * doesn't matter for the Guest which has virtual hardware. * @@ -898,11 +1048,13 @@ static void lguest_wbinvd(void) { } -/* If the Guest expects to have an Advanced Programmable Interrupt Controller, +/* + * If the Guest expects to have an Advanced Programmable Interrupt Controller, * we play dumb by ignoring writes and returning 0 for reads. 
So it's no * longer Programmable nor Controlling anything, and I don't think 8 lines of * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. */ + * does, however, allow us to get through the Linux boot code. + */ #ifdef CONFIG_X86_LOCAL_APIC static void lguest_apic_write(u32 reg, u32 v) { @@ -951,11 +1103,13 @@ static void lguest_safe_halt(void) kvm_hypercall0(LHCALL_HALT); } -/* The SHUTDOWN hypercall takes a string to describe what's happening, and +/* + * The SHUTDOWN hypercall takes a string to describe what's happening, and * an argument which says whether this to restart (reboot) the Guest or not. * * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. */ + * rather than virtual addresses, so we use __pa() here. + */ static void lguest_power_off(void) { kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), @@ -986,8 +1140,10 @@ static __init char *lguest_memory_setup(void) * nice to move it back to lguest_init. Patch welcome... */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); - /* The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. */ + /* + *The Linux bootloader header contains an "e820" memory map: the + * Launcher populated the first entry with our memory limit. + */ e820_add_region(boot_params.e820_map[0].addr, boot_params.e820_map[0].size, boot_params.e820_map[0].type); @@ -996,16 +1152,17 @@ static __init char *lguest_memory_setup(void) return "LGUEST"; } -/* We will eventually use the virtio console device to produce console output, +/* + * We will eventually use the virtio console device to produce console output, * but before that is set up we use LHCALL_NOTIFY on normal memory to produce - * console output. */ + * console output. + */ static __init int early_put_chars(u32 vtermno, const char *buf, int count) { char scratch[17]; unsigned int len = count; - /* We use a nul-terminated string, so we have to make a copy. Icky, - * huh? */ + /* We use a nul-terminated string, so we make a copy. Icky, huh? */ if (len > sizeof(scratch) - 1) len = sizeof(scratch) - 1; scratch[len] = '\0'; @@ -1016,8 +1173,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) return len; } -/* Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. */ +/* + * Rebooting also tells the Host we're finished, but the RESTART flag tells the + * Launcher to reboot us. + */ static void lguest_restart(char *reason) { kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); @@ -1044,7 +1203,8 @@ static void lguest_restart(char *reason) * fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, - * and these are in i386_head.S. */ + * and these are in i386_head.S. + */ /*G:060 We construct a table from the assembler templates: */ static const struct lguest_insns @@ -1055,9 +1215,11 @@ static const struct lguest_insns [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, }; -/* Now our patch routine is fairly simple (based on the native one in +/* + * Now our patch routine is fairly simple (based on the native one in * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. */ + * the available space we used. 
+ */ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, unsigned long addr, unsigned len) { @@ -1069,8 +1231,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, insn_len = lguest_insns[type].end - lguest_insns[type].start; - /* Similarly if we can't fit replacement (shouldn't happen, but let's - * be thorough). */ + /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ if (len < insn_len) return paravirt_patch_default(type, clobber, ibuf, addr, len); @@ -1079,22 +1240,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -/*G:030 Once we get to lguest_init(), we know we're a Guest. The various +/*G:029 + * Once we get to lguest_init(), we know we're a Guest. The various * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. */ + * have to override to avoid privileged instructions. + */ __init void lguest_init(void) { - /* We're under lguest, paravirt is enabled, and we're running at - * privilege level 1, not 0 as normal. */ + /* We're under lguest. */ pv_info.name = "lguest"; + /* Paravirt is enabled. */ pv_info.paravirt_enabled = 1; + /* We're running at privilege level 1, not 0 as normal. */ pv_info.kernel_rpl = 1; + /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; - /* We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves. */ + /* + * We set up all the lguest overrides for sensitive operations. These + * are detailed with the operations themselves. + */ - /* interrupt-related operations */ + /* Interrupt-related operations */ pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); @@ -1102,11 +1269,11 @@ __init void lguest_init(void) pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); pv_irq_ops.safe_halt = lguest_safe_halt; - /* init-time operations */ + /* Setup operations */ pv_init_ops.memory_setup = lguest_memory_setup; pv_init_ops.patch = lguest_patch; - /* Intercepts of various cpu instructions */ + /* Intercepts of various CPU instructions */ pv_cpu_ops.load_gdt = lguest_load_gdt; pv_cpu_ops.cpuid = lguest_cpuid; pv_cpu_ops.load_idt = lguest_load_idt; @@ -1127,7 +1294,7 @@ __init void lguest_init(void) pv_cpu_ops.start_context_switch = paravirt_start_context_switch; pv_cpu_ops.end_context_switch = lguest_end_context_switch; - /* pagetable management */ + /* Pagetable management */ pv_mmu_ops.write_cr3 = lguest_write_cr3; pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; @@ -1149,54 +1316,71 @@ __init void lguest_init(void) pv_mmu_ops.pte_update_defer = lguest_pte_update; #ifdef CONFIG_X86_LOCAL_APIC - /* apic read/write intercepts */ + /* APIC read/write intercepts */ set_lguest_basic_apic_ops(); #endif - /* time operations */ + /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; pv_time_ops.time_init = lguest_time_init; pv_time_ops.get_tsc_khz = lguest_tsc_khz; - /* Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). */ + /* + * Now is a good time to look at the implementations of these functions + * before returning to the rest of lguest_init(). 
+ */ - /*G:070 Now we've seen all the paravirt_ops, we return to + /*G:070 + * Now we've seen all the paravirt_ops, we return to * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. */ + * occurs. + */ - /* The stack protector is a weird thing where gcc places a canary + /* + * The stack protector is a weird thing where gcc places a canary * value on the stack and then checks it on return. This file is * compiled with -fno-stack-protector it, so we got this far without * problems. The value of the canary is kept at offset 20 from the * %gs register, so we need to set that up before calling C functions - * in other files. */ + * in other files. + */ setup_stack_canary_segment(0); - /* We could just call load_stack_canary_segment(), but we might as - * call switch_to_new_gdt() which loads the whole table and sets up - * the per-cpu segment descriptor register %fs as well. */ + + /* + * We could just call load_stack_canary_segment(), but we might as well + * call switch_to_new_gdt() which loads the whole table and sets up the + * per-cpu segment descriptor register %fs as well. + */ switch_to_new_gdt(0); - /* As described in head_32.S, we map the first 128M of memory. */ + /* We actually boot with all memory mapped, but let's say 128MB. */ max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; - /* The Host<->Guest Switcher lives at the top of our address space, and + /* + * The Host<->Guest Switcher lives at the top of our address space, and * the Host told us how big it is when we made LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem */ + * it put the answer in lguest_data.reserve_mem + */ reserve_top_address(lguest_data.reserve_mem); - /* If we don't initialize the lock dependency checker now, it crashes - * paravirt_disable_iospace. */ + /* + * If we don't initialize the lock dependency checker now, it crashes + * paravirt_disable_iospace. + */ lockdep_init(); - /* The IDE code spends about 3 seconds probing for disks: if we reserve + /* + * The IDE code spends about 3 seconds probing for disks: if we reserve * all the I/O ports up front it can't get them and so doesn't probe. * Other device drivers are similar (but less severe). This cuts the - * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ + * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. + */ paravirt_disable_iospace(); - /* This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do, too: */ + /* + * This is messy CPU setup stuff which the native boot code does before + * start_kernel, so we have to do, too: + */ cpu_detect(&new_cpu_data); /* head.S usually sets up the first capability word, so do it here. */ new_cpu_data.x86_capability[0] = cpuid_edx(1); @@ -1213,22 +1397,28 @@ __init void lguest_init(void) acpi_ht = 0; #endif - /* We set the preferred console to "hvc". This is the "hypervisor + /* + * We set the preferred console to "hvc". This is the "hypervisor * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. */ + * adapted for lguest's use. + */ add_preferred_console("hvc", 0, NULL); /* Register our very early console. */ virtio_cons_early_init(early_put_chars); - /* Last of all, we set the power management poweroff hook to point to + /* + * Last of all, we set the power management poweroff hook to point to * the Guest routine to power off, and the reboot hook to our restart - * routine. */ + * routine. 
+ */ pm_power_off = lguest_power_off; machine_ops.restart = lguest_restart; - /* Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. */ + /* + * Now we're set up, call i386_start_kernel() in head32.c and we proceed + * to boot as normal. It never returns. + */ i386_start_kernel(); } /* diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index a9c8cfe..27eac0f 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -5,7 +5,8 @@ #include <asm/thread_info.h> #include <asm/processor-flags.h> -/*G:020 Our story starts with the kernel booting into startup_32 in +/*G:020 + * Our story starts with the kernel booting into startup_32 in * arch/x86/kernel/head_32.S. It expects a boot header, which is created by * the bootloader (the Launcher in our case). * @@ -21,11 +22,14 @@ * data without remembering to subtract __PAGE_OFFSET! * * The .section line puts this code in .init.text so it will be discarded after - * boot. */ + * boot. + */ .section .init.text, "ax", @progbits ENTRY(lguest_entry) - /* We make the "initialization" hypercall now to tell the Host about - * us, and also find out where it put our page tables. */ + /* + * We make the "initialization" hypercall now to tell the Host about + * us, and also find out where it put our page tables. + */ movl $LHCALL_LGUEST_INIT, %eax movl $lguest_data - __PAGE_OFFSET, %ebx .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ @@ -33,13 +37,14 @@ ENTRY(lguest_entry) /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp - /* Jumps are relative, and we're running __PAGE_OFFSET too low at the - * moment. */ + /* Jumps are relative: we're running __PAGE_OFFSET too low. */ jmp lguest_init+__PAGE_OFFSET -/*G:055 We create a macro which puts the assembler code between lgstart_ and - * lgend_ markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. */ +/*G:055 + * We create a macro which puts the assembler code between lgstart_ and lgend_ + * markers. These templates are put in the .text section: they can't be + * discarded after boot as we may need to patch modules, too. + */ .text #define LGUEST_PATCH(name, insns...) \ lgstart_##name: insns; lgend_##name:; \ @@ -48,83 +53,103 @@ ENTRY(lguest_entry) LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) -/*G:033 But using those wrappers is inefficient (we'll see why that doesn't - * matter for save_fl and irq_disable later). If we write our routines - * carefully in assembler, we can avoid clobbering any registers and avoid - * jumping through the wrapper functions. +/*G:033 + * But using those wrappers is inefficient (we'll see why that doesn't matter + * for save_fl and irq_disable later). If we write our routines carefully in + * assembler, we can avoid clobbering any registers and avoid jumping through + * the wrapper functions. * * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe in easy stages. First, the routine - * to enable interrupts: */ + * in a bit more detail so I'll describe in easy stages. First, the routine to + * enable interrupts: + */ ENTRY(lg_irq_enable) - /* The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). 
*/ + /* + * The reverse of irq_disable, this sets lguest_data.irq_enabled to + * X86_EFLAGS_IF (ie. "Interrupts enabled"). + */ movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* But now we need to check if the Host wants to know: there might have + /* + * But now we need to check if the Host wants to know: there might have * been interrupts waiting to be delivered, in which case it will have * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. */ + * jump to send_interrupts, otherwise we're done. + */ testl $0, lguest_data+LGUEST_DATA_irq_pending jnz send_interrupts - /* One cool thing about x86 is that you can do many things without using + /* + * One cool thing about x86 is that you can do many things without using * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! */ + * restore any registers at all! + */ ret send_interrupts: - /* OK, now we need a register: eax is used for the hypercall number, + /* + * OK, now we need a register: eax is used for the hypercall number, * which is LHCALL_SEND_INTERRUPTS. * * We used not to bother with this pending detection at all, which was * much simpler. Sooner or later the Host would realize it had to * send us an interrupt. But that turns out to make performance 7 * times worse on a simple tcp benchmark. So now we do this the hard - * way. */ + * way. + */ pushl %eax movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is a vmcall instruction (same thing that KVM uses). Older + /* + * This is a vmcall instruction (same thing that KVM uses). Older * assembler versions might not know the "vmcall" instruction, so we - * create one manually here. */ + * create one manually here. + */ .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + /* Put eax back the way we found it. */ popl %eax ret -/* Finally, the "popf" or "restore flags" routine. The %eax register holds the +/* + * Finally, the "popf" or "restore flags" routine. The %eax register holds the * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. */ + * enabling interrupts again, if it's 0 we're leaving them off. + */ ENTRY(lg_restore_fl) /* This is just "lguest_data.irq_enabled = flags;" */ movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* Now, if the %eax value has enabled interrupts and + /* + * Now, if the %eax value has enabled interrupts and * lguest_data.irq_pending is set, we want to tell the Host so it can * deliver any outstanding interrupts. Fortunately, both values will * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. */ + * jump to send_interrupts. + */ testl lguest_data+LGUEST_DATA_irq_pending, %eax jnz send_interrupts /* Again, the normal path has used no extra registers. Clever, huh? */ ret +/*:*/ /* These demark the EIP range where host should never deliver interrupts. */ .global lguest_noirq_start .global lguest_noirq_end -/*M:004 When the Host reflects a trap or injects an interrupt into the Guest, - * it sets the eflags interrupt bit on the stack based on - * lguest_data.irq_enabled, so the Guest iret logic does the right thing when - * restoring it. 
However, when the Host sets the Guest up for direct traps, - * such as system calls, the processor is the one to push eflags onto the - * stack, and the interrupt bit will be 1 (in reality, interrupts are always - * enabled in the Guest). +/*M:004 + * When the Host reflects a trap or injects an interrupt into the Guest, it + * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, + * so the Guest iret logic does the right thing when restoring it. However, + * when the Host sets the Guest up for direct traps, such as system calls, the + * processor is the one to push eflags onto the stack, and the interrupt bit + * will be 1 (in reality, interrupts are always enabled in the Guest). * * This turns out to be harmless: the only trap which should happen under Linux * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc * regions), which has to be reflected through the Host anyway. If another * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! :*/ + * we'll never get to this iret! +:*/ -/*G:045 There is one final paravirt_op that the Guest implements, and glancing - * at it you can see why I left it to last. It's *cool*! It's in *assembler*! +/*G:045 + * There is one final paravirt_op that the Guest implements, and glancing at it + * you can see why I left it to last. It's *cool*! It's in *assembler*! * * The "iret" instruction is used to return from an interrupt or trap. The * stack looks like this: @@ -148,15 +173,18 @@ ENTRY(lg_restore_fl) * return to userspace or wherever. Our solution to this is to surround the * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. */ + * enabled. + */ ENTRY(lguest_iret) pushl %eax movl 12(%esp), %eax lguest_noirq_start: - /* Note the %ss: segment prefix here. Normal data accesses use the + /* + * Note the %ss: segment prefix here. Normal data accesses use the * "ds" segment, but that will have already been restored for whatever * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. */ + * prefix makes sure we use the stack segment, which is still valid. 
+ */ movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled popl %eax iret diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f9d3563..07c3189 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -10,6 +10,7 @@ lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o ifeq ($(CONFIG_X86_32),y) + obj-y += atomic64_32.o lib-y += checksum_32.o lib-y += strstr_32.o lib-y += semaphore_32.o string_32.o diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c new file mode 100644 index 0000000..824fa0b --- /dev/null +++ b/arch/x86/lib/atomic64_32.c @@ -0,0 +1,230 @@ +#include <linux/compiler.h> +#include <linux/module.h> +#include <linux/types.h> + +#include <asm/processor.h> +#include <asm/cmpxchg.h> +#include <asm/atomic.h> + +static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new) +{ + u32 low = new; + u32 high = new >> 32; + + asm volatile( + LOCK_PREFIX "cmpxchg8b %1\n" + : "+A" (old), "+m" (*ptr) + : "b" (low), "c" (high) + ); + return old; +} + +u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val) +{ + return cmpxchg8b(&ptr->counter, old_val, new_val); +} +EXPORT_SYMBOL(atomic64_cmpxchg); + +/** + * atomic64_xchg - xchg atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically xchgs the value of @ptr to @new_val and returns + * the old value. + */ +u64 atomic64_xchg(atomic64_t *ptr, u64 new_val) +{ + /* + * Try first with a (possibly incorrect) assumption about + * what we have there. We'll do two loops most likely, + * but we'll get an ownership MESI transaction straight away + * instead of a read transaction followed by a + * flush-for-ownership transaction: + */ + u64 old_val, real_val = 0; + + do { + old_val = real_val; + + real_val = atomic64_cmpxchg(ptr, old_val, new_val); + + } while (real_val != old_val); + + return old_val; +} +EXPORT_SYMBOL(atomic64_xchg); + +/** + * atomic64_set - set atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically sets the value of @ptr to @new_val. + */ +void atomic64_set(atomic64_t *ptr, u64 new_val) +{ + atomic64_xchg(ptr, new_val); +} +EXPORT_SYMBOL(atomic64_set); + +/** +EXPORT_SYMBOL(atomic64_read); + * atomic64_add_return - add and return + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns @delta + *@ptr + */ +noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr) +{ + /* + * Try first with a (possibly incorrect) assumption about + * what we have there. 
We'll do two loops most likely, + * but we'll get an ownership MESI transaction straight away + * instead of a read transaction followed by a + * flush-for-ownership transaction: + */ + u64 old_val, new_val, real_val = 0; + + do { + old_val = real_val; + new_val = old_val + delta; + + real_val = atomic64_cmpxchg(ptr, old_val, new_val); + + } while (real_val != old_val); + + return new_val; +} +EXPORT_SYMBOL(atomic64_add_return); + +u64 atomic64_sub_return(u64 delta, atomic64_t *ptr) +{ + return atomic64_add_return(-delta, ptr); +} +EXPORT_SYMBOL(atomic64_sub_return); + +u64 atomic64_inc_return(atomic64_t *ptr) +{ + return atomic64_add_return(1, ptr); +} +EXPORT_SYMBOL(atomic64_inc_return); + +u64 atomic64_dec_return(atomic64_t *ptr) +{ + return atomic64_sub_return(1, ptr); +} +EXPORT_SYMBOL(atomic64_dec_return); + +/** + * atomic64_add - add integer to atomic64 variable + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr. + */ +void atomic64_add(u64 delta, atomic64_t *ptr) +{ + atomic64_add_return(delta, ptr); +} +EXPORT_SYMBOL(atomic64_add); + +/** + * atomic64_sub - subtract the atomic64 variable + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr. + */ +void atomic64_sub(u64 delta, atomic64_t *ptr) +{ + atomic64_add(-delta, ptr); +} +EXPORT_SYMBOL(atomic64_sub); + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr and returns + * true if the result is zero, or false for all + * other cases. + */ +int atomic64_sub_and_test(u64 delta, atomic64_t *ptr) +{ + u64 new_val = atomic64_sub_return(delta, ptr); + + return new_val == 0; +} +EXPORT_SYMBOL(atomic64_sub_and_test); + +/** + * atomic64_inc - increment atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1. + */ +void atomic64_inc(atomic64_t *ptr) +{ + atomic64_add(1, ptr); +} +EXPORT_SYMBOL(atomic64_inc); + +/** + * atomic64_dec - decrement atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1. + */ +void atomic64_dec(atomic64_t *ptr) +{ + atomic64_sub(1, ptr); +} +EXPORT_SYMBOL(atomic64_dec); + +/** + * atomic64_dec_and_test - decrement and test + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +int atomic64_dec_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(1, ptr); +} +EXPORT_SYMBOL(atomic64_dec_and_test); + +/** + * atomic64_inc_and_test - increment and test + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +int atomic64_inc_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(-1, ptr); +} +EXPORT_SYMBOL(atomic64_inc_and_test); + +/** + * atomic64_add_negative - add and test if negative + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. 
+ */ +int atomic64_add_negative(u64 delta, atomic64_t *ptr) +{ + s64 new_val = atomic64_add_return(delta, ptr); + + return new_val < 0; +} +EXPORT_SYMBOL(atomic64_add_negative); diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 9a10a78..ebeafcc 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -5,15 +5,14 @@ * Zero a page. * rdi page */ - ALIGN -clear_page_c: +ENTRY(clear_page_c) CFI_STARTPROC movl $4096/8,%ecx xorl %eax,%eax rep stosq ret CFI_ENDPROC -ENDPROC(clear_page) +ENDPROC(clear_page_c) ENTRY(clear_page) CFI_STARTPROC diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index f118c11..6ba0f7b 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -75,6 +75,7 @@ ENTRY(copy_to_user) jae bad_to_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC +ENDPROC(copy_to_user) /* Standard copy_from_user with segment limit checking */ ENTRY(copy_from_user) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index f456860..ff485d3 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -55,8 +55,10 @@ static void delay_tsc(unsigned long loops) preempt_disable(); cpu = smp_processor_id(); + rdtsc_barrier(); rdtscl(bclock); for (;;) { + rdtsc_barrier(); rdtscl(now); if ((now - bclock) >= loops) break; @@ -78,6 +80,7 @@ static void delay_tsc(unsigned long loops) if (unlikely(cpu != smp_processor_id())) { loops -= (now - bclock); cpu = smp_processor_id(); + rdtsc_barrier(); rdtscl(bclock); } } diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 1440b9c..caa24ac 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -89,16 +89,13 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) rv.msrs = msrs; rv.msr_no = msr_no; - preempt_disable(); - /* - * FIXME: handle the CPU we're executing on separately for now until - * smp_call_function_many has been fixed to not skip it. - */ - this_cpu = raw_smp_processor_id(); - smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1); + this_cpu = get_cpu(); + + if (cpumask_test_cpu(this_cpu, mask)) + __rdmsr_on_cpu(&rv); smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); - preempt_enable(); + put_cpu(); } EXPORT_SYMBOL(rdmsr_on_cpus); @@ -121,16 +118,13 @@ void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) rv.msrs = msrs; rv.msr_no = msr_no; - preempt_disable(); - /* - * FIXME: handle the CPU we're executing on separately for now until - * smp_call_function_many has been fixed to not skip it. 
- */ - this_cpu = raw_smp_processor_id(); - smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1); + this_cpu = get_cpu(); + + if (cpumask_test_cpu(this_cpu, mask)) + __wrmsr_on_cpu(&rv); smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); - preempt_enable(); + put_cpu(); } EXPORT_SYMBOL(wrmsr_on_cpus); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 7c8ca91..1f118d4 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -751,7 +751,7 @@ survive: if (retval == -ENOMEM && is_global_init(current)) { up_read(&current->mm->mmap_sem); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto survive; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 78a5fff..bfae139 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -426,10 +426,11 @@ static noinline int vmalloc_fault(unsigned long address) } static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; +KERN_ERR +"******* Your BIOS seems to not contain a fix for K8 errata #93\n" +"******* Working around it, but it may cause SEGVs or burn power.\n" +"******* Please consider a BIOS update.\n" +"******* Disabling USB legacy in the BIOS may also help.\n"; /* * No vm86 mode in 64-bit mode: @@ -696,7 +697,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, if (!printk_ratelimit()) return; - printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 58f621e..2112ed5 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -103,6 +103,7 @@ EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kmap_atomic_prot); void __init set_highmem_pages_init(void) { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f53b57e..0607119 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -12,6 +12,7 @@ #include <asm/system.h> #include <asm/tlbflush.h> #include <asm/tlb.h> +#include <asm/proto.h> DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); @@ -177,20 +178,6 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, return nr_range; } -#ifdef CONFIG_X86_64 -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -} -#else -static inline void init_gbpages(void) -{ -} -#endif - /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from @@ -210,9 +197,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); - if (!after_bootmem) - init_gbpages(); - #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) /* * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
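The rdmsr_on_cpus()/wrmsr_on_cpus() rework above depends on smp_call_function_many() skipping the calling CPU, so the local CPU is serviced directly while preemption is held off via get_cpu()/put_cpu(). The same shape in isolation (function and parameter names below are illustrative, not from the patch):

	/* Run 'func' on every CPU in 'mask', including the calling CPU. */
	static void run_on_mask(const struct cpumask *mask,
				void (*func)(void *), void *info)
	{
		int this_cpu = get_cpu();          /* pins us to this CPU */

		if (cpumask_test_cpu(this_cpu, mask))
			func(info);                /* local CPU: direct call */

		/* smp_call_function_many() never targets the calling CPU. */
		smp_call_function_many(mask, func, info, 1);
		put_cpu();
	}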
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index c4378f4..ea56b8c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -598,6 +598,15 @@ void __init paging_init(void) sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); + + /* + * clear the default setting with node 0 + * note: don't use nodes_clear here, that is really clearing when + * numa support is not compiled in, and later node_set_state + * will not set it back. + */ + node_clear_state(0, N_NORMAL_MEMORY); + free_area_init_nodes(max_zone_pfns); } @@ -787,7 +796,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, return ret; #else - reserve_bootmem(phys, len, BOOTMEM_DEFAULT); + reserve_bootmem(phys, len, flags); #endif if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3cfe9ce..7e600c1 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -11,6 +11,7 @@ #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/pfn.h> #include <asm/e820.h> #include <asm/processor.h> @@ -590,9 +591,12 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) unsigned int level; pte_t *kpte, old_pte; - if (cpa->flags & CPA_PAGES_ARRAY) - address = (unsigned long)page_address(cpa->pages[cpa->curpage]); - else if (cpa->flags & CPA_ARRAY) + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + address = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) address = cpa->vaddr[cpa->curpage]; else address = *cpa->vaddr; @@ -681,8 +685,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; - int ret = 0; - unsigned long temp_cpa_vaddr, vaddr; + unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); + unsigned long vaddr, remapped; + int ret; if (cpa->pfn >= max_pfn_mapped) return 0; @@ -695,9 +700,12 @@ static int cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the direct * mapping already: */ - if (cpa->flags & CPA_PAGES_ARRAY) - vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]); - else if (cpa->flags & CPA_ARRAY) + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + vaddr = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) vaddr = cpa->vaddr[cpa->curpage]; else vaddr = *cpa->vaddr; @@ -706,42 +714,55 @@ static int cpa_process_alias(struct cpa_data *cpa) PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; - temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); - alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; } #ifdef CONFIG_X86_64 - if (ret) - return ret; - /* - * No need to redo, when the primary call touched the high - * mapping already: - */ - if (within(vaddr, (unsigned long) _text, _brk_end)) - return 0; - /* - * If the physical address is inside the kernel map, we need + * If the primary call didn't touch the high mapping already + * and the physical address is inside the kernel map, we need * to touch the high mapped kernel as well: */ - if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) - return 0; + if 
(!within(vaddr, (unsigned long)_text, _brk_end) && + within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { + unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + + __START_KERNEL_map - phys_base; + alias_cpa = *cpa; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - alias_cpa = *cpa; - temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; - alias_cpa.vaddr = &temp_cpa_vaddr; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + /* + * The high mapping range is imprecise, so ignore the + * return value. + */ + __change_page_attr_set_clr(&alias_cpa, 0); + } +#endif /* - * The high mapping range is imprecise, so ignore the return value. + * If the PMD page was partially used for per-cpu remapping, + * the recycled area needs to be split and modified. Because + * the area is always proper subset of a PMD page + * cpa->numpages is guaranteed to be 1 for these areas, so + * there's no need to loop over and check for further remaps. */ - __change_page_attr_set_clr(&alias_cpa, 0); -#endif - return ret; + remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); + if (remapped) { + WARN_ON(cpa->numpages > 1); + alias_cpa = *cpa; + alias_cpa.vaddr = &remapped; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; + } + + return 0; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) @@ -982,12 +1003,15 @@ EXPORT_SYMBOL(set_memory_array_uc); int _set_memory_wc(unsigned long addr, int numpages) { int ret; + unsigned long addr_copy = addr; + ret = change_page_attr_set(&addr, numpages, __pgprot(_PAGE_CACHE_UC_MINUS), 0); - if (!ret) { - ret = change_page_attr_set(&addr, numpages, - __pgprot(_PAGE_CACHE_WC), 0); + ret = change_page_attr_set_clr(&addr_copy, numpages, + __pgprot(_PAGE_CACHE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, 0, NULL); } return ret; } @@ -1104,7 +1128,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray) int free_idx; for (i = 0; i < addrinarray; i++) { - start = (unsigned long)page_address(pages[i]); + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) goto err_out; @@ -1117,7 +1143,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray) err_out: free_idx = i; for (i = 0; i < free_idx; i++) { - start = (unsigned long)page_address(pages[i]); + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; free_memtype(start, end); } @@ -1146,7 +1174,9 @@ int set_pages_array_wb(struct page **pages, int addrinarray) return retval; for (i = 0; i < addrinarray; i++) { - start = (unsigned long)page_address(pages[i]); + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; free_memtype(start, end); } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e6718bb..352aa9e 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -623,7 +623,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, return ret; if (flags != want_flags) { - if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) { + if (strict_prot || + !is_new_memtype_allowed(paddr, size, want_flags, flags)) { free_memtype(paddr, paddr + size); printk(KERN_ERR "%s:%d map pfn expected mapping type %s" " for %Lx-%Lx, got %s\n", diff --git a/arch/x86/mm/pgtable.c 
b/arch/x86/mm/pgtable.c index 8e43bdd..ed34f5e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -25,7 +25,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) return pte; } -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); @@ -33,14 +33,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) } #if PAGETABLE_LEVELS > 2 -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pmd)); } #if PAGETABLE_LEVELS > 3 -void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pud)); @@ -329,7 +329,6 @@ void __init reserve_top_address(unsigned long reserve) printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", (int)-reserve); __FIXADDR_TOP = -reserve - PAGE_SIZE; - __VMALLOC_RESERVE += reserve; #endif } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 2dfcbf9..dbb5381 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -79,8 +79,10 @@ static __init void bad_srat(void) acpi_numa = -1; for (i = 0; i < MAX_LOCAL_APIC; i++) apicid_to_node[i] = NUMA_NO_NODE; - for (i = 0; i < MAX_NUMNODES; i++) - nodes_add[i].start = nodes[i].end = 0; + for (i = 0; i < MAX_NUMNODES; i++) { + nodes[i].start = nodes[i].end = 0; + nodes_add[i].start = nodes_add[i].end = 0; + } remove_all_active_ranges(); } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 821e970..c814e14 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -183,18 +183,17 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, f->flush_mm = mm; f->flush_va = va; - cpumask_andnot(to_cpumask(f->flush_cpumask), - cpumask, cpumask_of(smp_processor_id())); - - /* - * We have to send the IPI only to - * CPUs affected. - */ - apic->send_IPI_mask(to_cpumask(f->flush_cpumask), - INVALIDATE_TLB_VECTOR_START + sender); + if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { + /* + * We have to send the IPI only to + * CPUs affected. 
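The flush_tlb_others_ipi() change being made here keys off the return value of cpumask_andnot(), which is non-zero only when the resulting mask is non-empty; that is what lets both the IPI and the completion wait be skipped when the caller was the only CPU in the incoming mask. The guard in isolation (the function and parameter names below are illustrative):

	/* Only IPI and wait if some other CPU actually needs the flush. */
	static void flush_others(struct cpumask *dest, const struct cpumask *request,
				 void (*fire_ipi)(const struct cpumask *))
	{
		if (cpumask_andnot(dest, request, cpumask_of(smp_processor_id()))) {
			fire_ipi(dest);              /* stands in for send_IPI_mask() */
			while (!cpumask_empty(dest)) /* handlers clear themselves     */
				cpu_relax();
		}
	}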
+ */ + apic->send_IPI_mask(to_cpumask(f->flush_cpumask), + INVALIDATE_TLB_VECTOR_START + sender); - while (!cpumask_empty(to_cpumask(f->flush_cpumask))) - cpu_relax(); + while (!cpumask_empty(to_cpumask(f->flush_cpumask))) + cpu_relax(); + } f->flush_mm = NULL; f->flush_va = 0; diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b07dd8d..89b9a5c 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -390,7 +390,7 @@ static int __init p4_init(char **cpu_type) static int force_arch_perfmon; static int force_cpu_type(const char *str, struct kernel_param *kp) { - if (!strcmp(str, "archperfmon")) { + if (!strcmp(str, "arch_perfmon")) { force_arch_perfmon = 1; printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); } diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index b26626d..1014eb4 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -68,6 +68,10 @@ setup_resource(struct acpi_resource *acpi_res, void *data) unsigned long flags; struct resource *root; int max_root_bus_resources = PCI_BUS_NUM_RESOURCES; + u64 start, end; + + if (bus_has_transparent_bridge(info->bus)) + max_root_bus_resources -= 3; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) @@ -84,25 +88,24 @@ setup_resource(struct acpi_resource *acpi_res, void *data) } else return AE_OK; - res = &info->res[info->res_num]; - res->name = info->name; - res->flags = flags; - res->start = addr.minimum + addr.translation_offset; - res->end = res->start + addr.address_length - 1; - res->child = NULL; - - if (bus_has_transparent_bridge(info->bus)) - max_root_bus_resources -= 3; + start = addr.minimum + addr.translation_offset; + end = start + addr.address_length - 1; if (info->res_num >= max_root_bus_resources) { printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " "from %s for %s due to _CRS returning more than " - "%d resource descriptors\n", (unsigned long) res->start, - (unsigned long) res->end, root->name, info->name, + "%d resource descriptors\n", (unsigned long) start, + (unsigned long) end, root->name, info->name, max_root_bus_resources); - info->res_num++; return AE_OK; } + res = &info->res[info->res_num]; + res->name = info->name; + res->flags = flags; + res->start = start; + res->end = end; + res->child = NULL; + if (insert_resource(root, res)) { printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " "from %s for %s\n", (unsigned long) res->start, @@ -115,23 +118,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) } static void -adjust_transparent_bridge_resources(struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - int i; - u16 class = dev->class >> 8; - - if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) { - for(i = 3; i < PCI_BUS_NUM_RESOURCES; i++) - dev->subordinate->resource[i] = - dev->bus->resource[i - 3]; - } - } -} - -static void get_current_resources(struct acpi_device *device, int busnum, int domain, struct pci_bus *bus) { @@ -158,8 +144,6 @@ get_current_resources(struct acpi_device *device, int busnum, info.res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, &info); - if (info.res_num) - adjust_transparent_bridge_resources(bus); return; @@ -222,8 +206,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do */ memcpy(bus->sysdata, sd, sizeof(*sd)); kfree(sd); - } else - bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); + } else { + bus = pci_create_bus(NULL, busnum, &pci_root_ops, 
sd); + if (bus) { + if (pci_probe & PCI_USE__CRS) + get_current_resources(device, busnum, domain, + bus); + bus->subordinate = pci_scan_child_bus(bus); + } + } if (!bus) kfree(sd); @@ -238,8 +229,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do #endif } - if (bus && (pci_probe & PCI_USE__CRS)) - get_current_resources(device, busnum, domain, bus); return bus; } diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index f893d6a..3ffa10d 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -100,8 +100,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) int j; struct pci_root_info *info; - /* don't go for it if _CRS is used */ - if (pci_probe & PCI_USE__CRS) + /* don't go for it if _CRS is used already */ + if (b->resource[0] != &ioport_resource || + b->resource[1] != &iomem_resource) return; /* if only one root bus, don't need to anything */ @@ -116,6 +117,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) if (i == pci_root_num) return; + printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", + b->number); + info = &pci_root_info[i]; for (j = 0; j < info->res_num; j++) { struct resource *res; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 0fb56db..52e62e5 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -35,6 +35,7 @@ #include <asm/pat.h> #include <asm/e820.h> #include <asm/pci_x86.h> +#include <asm/io_apic.h> static int @@ -227,6 +228,12 @@ void __init pcibios_resource_survey(void) pcibios_allocate_resources(1); e820_reserve_resources_late(); + /* + * Insert the IO APIC resources after PCI initialization has + * occured to handle IO APICS that are mapped in on a BAR in + * PCI space, but before trying to assign unassigned pci res. + */ + ioapic_insert_resources(); } /** diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index de2abbd..a6a198c 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,7 +1,7 @@ # __restore_processor_state() restores %gs after S3 resume and so should not # itself be stack-protected nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_cpu_$(BITS).o := $(nostackp) +CFLAGS_cpu.o := $(nostackp) obj-$(CONFIG_PM_SLEEP) += cpu.o obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index d277ef1..b3d20b9 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -244,7 +244,7 @@ static void __restore_processor_state(struct saved_context *ctxt) do_fpu_end(); mtrr_ap_init(); -#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_OLD_MCE mcheck_init(&boot_cpu_data); #endif } diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 172438f..7410640 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -5,6 +5,10 @@ CFLAGS_REMOVE_time.o = -pg CFLAGS_REMOVE_irq.o = -pg endif +# Make sure early boot has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_enlighten.o := $(nostackp) + obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ grant-table.o suspend.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0a1700a..e90540a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -974,10 +974,6 @@ asmlinkage void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; - BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); - - xen_setup_features(); - /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; @@ 
-986,8 +982,15 @@ asmlinkage void __init xen_start_kernel(void) pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; - xen_init_irq_ops(); +#ifdef CONFIG_X86_64 + /* + * Setup percpu state. We only need to do this for 64-bit + * because 32-bit already has %fs set properly. + */ + load_percpu_segment(0); +#endif + xen_init_irq_ops(); xen_init_cpuid_mask(); #ifdef CONFIG_X86_LOCAL_APIC @@ -997,6 +1000,8 @@ asmlinkage void __init xen_start_kernel(void) set_xen_basic_apic_ops(); #endif + xen_setup_features(); + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; @@ -1004,13 +1009,6 @@ asmlinkage void __init xen_start_kernel(void) machine_ops = xen_machine_ops; -#ifdef CONFIG_X86_64 - /* - * Setup percpu state. We only need to do this for 64-bit - * because 32-bit already has %fs set properly. - */ - load_percpu_segment(0); -#endif /* * The only reliable way to retain the initial address of the * percpu gdt_page is to remember it here, so we can go and |
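Looking back at the pageattr.c hunks earlier in this patch: highmem pages have no permanent kernel mapping, so set_pages_array_uc()/_wb() now skip them and key the memtype bookkeeping off the physical address rather than page_address(). The per-page computation in isolation (the helper name below is illustrative, not from the patch):

	/* Reserve a UC- memtype for one page, skipping highmem pages. */
	static int reserve_page_uc(struct page *page)
	{
		u64 start, end;

		if (PageHighMem(page))
			return 0;   /* no permanent linear mapping to re-attribute */

		start = (u64)page_to_pfn(page) << PAGE_SHIFT;
		end = start + PAGE_SIZE;

		return reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL);
	}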