From d751c169e9a6f0f853346f1184881422bd10b3c2 Mon Sep 17 00:00:00 2001 From: Michael Davidson Date: Thu, 10 Oct 2013 18:39:54 -0700 Subject: x86, relocs: Add more per-cpu gold special cases The "gold" linker doesn't seem to put some additional per-cpu cases in the right place. Add these to the per-cpu check. Without this, the kASLR patch series fails to correctly apply relocations, and fails to boot. Signed-off-by: Michael Davidson Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/20131011013954.GA28902@www.outflux.net Signed-off-by: H. Peter Anvin --- arch/x86/tools/relocs.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index f7bab68..71a2533 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -722,15 +722,23 @@ static void percpu_init(void) /* * Check to see if a symbol lies in the .data..percpu section. - * For some as yet not understood reason the "__init_begin" - * symbol which immediately preceeds the .data..percpu section - * also shows up as it it were part of it so we do an explict - * check for that symbol name and ignore it. + * + * The linker incorrectly associates some symbols with the + * .data..percpu section so we also need to check the symbol + * name to make sure that we classify the symbol correctly. + * + * The GNU linker incorrectly associates: + * __init_begin + * + * The "gold" linker incorrectly associates: + * init_per_cpu__irq_stack_union + * init_per_cpu__gdt_page */ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) { return (sym->st_shndx == per_cpu_shndx) && - strcmp(symname, "__init_begin"); + strcmp(symname, "__init_begin") && + strncmp(symname, "init_per_cpu_", 13); } -- cgit v1.1 From dd78b97367bd575918204cc89107c1479d3fc1a7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 10 Oct 2013 17:18:13 -0700 Subject: x86, boot: Move CPU flags out of cpucheck Refactor the CPU flags handling out of the cpucheck routines so that they can be reused by the future ASLR routines (in order to detect CPU features like RDRAND and RDTSC). This reworks has_eflag() and has_fpu() to be used on both 32-bit and 64-bit, and refactors the calls to cpuid to make them PIC-safe on 32-bit. Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/1381450698-28710-2-git-send-email-keescook@chromium.org Signed-off-by: H. Peter Anvin --- arch/x86/boot/Makefile | 2 +- arch/x86/boot/boot.h | 10 +--- arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/cpuflags.c | 12 +++++ arch/x86/boot/cpucheck.c | 86 ----------------------------- arch/x86/boot/cpuflags.c | 104 ++++++++++++++++++++++++++++++++++++ arch/x86/boot/cpuflags.h | 19 +++++++ 7 files changed, 138 insertions(+), 97 deletions(-) create mode 100644 arch/x86/boot/compressed/cpuflags.c create mode 100644 arch/x86/boot/cpuflags.c create mode 100644 arch/x86/boot/cpuflags.h (limited to 'arch/x86') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 379814b..0da2e37 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -20,7 +20,7 @@ targets := vmlinux.bin setup.bin setup.elf bzImage targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed -setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o +setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o setup-y += video-mode.o version.o diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index ef72bae..50f8c5e 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -26,9 +26,8 @@ #include #include #include "bitops.h" -#include -#include #include "ctype.h" +#include "cpuflags.h" /* Useful macros */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) @@ -307,14 +306,7 @@ static inline int cmdline_find_option_bool(const char *option) return __cmdline_find_option_bool(cmd_line_ptr, option); } - /* cpu.c, cpucheck.c */ -struct cpu_features { - int level; /* Family, or 64 for x86-64 */ - int model; - u32 flags[NCAPINTS]; -}; -extern struct cpu_features cpu; int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); int validate_cpu(void); diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index dcd90df..3312f1be 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -27,7 +27,7 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \ - $(obj)/piggy.o + $(obj)/piggy.o $(obj)/cpuflags.o $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c new file mode 100644 index 0000000..931cba6 --- /dev/null +++ b/arch/x86/boot/compressed/cpuflags.c @@ -0,0 +1,12 @@ +#ifdef CONFIG_RANDOMIZE_BASE + +#include "../cpuflags.c" + +bool has_cpuflag(int flag) +{ + get_flags(); + + return test_bit(flag, cpu.flags); +} + +#endif diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 4d3ff03..e1f3c16 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -28,8 +28,6 @@ #include #include -struct cpu_features cpu; -static u32 cpu_vendor[3]; static u32 err_flags[NCAPINTS]; static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY; @@ -69,90 +67,6 @@ static int is_transmeta(void) cpu_vendor[2] == A32('M', 'x', '8', '6'); } -static int has_fpu(void) -{ - u16 fcw = -1, fsw = -1; - u32 cr0; - - asm("movl %%cr0,%0" : "=r" (cr0)); - if (cr0 & (X86_CR0_EM|X86_CR0_TS)) { - cr0 &= ~(X86_CR0_EM|X86_CR0_TS); - asm volatile("movl %0,%%cr0" : : "r" (cr0)); - } - - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); - - return fsw == 0 && (fcw & 0x103f) == 0x003f; -} - -static int has_eflag(u32 mask) -{ - u32 f0, f1; - - asm("pushfl ; " - "pushfl ; " - "popl %0 ; " - "movl %0,%1 ; " - "xorl %2,%1 ; " - "pushl %1 ; " - "popfl ; " - "pushfl ; " - "popl %1 ; " - "popfl" - : "=&r" (f0), "=&r" (f1) - : "ri" (mask)); - - return !!((f0^f1) & mask); -} - -static void get_flags(void) -{ - u32 max_intel_level, max_amd_level; - u32 tfms; - - if (has_fpu()) - set_bit(X86_FEATURE_FPU, cpu.flags); - - if (has_eflag(X86_EFLAGS_ID)) { - asm("cpuid" - : "=a" (max_intel_level), - "=b" (cpu_vendor[0]), - "=d" (cpu_vendor[1]), - "=c" (cpu_vendor[2]) - : "a" (0)); - - if (max_intel_level >= 0x00000001 && - max_intel_level <= 0x0000ffff) { - asm("cpuid" - : "=a" (tfms), - "=c" (cpu.flags[4]), - "=d" (cpu.flags[0]) - : "a" (0x00000001) - : "ebx"); - cpu.level = (tfms >> 8) & 15; - cpu.model = (tfms >> 4) & 15; - if (cpu.level >= 6) - cpu.model += ((tfms >> 16) & 0xf) << 4; - } - - asm("cpuid" - : "=a" (max_amd_level) - : "a" (0x80000000) - : "ebx", "ecx", "edx"); - - if (max_amd_level >= 0x80000001 && - max_amd_level <= 0x8000ffff) { - u32 eax = 0x80000001; - asm("cpuid" - : "+a" (eax), - "=c" (cpu.flags[6]), - "=d" (cpu.flags[1]) - : : "ebx"); - } - } -} - /* Returns a bitmask of which words we have error bits in */ static int check_flags(void) { diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c new file mode 100644 index 0000000..b02544a --- /dev/null +++ b/arch/x86/boot/cpuflags.c @@ -0,0 +1,104 @@ +#include +#include "bitops.h" + +#include +#include +#include +#include "cpuflags.h" + +struct cpu_features cpu; +u32 cpu_vendor[3]; + +static bool loaded_flags; + +static int has_fpu(void) +{ + u16 fcw = -1, fsw = -1; + unsigned long cr0; + + asm volatile("mov %%cr0,%0" : "=r" (cr0)); + if (cr0 & (X86_CR0_EM|X86_CR0_TS)) { + cr0 &= ~(X86_CR0_EM|X86_CR0_TS); + asm volatile("mov %0,%%cr0" : : "r" (cr0)); + } + + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" + : "+m" (fsw), "+m" (fcw)); + + return fsw == 0 && (fcw & 0x103f) == 0x003f; +} + +int has_eflag(unsigned long mask) +{ + unsigned long f0, f1; + + asm volatile("pushf \n\t" + "pushf \n\t" + "pop %0 \n\t" + "mov %0,%1 \n\t" + "xor %2,%1 \n\t" + "push %1 \n\t" + "popf \n\t" + "pushf \n\t" + "pop %1 \n\t" + "popf" + : "=&r" (f0), "=&r" (f1) + : "ri" (mask)); + + return !!((f0^f1) & mask); +} + +/* Handle x86_32 PIC using ebx. */ +#if defined(__i386__) && defined(__PIC__) +# define EBX_REG "=r" +#else +# define EBX_REG "=b" +#endif + +static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d) +{ + asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t" + "cpuid \n\t" + ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t" + : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b) + : "a" (id) + ); +} + +void get_flags(void) +{ + u32 max_intel_level, max_amd_level; + u32 tfms; + u32 ignored; + + if (loaded_flags) + return; + loaded_flags = true; + + if (has_fpu()) + set_bit(X86_FEATURE_FPU, cpu.flags); + + if (has_eflag(X86_EFLAGS_ID)) { + cpuid(0x0, &max_intel_level, &cpu_vendor[0], &cpu_vendor[2], + &cpu_vendor[1]); + + if (max_intel_level >= 0x00000001 && + max_intel_level <= 0x0000ffff) { + cpuid(0x1, &tfms, &ignored, &cpu.flags[4], + &cpu.flags[0]); + cpu.level = (tfms >> 8) & 15; + cpu.model = (tfms >> 4) & 15; + if (cpu.level >= 6) + cpu.model += ((tfms >> 16) & 0xf) << 4; + } + + cpuid(0x80000000, &max_amd_level, &ignored, &ignored, + &ignored); + + if (max_amd_level >= 0x80000001 && + max_amd_level <= 0x8000ffff) { + cpuid(0x80000001, &ignored, &ignored, &cpu.flags[6], + &cpu.flags[1]); + } + } +} diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h new file mode 100644 index 0000000..9bb4e25 --- /dev/null +++ b/arch/x86/boot/cpuflags.h @@ -0,0 +1,19 @@ +#ifndef BOOT_CPUFLAGS_H +#define BOOT_CPUFLAGS_H + +#include +#include + +struct cpu_features { + int level; /* Family, or 64 for x86-64 */ + int model; + u32 flags[NCAPINTS]; +}; + +extern struct cpu_features cpu; +extern u32 cpu_vendor[3]; + +int has_eflag(unsigned long mask); +void get_flags(void); + +#endif -- cgit v1.1 From 8ab3820fd5b2896d66da7bb2a906bc382e63e7bc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 10 Oct 2013 17:18:14 -0700 Subject: x86, kaslr: Return location from decompress_kernel This allows decompress_kernel to return a new location for the kernel to be relocated to. Additionally, enforces CONFIG_PHYSICAL_START as the minimum relocation position when building with CONFIG_RELOCATABLE. With CONFIG_RANDOMIZE_BASE set, the choose_kernel_location routine will select a new location to decompress the kernel, though here it is presently a no-op. The kernel command line option "nokaslr" is introduced to bypass these routines. Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/1381450698-28710-3-git-send-email-keescook@chromium.org Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 38 ++++++++++++++++++++++++++++++++++---- arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/aslr.c | 23 +++++++++++++++++++++++ arch/x86/boot/compressed/cmdline.c | 2 +- arch/x86/boot/compressed/head_32.S | 10 ++++++---- arch/x86/boot/compressed/head_64.S | 16 ++++++++++------ arch/x86/boot/compressed/misc.c | 8 ++++++-- arch/x86/boot/compressed/misc.h | 27 +++++++++++++++++++++------ 8 files changed, 102 insertions(+), 24 deletions(-) create mode 100644 arch/x86/boot/compressed/aslr.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ee2fb9d..992701d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1722,16 +1722,46 @@ config RELOCATABLE Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address it has been loaded at and the compile time physical address - (CONFIG_PHYSICAL_START) is ignored. + (CONFIG_PHYSICAL_START) is used as the minimum location. -# Relocation on x86-32 needs some additional build support +config RANDOMIZE_BASE + bool "Randomize the address of the kernel image" + depends on RELOCATABLE + depends on !HIBERNATION + default n + ---help--- + Randomizes the physical and virtual address at which the + kernel image is decompressed, as a security feature that + deters exploit attempts relying on knowledge of the location + of kernel internals. + + Entropy is generated using the RDRAND instruction if it + is supported. If not, then RDTSC is used, if supported. If + neither RDRAND nor RDTSC are supported, then no randomness + is introduced. + + The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET, + and aligned according to PHYSICAL_ALIGN. + +config RANDOMIZE_BASE_MAX_OFFSET + hex "Maximum ASLR offset allowed" + depends on RANDOMIZE_BASE + default "0x10000000" + range 0x0 0x10000000 + ---help--- + Determines the maximal offset in bytes that will be applied to the + kernel when Address Space Layout Randomization (ASLR) is active. + Must be less than or equal to the actual physical memory on the + system. This must be a power of two. + +# Relocation on x86 needs some additional build support config X86_NEED_RELOCS def_bool y - depends on X86_32 && RELOCATABLE + depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE) config PHYSICAL_ALIGN hex "Alignment value to which kernel should be aligned" - default "0x1000000" + default "0x200000" range 0x2000 0x1000000 if X86_32 range 0x200000 0x1000000 if X86_64 ---help--- diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3312f1be..ae8b5db 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -27,7 +27,7 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \ - $(obj)/piggy.o $(obj)/cpuflags.o + $(obj)/piggy.o $(obj)/cpuflags.o $(obj)/aslr.o $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c new file mode 100644 index 0000000..b73cc66 --- /dev/null +++ b/arch/x86/boot/compressed/aslr.c @@ -0,0 +1,23 @@ +#include "misc.h" + +#ifdef CONFIG_RANDOMIZE_BASE + +unsigned char *choose_kernel_location(unsigned char *input, + unsigned long input_size, + unsigned char *output, + unsigned long output_size) +{ + unsigned long choice = (unsigned long)output; + + if (cmdline_find_option_bool("nokaslr")) { + debug_putstr("KASLR disabled...\n"); + goto out; + } + + /* XXX: choose random location. */ + +out: + return (unsigned char *)choice; +} + +#endif /* CONFIG_RANDOMIZE_BASE */ diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index bffd73b..b68e303 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -1,6 +1,6 @@ #include "misc.h" -#ifdef CONFIG_EARLY_PRINTK +#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE static unsigned long fs; static inline void set_fs(unsigned long seg) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 5d6f6891..9116aac 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -117,9 +117,11 @@ preferred_addr: addl %eax, %ebx notl %eax andl %eax, %ebx -#else - movl $LOAD_PHYSICAL_ADDR, %ebx + cmpl $LOAD_PHYSICAL_ADDR, %ebx + jge 1f #endif + movl $LOAD_PHYSICAL_ADDR, %ebx +1: /* Target address to relocate to for decompression */ addl $z_extract_offset, %ebx @@ -191,14 +193,14 @@ relocated: leal boot_heap(%ebx), %eax pushl %eax /* heap area */ pushl %esi /* real mode pointer */ - call decompress_kernel + call decompress_kernel /* returns kernel location in %eax */ addl $24, %esp /* * Jump to the decompressed kernel. */ xorl %ebx, %ebx - jmp *%ebp + jmp *%eax /* * Stack and heap for uncompression diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index c337422..c5c1ae0 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -94,9 +94,11 @@ ENTRY(startup_32) addl %eax, %ebx notl %eax andl %eax, %ebx -#else - movl $LOAD_PHYSICAL_ADDR, %ebx + cmpl $LOAD_PHYSICAL_ADDR, %ebx + jge 1f #endif + movl $LOAD_PHYSICAL_ADDR, %ebx +1: /* Target address to relocate to for decompression */ addl $z_extract_offset, %ebx @@ -269,9 +271,11 @@ preferred_addr: addq %rax, %rbp notq %rax andq %rax, %rbp -#else - movq $LOAD_PHYSICAL_ADDR, %rbp + cmpq $LOAD_PHYSICAL_ADDR, %rbp + jge 1f #endif + movq $LOAD_PHYSICAL_ADDR, %rbp +1: /* Target address to relocate to for decompression */ leaq z_extract_offset(%rbp), %rbx @@ -339,13 +343,13 @@ relocated: movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ movq $z_output_len, %r9 /* decompressed length */ - call decompress_kernel + call decompress_kernel /* returns kernel location in %rax */ popq %rsi /* * Jump to the decompressed kernel. */ - jmp *%rbp + jmp *%rax .code32 no_longmode: diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 434f077..7138768 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -395,7 +395,7 @@ static void parse_elf(void *output) free(phdrs); } -asmlinkage void decompress_kernel(void *rmode, memptr heap, +asmlinkage void *decompress_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, @@ -422,6 +422,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; + output = choose_kernel_location(input_data, input_len, + output, output_len); + + /* Validate memory location choices. */ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) error("Destination address inappropriately aligned"); #ifdef CONFIG_X86_64 @@ -441,5 +445,5 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, parse_elf(output); handle_relocations(output, output_len); debug_putstr("done.\nBooting the kernel.\n"); - return; + return output; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 674019d..9077af7 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -39,23 +39,38 @@ static inline void debug_putstr(const char *s) #endif -#ifdef CONFIG_EARLY_PRINTK - +#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE /* cmdline.c */ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); +#endif -/* early_serial_console.c */ -extern int early_serial_base; -void console_init(void); +#if CONFIG_RANDOMIZE_BASE +/* aslr.c */ +unsigned char *choose_kernel_location(unsigned char *input, + unsigned long input_size, + unsigned char *output, + unsigned long output_size); #else +static inline +unsigned char *choose_kernel_location(unsigned char *input, + unsigned long input_size, + unsigned char *output, + unsigned long output_size) +{ + return output; +} +#endif +#ifdef CONFIG_EARLY_PRINTK /* early_serial_console.c */ +extern int early_serial_base; +void console_init(void); +#else static const int early_serial_base; static inline void console_init(void) { } - #endif #endif -- cgit v1.1 From 5bfce5ef55cbe78ee2ee6e97f2e26a8a582008f3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 10 Oct 2013 17:18:15 -0700 Subject: x86, kaslr: Provide randomness functions Adds potential sources of randomness: RDRAND, RDTSC, or the i8254. This moves the pre-alternatives inline rdrand function into the header so both pieces of code can use it. Availability of RDRAND is then controlled by CONFIG_ARCH_RANDOM, if someone wants to disable it even for kASLR. Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/1381450698-28710-4-git-send-email-keescook@chromium.org Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/aslr.c | 53 +++++++++++++++++++++++++++++++++++++++ arch/x86/boot/compressed/misc.h | 2 ++ arch/x86/include/asm/archrandom.h | 21 ++++++++++++++++ arch/x86/kernel/cpu/rdrand.c | 14 ----------- 4 files changed, 76 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index b73cc66..14b24e0 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -1,6 +1,59 @@ #include "misc.h" #ifdef CONFIG_RANDOMIZE_BASE +#include +#include + +#define I8254_PORT_CONTROL 0x43 +#define I8254_PORT_COUNTER0 0x40 +#define I8254_CMD_READBACK 0xC0 +#define I8254_SELECT_COUNTER0 0x02 +#define I8254_STATUS_NOTREADY 0x40 +static inline u16 i8254(void) +{ + u16 status, timer; + + do { + outb(I8254_PORT_CONTROL, + I8254_CMD_READBACK | I8254_SELECT_COUNTER0); + status = inb(I8254_PORT_COUNTER0); + timer = inb(I8254_PORT_COUNTER0); + timer |= inb(I8254_PORT_COUNTER0) << 8; + } while (status & I8254_STATUS_NOTREADY); + + return timer; +} + +static unsigned long get_random_long(void) +{ + unsigned long random; + + if (has_cpuflag(X86_FEATURE_RDRAND)) { + debug_putstr("KASLR using RDRAND...\n"); + if (rdrand_long(&random)) + return random; + } + + if (has_cpuflag(X86_FEATURE_TSC)) { + uint32_t raw; + + debug_putstr("KASLR using RDTSC...\n"); + rdtscl(raw); + + /* Only use the low bits of rdtsc. */ + random = raw & 0xffff; + } else { + debug_putstr("KASLR using i8254...\n"); + random = i8254(); + } + + /* Extend timer bits poorly... */ + random |= (random << 16); +#ifdef CONFIG_X86_64 + random |= (random << 32); +#endif + return random; +} unsigned char *choose_kernel_location(unsigned char *input, unsigned long input_size, diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 9077af7..0782eb0 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -52,6 +52,8 @@ unsigned char *choose_kernel_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size); +/* cpuflags.c */ +bool has_cpuflag(int flag); #else static inline unsigned char *choose_kernel_location(unsigned char *input, diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 0d9ec77..e6a9245 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -39,6 +39,20 @@ #ifdef CONFIG_ARCH_RANDOM +/* Instead of arch_get_random_long() when alternatives haven't run. */ +static inline int rdrand_long(unsigned long *v) +{ + int ok; + asm volatile("1: " RDRAND_LONG "\n\t" + "jc 2f\n\t" + "decl %0\n\t" + "jnz 1b\n\t" + "2:" + : "=r" (ok), "=a" (*v) + : "0" (RDRAND_RETRY_LOOPS)); + return ok; +} + #define GET_RANDOM(name, type, rdrand, nop) \ static inline int name(type *v) \ { \ @@ -68,6 +82,13 @@ GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); #endif /* CONFIG_X86_64 */ +#else + +static inline int rdrand_long(unsigned long *v) +{ + return 0; +} + #endif /* CONFIG_ARCH_RANDOM */ extern void x86_init_rdrand(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 88db010..384df51 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -31,20 +31,6 @@ static int __init x86_rdrand_setup(char *s) } __setup("nordrand", x86_rdrand_setup); -/* We can't use arch_get_random_long() here since alternatives haven't run */ -static inline int rdrand_long(unsigned long *v) -{ - int ok; - asm volatile("1: " RDRAND_LONG "\n\t" - "jc 2f\n\t" - "decl %0\n\t" - "jnz 1b\n\t" - "2:" - : "=r" (ok), "=a" (*v) - : "0" (RDRAND_RETRY_LOOPS)); - return ok; -} - /* * Force a reseed cycle; we are architecturally guaranteed a reseed * after no more than 512 128-bit chunks of random data. This also -- cgit v1.1 From 82fa9637a2ba285bcc7c5050c73010b2c1b3d803 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 10 Oct 2013 17:18:16 -0700 Subject: x86, kaslr: Select random position from e820 maps Counts available alignment positions across all e820 maps, and chooses one randomly for the new kernel base address, making sure not to collide with unsafe memory areas. Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/1381450698-28710-5-git-send-email-keescook@chromium.org Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/aslr.c | 193 +++++++++++++++++++++++++++++++++++++++- arch/x86/boot/compressed/misc.c | 10 +-- arch/x86/boot/compressed/misc.h | 8 ++ 3 files changed, 202 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 14b24e0..0595798 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -3,6 +3,7 @@ #ifdef CONFIG_RANDOMIZE_BASE #include #include +#include #define I8254_PORT_CONTROL 0x43 #define I8254_PORT_COUNTER0 0x40 @@ -55,20 +56,210 @@ static unsigned long get_random_long(void) return random; } +struct mem_vector { + unsigned long start; + unsigned long size; +}; + +#define MEM_AVOID_MAX 5 +struct mem_vector mem_avoid[MEM_AVOID_MAX]; + +static bool mem_contains(struct mem_vector *region, struct mem_vector *item) +{ + /* Item at least partially before region. */ + if (item->start < region->start) + return false; + /* Item at least partially after region. */ + if (item->start + item->size > region->start + region->size) + return false; + return true; +} + +static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) +{ + /* Item one is entirely before item two. */ + if (one->start + one->size <= two->start) + return false; + /* Item one is entirely after item two. */ + if (one->start >= two->start + two->size) + return false; + return true; +} + +static void mem_avoid_init(unsigned long input, unsigned long input_size, + unsigned long output, unsigned long output_size) +{ + u64 initrd_start, initrd_size; + u64 cmd_line, cmd_line_size; + unsigned long unsafe, unsafe_len; + char *ptr; + + /* + * Avoid the region that is unsafe to overlap during + * decompression (see calculations at top of misc.c). + */ + unsafe_len = (output_size >> 12) + 32768 + 18; + unsafe = (unsigned long)input + input_size - unsafe_len; + mem_avoid[0].start = unsafe; + mem_avoid[0].size = unsafe_len; + + /* Avoid initrd. */ + initrd_start = (u64)real_mode->ext_ramdisk_image << 32; + initrd_start |= real_mode->hdr.ramdisk_image; + initrd_size = (u64)real_mode->ext_ramdisk_size << 32; + initrd_size |= real_mode->hdr.ramdisk_size; + mem_avoid[1].start = initrd_start; + mem_avoid[1].size = initrd_size; + + /* Avoid kernel command line. */ + cmd_line = (u64)real_mode->ext_cmd_line_ptr << 32; + cmd_line |= real_mode->hdr.cmd_line_ptr; + /* Calculate size of cmd_line. */ + ptr = (char *)(unsigned long)cmd_line; + for (cmd_line_size = 0; ptr[cmd_line_size++]; ) + ; + mem_avoid[2].start = cmd_line; + mem_avoid[2].size = cmd_line_size; + + /* Avoid heap memory. */ + mem_avoid[3].start = (unsigned long)free_mem_ptr; + mem_avoid[3].size = BOOT_HEAP_SIZE; + + /* Avoid stack memory. */ + mem_avoid[4].start = (unsigned long)free_mem_end_ptr; + mem_avoid[4].size = BOOT_STACK_SIZE; +} + +/* Does this memory vector overlap a known avoided area? */ +bool mem_avoid_overlap(struct mem_vector *img) +{ + int i; + + for (i = 0; i < MEM_AVOID_MAX; i++) { + if (mem_overlaps(img, &mem_avoid[i])) + return true; + } + + return false; +} + +unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / CONFIG_PHYSICAL_ALIGN]; +unsigned long slot_max = 0; + +static void slots_append(unsigned long addr) +{ + /* Overflowing the slots list should be impossible. */ + if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET / + CONFIG_PHYSICAL_ALIGN) + return; + + slots[slot_max++] = addr; +} + +static unsigned long slots_fetch_random(void) +{ + /* Handle case of no slots stored. */ + if (slot_max == 0) + return 0; + + return slots[get_random_long() % slot_max]; +} + +static void process_e820_entry(struct e820entry *entry, + unsigned long minimum, + unsigned long image_size) +{ + struct mem_vector region, img; + + /* Skip non-RAM entries. */ + if (entry->type != E820_RAM) + return; + + /* Ignore entries entirely above our maximum. */ + if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET) + return; + + /* Ignore entries entirely below our minimum. */ + if (entry->addr + entry->size < minimum) + return; + + region.start = entry->addr; + region.size = entry->size; + + /* Potentially raise address to minimum location. */ + if (region.start < minimum) + region.start = minimum; + + /* Potentially raise address to meet alignment requirements. */ + region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); + + /* Did we raise the address above the bounds of this e820 region? */ + if (region.start > entry->addr + entry->size) + return; + + /* Reduce size by any delta from the original address. */ + region.size -= region.start - entry->addr; + + /* Reduce maximum size to fit end of image within maximum limit. */ + if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET) + region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start; + + /* Walk each aligned slot and check for avoided areas. */ + for (img.start = region.start, img.size = image_size ; + mem_contains(®ion, &img) ; + img.start += CONFIG_PHYSICAL_ALIGN) { + if (mem_avoid_overlap(&img)) + continue; + slots_append(img.start); + } +} + +static unsigned long find_random_addr(unsigned long minimum, + unsigned long size) +{ + int i; + unsigned long addr; + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + + /* Verify potential e820 positions, appending to slots list. */ + for (i = 0; i < real_mode->e820_entries; i++) { + process_e820_entry(&real_mode->e820_map[i], minimum, size); + } + + return slots_fetch_random(); +} + unsigned char *choose_kernel_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) { unsigned long choice = (unsigned long)output; + unsigned long random; if (cmdline_find_option_bool("nokaslr")) { debug_putstr("KASLR disabled...\n"); goto out; } - /* XXX: choose random location. */ + /* Record the various known unsafe memory ranges. */ + mem_avoid_init((unsigned long)input, input_size, + (unsigned long)output, output_size); + + /* Walk e820 and find a random address. */ + random = find_random_addr(choice, output_size); + if (!random) { + debug_putstr("KASLR could not find suitable E820 region...\n"); + goto out; + } + + /* Always enforce the minimum. */ + if (random < choice) + goto out; + choice = random; out: return (unsigned char *)choice; } diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 7138768..196eaf3 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -112,14 +112,8 @@ struct boot_params *real_mode; /* Pointer to real-mode data */ void *memset(void *s, int c, size_t n); void *memcpy(void *dest, const void *src, size_t n); -#ifdef CONFIG_X86_64 -#define memptr long -#else -#define memptr unsigned -#endif - -static memptr free_mem_ptr; -static memptr free_mem_end_ptr; +memptr free_mem_ptr; +memptr free_mem_end_ptr; static char *vidmem; static int vidport; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 0782eb0..24e3e56 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -23,7 +23,15 @@ #define BOOT_BOOT_H #include "../ctype.h" +#ifdef CONFIG_X86_64 +#define memptr long +#else +#define memptr unsigned +#endif + /* misc.c */ +extern memptr free_mem_ptr; +extern memptr free_mem_end_ptr; extern struct boot_params *real_mode; /* Pointer to real-mode data */ void __putstr(const char *s); #define error_putstr(__x) __putstr(__x) -- cgit v1.1 From f32360ef6608434a032dc7ad262d45e9693c27f3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 10 Oct 2013 17:18:17 -0700 Subject: x86, kaslr: Report kernel offset on panic When the system panics, include the kernel offset in the report to assist in debugging. Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/1381450698-28710-6-git-send-email-keescook@chromium.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f0de629..1708862 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -824,6 +824,20 @@ static void __init trim_low_memory_range(void) } /* + * Dump out kernel offset information on panic. + */ +static int +dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) +{ + pr_emerg("Kernel Offset: 0x%lx from 0x%lx " + "(relocation range: 0x%lx-0x%lx)\n", + (unsigned long)&_text - __START_KERNEL, __START_KERNEL, + __START_KERNEL_map, MODULES_VADDR-1); + + return 0; +} + +/* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures * for initialization. Note, the efi init code path is determined by the @@ -1242,3 +1256,15 @@ void __init i386_reserve_resources(void) } #endif /* CONFIG_X86_32 */ + +static struct notifier_block kernel_offset_notifier = { + .notifier_call = dump_kernel_offset +}; + +static int __init register_kernel_offset_dumper(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &kernel_offset_notifier); + return 0; +} +__initcall(register_kernel_offset_dumper); -- cgit v1.1 From 6145cfe394a7f138f6b64491c5663f97dba12450 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 10 Oct 2013 17:18:18 -0700 Subject: x86, kaslr: Raise the maximum virtual address to -1 GiB on x86_64 On 64-bit, this raises the maximum location to -1 GiB (from -1.5 GiB), the upper limit currently, since the kernel fixmap page mappings need to be moved to use the other 1 GiB (which would be the theoretical limit when building with -mcmodel=kernel). Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/1381450698-28710-7-git-send-email-keescook@chromium.org Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 16 +++++++++++++--- arch/x86/include/asm/page_64_types.h | 15 ++++++++++++--- arch/x86/include/asm/pgtable_64_types.h | 2 +- arch/x86/mm/init_32.c | 3 +++ 4 files changed, 29 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 992701d..51f4399 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1746,13 +1746,23 @@ config RANDOMIZE_BASE config RANDOMIZE_BASE_MAX_OFFSET hex "Maximum ASLR offset allowed" depends on RANDOMIZE_BASE - default "0x10000000" - range 0x0 0x10000000 + range 0x0 0x20000000 if X86_32 + default "0x20000000" if X86_32 + range 0x0 0x40000000 if X86_64 + default "0x40000000" if X86_64 ---help--- Determines the maximal offset in bytes that will be applied to the kernel when Address Space Layout Randomization (ASLR) is active. Must be less than or equal to the actual physical memory on the - system. This must be a power of two. + system. This must be a multiple of CONFIG_PHYSICAL_ALIGN. + + On 32-bit this is limited to 512MiB. + + On 64-bit this is limited by how the kernel fixmap page table is + positioned, so this cannot be larger that 1GiB currently. Normally + there is a 512MiB to 1.5GiB split between kernel and modules. When + this is raised above the 512MiB default, the modules area will + shrink to compensate, up to the current maximum 1GiB to 1GiB split. # Relocation on x86 needs some additional build support config X86_NEED_RELOCS diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 43dcd80..8de6d9c 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -39,9 +39,18 @@ #define __VIRTUAL_MASK_SHIFT 47 /* - * Kernel image size is limited to 512 MB (see level2_kernel_pgt in - * arch/x86/kernel/head_64.S), and it is mapped here: + * Kernel image size is limited to 1GiB due to the fixmap living in the + * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use + * 512MiB by default, leaving 1.5GiB for modules once the page tables + * are fully set up. If kernel ASLR is configured, it can extend the + * kernel page table mapping, reducing the size of the modules area. */ -#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) +#define KERNEL_IMAGE_SIZE_DEFAULT (512 * 1024 * 1024) +#if defined(CONFIG_RANDOMIZE_BASE) && \ + CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT +#define KERNEL_IMAGE_SIZE CONFIG_RANDOMIZE_BASE_MAX_OFFSET +#else +#define KERNEL_IMAGE_SIZE KERNEL_IMAGE_SIZE_DEFAULT +#endif #endif /* _ASM_X86_PAGE_64_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 2d88344..c883bf7 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -58,7 +58,7 @@ typedef struct { pteval_t pte; } pte_t; #define VMALLOC_START _AC(0xffffc90000000000, UL) #define VMALLOC_END _AC(0xffffe8ffffffffff, UL) #define VMEMMAP_START _AC(0xffffea0000000000, UL) -#define MODULES_VADDR _AC(0xffffffffa0000000, UL) +#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 4287f1f..5bdc543 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -806,6 +806,9 @@ void __init mem_init(void) BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); #undef high_memory #undef __FIXADDR_TOP +#ifdef CONFIG_RANDOMIZE_BASE + BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE); +#endif #ifdef CONFIG_HIGHMEM BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); -- cgit v1.1 From 6e6a4932b0f569b1a5bb4fcbf5dde1b1a42f01bb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 13 Oct 2013 04:08:56 -0700 Subject: x86, boot: Rename get_flags() and check_flags() to *_cpuflags() When a function is used in more than one file it may not be possible to immediately tell from context what the intended meaning is. As such, it is more important that the naming be self-evident. Thus, change get_flags() to get_cpuflags(). For consistency, change check_flags() to check_cpuflags() even though it is only used in cpucheck.c. Link: http://lkml.kernel.org/r/1381450698-28710-2-git-send-email-keescook@chromium.org Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/cpuflags.c | 2 +- arch/x86/boot/cpucheck.c | 14 +++++++------- arch/x86/boot/cpuflags.c | 2 +- arch/x86/boot/cpuflags.h | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c index 931cba6..aa31346 100644 --- a/arch/x86/boot/compressed/cpuflags.c +++ b/arch/x86/boot/compressed/cpuflags.c @@ -4,7 +4,7 @@ bool has_cpuflag(int flag) { - get_flags(); + get_cpuflags(); return test_bit(flag, cpu.flags); } diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index e1f3c16..100a9a1 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -68,7 +68,7 @@ static int is_transmeta(void) } /* Returns a bitmask of which words we have error bits in */ -static int check_flags(void) +static int check_cpuflags(void) { u32 err; int i; @@ -101,8 +101,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) if (has_eflag(X86_EFLAGS_AC)) cpu.level = 4; - get_flags(); - err = check_flags(); + get_cpuflags(); + err = check_cpuflags(); if (test_bit(X86_FEATURE_LM, cpu.flags)) cpu.level = 64; @@ -121,8 +121,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) eax &= ~(1 << 15); asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); - get_flags(); /* Make sure it really did something */ - err = check_flags(); + get_cpuflags(); /* Make sure it really did something */ + err = check_cpuflags(); } else if (err == 0x01 && !(err_flags[0] & ~(1 << X86_FEATURE_CX8)) && is_centaur() && cpu.model >= 6) { @@ -137,7 +137,7 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); set_bit(X86_FEATURE_CX8, cpu.flags); - err = check_flags(); + err = check_cpuflags(); } else if (err == 0x01 && is_transmeta()) { /* Transmeta might have masked feature bits in word 0 */ @@ -152,7 +152,7 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) : : "ecx", "ebx"); asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); - err = check_flags(); + err = check_cpuflags(); } if (err_flags_ptr) diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index b02544a..a9fcb7c 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -65,7 +65,7 @@ static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d) ); } -void get_flags(void) +void get_cpuflags(void) { u32 max_intel_level, max_amd_level; u32 tfms; diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index 9bb4e25..ea97697 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -14,6 +14,6 @@ extern struct cpu_features cpu; extern u32 cpu_vendor[3]; int has_eflag(unsigned long mask); -void get_flags(void); +void get_cpuflags(void); #endif -- cgit v1.1 From aec58bafaf89279522c44ec8ca9211eabb2b6976 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 15 Oct 2013 23:43:14 -0700 Subject: x86/relocs: Add percpu fixup for GNU ld 2.23 The GNU linker tries to put __per_cpu_load into the percpu area, resulting in a lack of its relocation. Force this symbol to be relocated. Seen starting with GNU ld 2.23 and later. Reported-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Michael Davidson Cc: Cong Ding Link: http://lkml.kernel.org/r/20131016064314.GA2739@www.outflux.net Signed-off-by: Ingo Molnar --- arch/x86/tools/relocs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 71a2533..11f9285 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -729,6 +729,7 @@ static void percpu_init(void) * * The GNU linker incorrectly associates: * __init_begin + * __per_cpu_load * * The "gold" linker incorrectly associates: * init_per_cpu__irq_stack_union @@ -738,6 +739,7 @@ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) { return (sym->st_shndx == per_cpu_shndx) && strcmp(symname, "__init_begin") && + strcmp(symname, "__per_cpu_load") && strncmp(symname, "init_per_cpu_", 13); } -- cgit v1.1 From f4fccac05f7f6bacb8e481a84d175e85ffcf9fe2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:24:59 +0100 Subject: x86/efi: Simplify EFI_DEBUG ... and lose one #ifdef .. #endif sandwich. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1d3372a..f396163 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -51,7 +51,7 @@ #include #include -#define EFI_DEBUG 1 +#define EFI_DEBUG #define EFI_MIN_RESERVE 5120 @@ -398,9 +398,9 @@ int __init efi_memblock_x86_reserve_range(void) return 0; } -#if EFI_DEBUG static void __init print_efi_memmap(void) { +#ifdef EFI_DEBUG efi_memory_desc_t *md; void *p; int i; @@ -415,8 +415,8 @@ static void __init print_efi_memmap(void) md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), (md->num_pages >> (20 - EFI_PAGE_SHIFT))); } -} #endif /* EFI_DEBUG */ +} void __init efi_reserve_boot_services(void) { @@ -696,10 +696,7 @@ void __init efi_init(void) x86_platform.set_wallclock = efi_set_rtc_mmss; } #endif - -#if EFI_DEBUG print_efi_memmap(); -#endif } void __init efi_late_init(void) -- cgit v1.1 From 0fd64c23fdf556e9e68580cff03b3505797bbf53 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:00 +0100 Subject: x86/mm/pageattr: Lookup address in an arbitrary PGD This is preparatory work in order to be able to map pages into a specified PGD and not implicitly and only into init_mm. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index bb32480..c53de62 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -30,6 +30,7 @@ */ struct cpa_data { unsigned long *vaddr; + pgd_t *pgd; pgprot_t mask_set; pgprot_t mask_clr; int numpages; @@ -322,17 +323,9 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, return prot; } -/* - * Lookup the page table entry for a virtual address. Return a pointer - * to the entry and the level of the mapping. - * - * Note: We return pud and pmd either when the entry is marked large - * or when the present bit is not set. Otherwise we would return a - * pointer to a nonexisting mapping. - */ -pte_t *lookup_address(unsigned long address, unsigned int *level) +static pte_t *__lookup_address_in_pgd(pgd_t *pgd, unsigned long address, + unsigned int *level) { - pgd_t *pgd = pgd_offset_k(address); pud_t *pud; pmd_t *pmd; @@ -361,8 +354,31 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) return pte_offset_kernel(pmd, address); } + +/* + * Lookup the page table entry for a virtual address. Return a pointer + * to the entry and the level of the mapping. + * + * Note: We return pud and pmd either when the entry is marked large + * or when the present bit is not set. Otherwise we would return a + * pointer to a nonexisting mapping. + */ +pte_t *lookup_address(unsigned long address, unsigned int *level) +{ + return __lookup_address_in_pgd(pgd_offset_k(address), address, level); +} EXPORT_SYMBOL_GPL(lookup_address); +static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, + unsigned int *level) +{ + if (cpa->pgd) + return __lookup_address_in_pgd(cpa->pgd + pgd_index(address), + address, level); + + return lookup_address(address, level); +} + /* * This is necessary because __pa() does not work on some * kinds of memory, like vmalloc() or the alloc_remap() -- cgit v1.1 From f3f729661e8db476ac427a97de015307aebb7404 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:01 +0100 Subject: x86/mm/pageattr: Add a PGD pagetable populating function This allocates, if necessary, and populates the corresponding PGD entry with a PUD page. The next population level is a dummy macro which will be removed by the next patch and it is added here to keep the patch small and easily reviewable but not break bisection, at the same time. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index c53de62..4b47ae0 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -666,6 +666,45 @@ static int split_large_page(pte_t *kpte, unsigned long address) return 0; } +#define populate_pud(cpa, addr, pgd, pgprot) (-1) + +/* + * Restrictions for kernel page table do not necessarily apply when mapping in + * an alternate PGD. + */ +static int populate_pgd(struct cpa_data *cpa, unsigned long addr) +{ + pgprot_t pgprot = __pgprot(_KERNPG_TABLE); + bool allocd_pgd = false; + pgd_t *pgd_entry; + pud_t *pud = NULL; /* shut up gcc */ + int ret; + + pgd_entry = cpa->pgd + pgd_index(addr); + + /* + * Allocate a PUD page and hand it down for mapping. + */ + if (pgd_none(*pgd_entry)) { + pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + if (!pud) + return -1; + + set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); + allocd_pgd = true; + } + + pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); + + ret = populate_pud(cpa, addr, pgd_entry, pgprot); + if (ret < 0) + return ret; + + cpa->numpages = ret; + return 0; +} + static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { -- cgit v1.1 From 4b23538d88c87d9c693ad87c8c808e92a505a6e6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:02 +0100 Subject: x86/mm/pageattr: Add a PUD pagetable populating function Add the next level of the pagetable populating function, we handle chunks around a 1G boundary by mapping them with the lower level functions - otherwise we use 1G pages for the mappings, thus using as less amount of pagetable pages as possible. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4b47ae0..81deca7 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -666,7 +666,92 @@ static int split_large_page(pte_t *kpte, unsigned long address) return 0; } -#define populate_pud(cpa, addr, pgd, pgprot) (-1) +static int alloc_pmd_page(pud_t *pud) +{ + pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + if (!pmd) + return -1; + + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + return 0; +} + +#define populate_pmd(cpa, start, end, pages, pud, pgprot) (-1) + +static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, + pgprot_t pgprot) +{ + pud_t *pud; + unsigned long end; + int cur_pages = 0; + + end = start + (cpa->numpages << PAGE_SHIFT); + + /* + * Not on a Gb page boundary? => map everything up to it with + * smaller pages. + */ + if (start & (PUD_SIZE - 1)) { + unsigned long pre_end; + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + + pre_end = min_t(unsigned long, end, next_page); + cur_pages = (pre_end - start) >> PAGE_SHIFT; + cur_pages = min_t(int, (int)cpa->numpages, cur_pages); + + pud = pud_offset(pgd, start); + + /* + * Need a PMD page? + */ + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, + pud, pgprot); + if (cur_pages < 0) + return cur_pages; + + start = pre_end; + } + + /* We mapped them all? */ + if (cpa->numpages == cur_pages) + return cur_pages; + + pud = pud_offset(pgd, start); + + /* + * Map everything starting from the Gb boundary, possibly with 1G pages + */ + while (end - start >= PUD_SIZE) { + set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + + start += PUD_SIZE; + cpa->pfn += PUD_SIZE; + cur_pages += PUD_SIZE >> PAGE_SHIFT; + pud++; + } + + /* Map trailing leftover */ + if (start < end) { + int tmp; + + pud = pud_offset(pgd, start); + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, + pud, pgprot); + if (tmp < 0) + return cur_pages; + + cur_pages += tmp; + } + return cur_pages; +} /* * Restrictions for kernel page table do not necessarily apply when mapping in -- cgit v1.1 From f900a4b8ab0f462d89a9fcb6173cac1403415b16 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:03 +0100 Subject: x86/mm/pageattr: Add a PMD pagetable populating function Handle PMD-level mappings the same as PUD ones. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 81deca7..968398b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -666,6 +666,16 @@ static int split_large_page(pte_t *kpte, unsigned long address) return 0; } +static int alloc_pte_page(pmd_t *pmd) +{ + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + if (!pte) + return -1; + + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); + return 0; +} + static int alloc_pmd_page(pud_t *pud) { pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); @@ -676,7 +686,77 @@ static int alloc_pmd_page(pud_t *pud) return 0; } -#define populate_pmd(cpa, start, end, pages, pud, pgprot) (-1) +#define populate_pte(cpa, start, end, pages, pmd, pgprot) do {} while (0) + +static int populate_pmd(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pud_t *pud, pgprot_t pgprot) +{ + unsigned int cur_pages = 0; + pmd_t *pmd; + + /* + * Not on a 2M boundary? + */ + if (start & (PMD_SIZE - 1)) { + unsigned long pre_end = start + (num_pages << PAGE_SHIFT); + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + + pre_end = min_t(unsigned long, pre_end, next_page); + cur_pages = (pre_end - start) >> PAGE_SHIFT; + cur_pages = min_t(unsigned int, num_pages, cur_pages); + + /* + * Need a PTE page? + */ + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) + if (alloc_pte_page(pmd)) + return -1; + + populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); + + start = pre_end; + } + + /* + * We mapped them all? + */ + if (num_pages == cur_pages) + return cur_pages; + + while (end - start >= PMD_SIZE) { + + /* + * We cannot use a 1G page so allocate a PMD page if needed. + */ + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + pmd = pmd_offset(pud, start); + + set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + + start += PMD_SIZE; + cpa->pfn += PMD_SIZE; + cur_pages += PMD_SIZE >> PAGE_SHIFT; + } + + /* + * Map trailing 4K pages. + */ + if (start < end) { + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) + if (alloc_pte_page(pmd)) + return -1; + + populate_pte(cpa, start, end, num_pages - cur_pages, + pmd, pgprot); + } + return num_pages; +} static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, pgprot_t pgprot) -- cgit v1.1 From c6b6f363f7b24aa448994e3a65c4d5b3116acfcc Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:04 +0100 Subject: x86/mm/pageattr: Add a PTE pagetable populating function Handle last level by unconditionally writing the PTEs into the PTE page while paying attention to the NX bit. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 968398b..2a1308a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -686,7 +686,27 @@ static int alloc_pmd_page(pud_t *pud) return 0; } -#define populate_pte(cpa, start, end, pages, pmd, pgprot) do {} while (0) +static void populate_pte(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, start); + + while (num_pages-- && start < end) { + + /* deal with the NX bit */ + if (!(pgprot_val(pgprot) & _PAGE_NX)) + cpa->pfn &= ~_PAGE_NX; + + set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot)); + + start += PAGE_SIZE; + cpa->pfn += PAGE_SIZE; + pte++; + } +} static int populate_pmd(struct cpa_data *cpa, unsigned long start, unsigned long end, -- cgit v1.1 From 0bb8aeee7b73b21e09d3ea12f2120d974f70b669 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:05 +0100 Subject: x86/mm/pageattr: Add a PUD error unwinding path In case we encounter an error during the mapping of a region, we want to unwind what we've established so far exactly the way we did the mapping. This is the PUD part kept deliberately small for easier review. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2a1308a..1cbdbbc 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -666,6 +666,51 @@ static int split_large_page(pte_t *kpte, unsigned long address) return 0; } +#define unmap_pmd_range(pud, start, pre_end) do {} while (0) + +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +{ + pud_t *pud = pud_offset(pgd, start); + + /* + * Not on a GB page boundary? + */ + if (start & (PUD_SIZE - 1)) { + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + + unmap_pmd_range(pud, start, pre_end); + + start = pre_end; + pud++; + } + + /* + * Try to unmap in 1G chunks? + */ + while (end - start >= PUD_SIZE) { + + if (pud_large(*pud)) + pud_clear(pud); + else + unmap_pmd_range(pud, start, start + PUD_SIZE); + + start += PUD_SIZE; + pud++; + } + + /* + * 2M leftovers? + */ + if (start < end) + unmap_pmd_range(pud, start, end); + + /* + * No need to try to free the PUD page because we'll free it in + * populate_pgd's error path + */ +} + static int alloc_pte_page(pmd_t *pmd) { pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); @@ -883,9 +928,20 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); ret = populate_pud(cpa, addr, pgd_entry, pgprot); - if (ret < 0) - return ret; + if (ret < 0) { + unmap_pud_range(pgd_entry, addr, + addr + (cpa->numpages << PAGE_SHIFT)); + if (allocd_pgd) { + /* + * If I allocated this PUD page, I can just as well + * free it in this error path. + */ + pgd_clear(pgd_entry); + free_page((unsigned long)pud); + } + return ret; + } cpa->numpages = ret; return 0; } -- cgit v1.1 From 52a628fb454d13d944bb3c8a89a314cc3affa417 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:06 +0100 Subject: x86/mm/pageattr: Add last levels of error path We try to free the pagetable pages once we've unmapped our portion. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1cbdbbc..db8ace2 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -666,7 +666,99 @@ static int split_large_page(pte_t *kpte, unsigned long address) return 0; } -#define unmap_pmd_range(pud, start, pre_end) do {} while (0) +static bool try_to_free_pte_page(pte_t *pte) +{ + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) + if (!pte_none(pte[i])) + return false; + + free_page((unsigned long)pte); + return true; +} + +static bool try_to_free_pmd_page(pmd_t *pmd) +{ + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) + if (!pmd_none(pmd[i])) + return false; + + free_page((unsigned long)pmd); + return true; +} + +static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, start); + + while (start < end) { + set_pte(pte, __pte(0)); + + start += PAGE_SIZE; + pte++; + } + + if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { + pmd_clear(pmd); + return true; + } + return false; +} + +static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, + unsigned long start, unsigned long end) +{ + if (unmap_pte_range(pmd, start, end)) + if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); +} + +static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, start); + + /* + * Not on a 2MB page boundary? + */ + if (start & (PMD_SIZE - 1)) { + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + + __unmap_pmd_range(pud, pmd, start, pre_end); + + start = pre_end; + pmd++; + } + + /* + * Try to unmap in 2M chunks. + */ + while (end - start >= PMD_SIZE) { + if (pmd_large(*pmd)) + pmd_clear(pmd); + else + __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); + + start += PMD_SIZE; + pmd++; + } + + /* + * 4K leftovers? + */ + if (start < end) + return __unmap_pmd_range(pud, pmd, start, end); + + /* + * Try again to free the PMD page if haven't succeeded above. + */ + if (!pud_none(*pud)) + if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); +} static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) { -- cgit v1.1 From 82f0712ca0f947170e785300b5c39d9c25e2f6ff Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:07 +0100 Subject: x86/mm/cpa: Map in an arbitrary pgd Add the ability to map pages in an arbitrary pgd. This wires in the remaining stuff so that there's a new interface with which you can map a region into an arbitrary PGD. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 53 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index db8ace2..b3b19f4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -453,7 +453,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * Check for races, another CPU might have split this page * up already: */ - tmp = lookup_address(address, &level); + tmp = _lookup_address_cpa(cpa, address, &level); if (tmp != kpte) goto out_unlock; @@ -559,7 +559,8 @@ out_unlock: } static int -__split_large_page(pte_t *kpte, unsigned long address, struct page *base) +__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, + struct page *base) { pte_t *pbase = (pte_t *)page_address(base); unsigned long pfn, pfninc = 1; @@ -572,7 +573,7 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base) * Check for races, another CPU might have split this page * up for us already: */ - tmp = lookup_address(address, &level); + tmp = _lookup_address_cpa(cpa, address, &level); if (tmp != kpte) { spin_unlock(&pgd_lock); return 1; @@ -648,7 +649,8 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base) return 0; } -static int split_large_page(pte_t *kpte, unsigned long address) +static int split_large_page(struct cpa_data *cpa, pte_t *kpte, + unsigned long address) { struct page *base; @@ -660,7 +662,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) if (!base) return -ENOMEM; - if (__split_large_page(kpte, address, base)) + if (__split_large_page(cpa, kpte, address, base)) __free_page(base); return 0; @@ -1041,6 +1043,9 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { + if (cpa->pgd) + return populate_pgd(cpa, vaddr); + /* * Ignore all non primary paths. */ @@ -1085,7 +1090,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) else address = *cpa->vaddr; repeat: - kpte = lookup_address(address, &level); + kpte = _lookup_address_cpa(cpa, address, &level); if (!kpte) return __cpa_process_fault(cpa, address, primary); @@ -1149,7 +1154,7 @@ repeat: /* * We have to split the large page: */ - err = split_large_page(kpte, address); + err = split_large_page(cpa, kpte, address); if (!err) { /* * Do a global flush tlb after splitting the large page @@ -1298,6 +1303,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, int ret, cache, checkalias; unsigned long baddr = 0; + memset(&cpa, 0, sizeof(cpa)); + /* * Check, if we are requested to change a not supported * feature: @@ -1744,6 +1751,7 @@ static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, + .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), .mask_clr = __pgprot(0), @@ -1762,6 +1770,7 @@ static int __set_pages_np(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, + .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), @@ -1822,6 +1831,36 @@ bool kernel_page_present(struct page *page) #endif /* CONFIG_DEBUG_PAGEALLOC */ +int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, + unsigned numpages, unsigned long page_flags) +{ + int retval = -EINVAL; + + struct cpa_data cpa = { + .vaddr = &address, + .pfn = pfn, + .pgd = pgd, + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(0), + .flags = 0, + }; + + if (!(__supported_pte_mask & _PAGE_NX)) + goto out; + + if (!(page_flags & _PAGE_NX)) + cpa.mask_clr = __pgprot(_PAGE_NX); + + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); + + retval = __change_page_attr_set_clr(&cpa, 0); + __flush_tlb_all(); + +out: + return retval; +} + /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. -- cgit v1.1 From d2f7cbe7b26a74dbbbf8f325b2a6fd01bc34032c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:08 +0100 Subject: x86/efi: Runtime services virtual mapping We map the EFI regions needed for runtime services non-contiguously, with preserved alignment on virtual addresses starting from -4G down for a total max space of 64G. This way, we provide for stable runtime services addresses across kernels so that a kexec'd kernel can still use them. Thus, they're mapped in a separate pagetable so that we don't pollute the kernel namespace. Add an efi= kernel command line parameter for passing miscellaneous options and chicken bits from the command line. While at it, add a chicken bit called "efi=old_map" which can be used as a fallback to the old runtime services mapping method in case there's some b0rkage with a particular EFI implementation (haha, it is hard to hold up the sarcasm here...). Also, add the UEFI RT VA space to Documentation/x86/x86_64/mm.txt. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 64 ++++++++++++++------ arch/x86/include/asm/pgtable_types.h | 3 +- arch/x86/platform/efi/efi.c | 94 +++++++++++++++++++++--------- arch/x86/platform/efi/efi_32.c | 9 ++- arch/x86/platform/efi/efi_64.c | 109 +++++++++++++++++++++++++++++++++++ arch/x86/platform/efi/efi_stub_64.S | 54 +++++++++++++++++ 6 files changed, 286 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 65c6e6e..89a05b0 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -1,6 +1,24 @@ #ifndef _ASM_X86_EFI_H #define _ASM_X86_EFI_H +/* + * We map the EFI regions needed for runtime services non-contiguously, + * with preserved alignment on virtual addresses starting from -4G down + * for a total max space of 64G. This way, we provide for stable runtime + * services addresses across kernels so that a kexec'd kernel can still + * use them. + * + * This is the main reason why we're doing stable VA mappings for RT + * services. + * + * This flag is used in conjuction with a chicken bit called + * "efi=old_map" which can be used as a fallback to the old runtime + * services mapping method in case there's some b0rkage with a + * particular EFI implementation (haha, it is hard to hold up the + * sarcasm here...). + */ +#define EFI_OLD_MEMMAP EFI_ARCH_1 + #ifdef CONFIG_X86_32 #define EFI_LOADER_SIGNATURE "EL32" @@ -69,24 +87,31 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3), \ (u64)(a4), (u64)(a5), (u64)(a6)) +#define _efi_call_virtX(x, f, ...) \ +({ \ + efi_status_t __s; \ + \ + efi_sync_low_kernel_mappings(); \ + preempt_disable(); \ + __s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__); \ + preempt_enable(); \ + __s; \ +}) + #define efi_call_virt0(f) \ - efi_call0((efi.systab->runtime->f)) -#define efi_call_virt1(f, a1) \ - efi_call1((efi.systab->runtime->f), (u64)(a1)) -#define efi_call_virt2(f, a1, a2) \ - efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2)) -#define efi_call_virt3(f, a1, a2, a3) \ - efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3)) -#define efi_call_virt4(f, a1, a2, a3, a4) \ - efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3), (u64)(a4)) -#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ - efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3), (u64)(a4), (u64)(a5)) -#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ - efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) + _efi_call_virtX(0, f) +#define efi_call_virt1(f, a1) \ + _efi_call_virtX(1, f, (u64)(a1)) +#define efi_call_virt2(f, a1, a2) \ + _efi_call_virtX(2, f, (u64)(a1), (u64)(a2)) +#define efi_call_virt3(f, a1, a2, a3) \ + _efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3)) +#define efi_call_virt4(f, a1, a2, a3, a4) \ + _efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4)) +#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ + _efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5)) +#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ + _efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); @@ -95,12 +120,17 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, extern int add_efi_memmap; extern unsigned long x86_efi_facility; +extern struct efi_scratch efi_scratch; extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); extern void efi_unmap_memmap(void); extern void efi_memory_uc(u64 addr, unsigned long size); +extern void __init efi_map_region(efi_memory_desc_t *md); +extern void efi_sync_low_kernel_mappings(void); +extern void efi_setup_page_tables(void); +extern void __init old_map_region(efi_memory_desc_t *md); #ifdef CONFIG_EFI diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f4843e0..028e28b 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -379,7 +379,8 @@ static inline void update_page_count(int level, unsigned long pages) { } */ extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern phys_addr_t slow_virt_to_phys(void *__address); - +extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, + unsigned numpages, unsigned long page_flags); #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_DEFS_H */ diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f396163..b453069 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -12,6 +12,8 @@ * Bibo Mao * Chandramouli Narayanan * Huang Ying + * Copyright (C) 2013 SuSE Labs + * Borislav Petkov - runtime services VA mapping * * Copied from efi_32.c to eliminate the duplicated code between EFI * 32/64 support code. --ying 2007-10-26 @@ -745,21 +747,56 @@ void efi_memory_uc(u64 addr, unsigned long size) set_memory_uc(addr, npages); } +void __init old_map_region(efi_memory_desc_t *md) +{ + u64 start_pfn, end_pfn, end; + unsigned long size; + void *va; + + start_pfn = PFN_DOWN(md->phys_addr); + size = md->num_pages << PAGE_SHIFT; + end = md->phys_addr + size; + end_pfn = PFN_UP(end); + + if (pfn_range_is_mapped(start_pfn, end_pfn)) { + va = __va(md->phys_addr); + + if (!(md->attribute & EFI_MEMORY_WB)) + efi_memory_uc((u64)(unsigned long)va, size); + } else + va = efi_ioremap(md->phys_addr, size, + md->type, md->attribute); + + md->virt_addr = (u64) (unsigned long) va; + if (!va) + pr_err("ioremap of 0x%llX failed!\n", + (unsigned long long)md->phys_addr); +} + /* * This function will switch the EFI runtime services to virtual mode. - * Essentially, look through the EFI memmap and map every region that - * has the runtime attribute bit set in its memory descriptor and update - * that memory descriptor with the virtual address obtained from ioremap(). - * This enables the runtime services to be called without having to + * Essentially, we look through the EFI memmap and map every region that + * has the runtime attribute bit set in its memory descriptor into the + * ->trampoline_pgd page table using a top-down VA allocation scheme. + * + * The old method which used to update that memory descriptor with the + * virtual address obtained from ioremap() is still supported when the + * kernel is booted with efi=old_map on its command line. Same old + * method enabled the runtime services to be called without having to * thunk back into physical mode for every invocation. + * + * The new method does a pagetable switch in a preemption-safe manner + * so that we're in a different address space when calling a runtime + * function. For function arguments passing we do copy the PGDs of the + * kernel page table into ->trampoline_pgd prior to each call. */ void __init efi_enter_virtual_mode(void) { efi_memory_desc_t *md, *prev_md = NULL; - efi_status_t status; + void *p, *new_memmap = NULL; unsigned long size; - u64 end, systab, start_pfn, end_pfn; - void *p, *va, *new_memmap = NULL; + efi_status_t status; + u64 end, systab; int count = 0; efi.systab = NULL; @@ -768,7 +805,6 @@ void __init efi_enter_virtual_mode(void) * We don't do virtual mode, since we don't do runtime services, on * non-native EFI */ - if (!efi_is_native()) { efi_unmap_memmap(); return; @@ -799,6 +835,7 @@ void __init efi_enter_virtual_mode(void) continue; } prev_md = md; + } for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -808,33 +845,18 @@ void __init efi_enter_virtual_mode(void) md->type != EFI_BOOT_SERVICES_DATA) continue; + efi_map_region(md); + size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; - start_pfn = PFN_DOWN(md->phys_addr); - end_pfn = PFN_UP(end); - if (pfn_range_is_mapped(start_pfn, end_pfn)) { - va = __va(md->phys_addr); - - if (!(md->attribute & EFI_MEMORY_WB)) - efi_memory_uc((u64)(unsigned long)va, size); - } else - va = efi_ioremap(md->phys_addr, size, - md->type, md->attribute); - - md->virt_addr = (u64) (unsigned long) va; - - if (!va) { - pr_err("ioremap of 0x%llX failed!\n", - (unsigned long long)md->phys_addr); - continue; - } - systab = (u64) (unsigned long) efi_phys.systab; if (md->phys_addr <= systab && systab < end) { systab += md->virt_addr - md->phys_addr; + efi.systab = (efi_system_table_t *) (unsigned long) systab; } + new_memmap = krealloc(new_memmap, (count + 1) * memmap.desc_size, GFP_KERNEL); @@ -845,6 +867,9 @@ void __init efi_enter_virtual_mode(void) BUG_ON(!efi.systab); + efi_setup_page_tables(); + efi_sync_low_kernel_mappings(); + status = phys_efi_set_virtual_address_map( memmap.desc_size * count, memmap.desc_size, @@ -877,7 +902,8 @@ void __init efi_enter_virtual_mode(void) efi.query_variable_info = virt_efi_query_variable_info; efi.update_capsule = virt_efi_update_capsule; efi.query_capsule_caps = virt_efi_query_capsule_caps; - if (__supported_pte_mask & _PAGE_NX) + + if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX)) runtime_code_page_mkexec(); kfree(new_memmap); @@ -1007,3 +1033,15 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) return EFI_SUCCESS; } EXPORT_SYMBOL_GPL(efi_query_variable_store); + +static int __init parse_efi_cmdline(char *str) +{ + if (*str == '=') + str++; + + if (!strncmp(str, "old_map", 7)) + set_bit(EFI_OLD_MEMMAP, &x86_efi_facility); + + return 0; +} +early_param("efi", parse_efi_cmdline); diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 40e4469..e94557c 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -37,9 +37,16 @@ * claim EFI runtime service handler exclusively and to duplicate a memory in * low memory space say 0 - 3G. */ - static unsigned long efi_rt_eflags; +void efi_sync_low_kernel_mappings(void) {} +void efi_setup_page_tables(void) {} + +void __init efi_map_region(efi_memory_desc_t *md) +{ + old_map_region(md); +} + void efi_call_phys_prelog(void) { struct desc_ptr gdt_descr; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 39a0e7f..bf286c3 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -38,10 +38,28 @@ #include #include #include +#include static pgd_t *save_pgd __initdata; static unsigned long efi_flags __initdata; +/* + * We allocate runtime services regions bottom-up, starting from -4G, i.e. + * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. + */ +static u64 efi_va = -4 * (1UL << 30); +#define EFI_VA_END (-68 * (1UL << 30)) + +/* + * Scratch space used for switching the pagetable in the EFI stub + */ +struct efi_scratch { + u64 r15; + u64 prev_cr3; + pgd_t *efi_pgt; + bool use_pgd; +}; + static void __init early_code_mapping_set_exec(int executable) { efi_memory_desc_t *md; @@ -65,6 +83,9 @@ void __init efi_call_phys_prelog(void) int pgd; int n_pgds; + if (!efi_enabled(EFI_OLD_MEMMAP)) + return; + early_code_mapping_set_exec(1); local_irq_save(efi_flags); @@ -86,6 +107,10 @@ void __init efi_call_phys_epilog(void) */ int pgd; int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); + + if (!efi_enabled(EFI_OLD_MEMMAP)) + return; + for (pgd = 0; pgd < n_pgds; pgd++) set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]); kfree(save_pgd); @@ -94,6 +119,90 @@ void __init efi_call_phys_epilog(void) early_code_mapping_set_exec(0); } +/* + * Add low kernel mappings for passing arguments to EFI functions. + */ +void efi_sync_low_kernel_mappings(void) +{ + unsigned num_pgds; + pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); + + if (efi_enabled(EFI_OLD_MEMMAP)) + return; + + num_pgds = pgd_index(MODULES_END - 1) - pgd_index(PAGE_OFFSET); + + memcpy(pgd + pgd_index(PAGE_OFFSET), + init_mm.pgd + pgd_index(PAGE_OFFSET), + sizeof(pgd_t) * num_pgds); +} + +void efi_setup_page_tables(void) +{ + efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd; + + if (!efi_enabled(EFI_OLD_MEMMAP)) + efi_scratch.use_pgd = true; +} + +static void __init __map_region(efi_memory_desc_t *md, u64 va) +{ + pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); + unsigned long pf = 0, size; + u64 end; + + if (!(md->attribute & EFI_MEMORY_WB)) + pf |= _PAGE_PCD; + + size = md->num_pages << PAGE_SHIFT; + end = va + size; + + if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf)) + pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", + md->phys_addr, va); +} + +void __init efi_map_region(efi_memory_desc_t *md) +{ + unsigned long size = md->num_pages << PAGE_SHIFT; + u64 pa = md->phys_addr; + + if (efi_enabled(EFI_OLD_MEMMAP)) + return old_map_region(md); + + /* + * Make sure the 1:1 mappings are present as a catch-all for b0rked + * firmware which doesn't update all internal pointers after switching + * to virtual mode and would otherwise crap on us. + */ + __map_region(md, md->phys_addr); + + efi_va -= size; + + /* Is PA 2M-aligned? */ + if (!(pa & (PMD_SIZE - 1))) { + efi_va &= PMD_MASK; + } else { + u64 pa_offset = pa & (PMD_SIZE - 1); + u64 prev_va = efi_va; + + /* get us the same offset within this 2M page */ + efi_va = (efi_va & PMD_MASK) + pa_offset; + + if (efi_va > prev_va) + efi_va -= PMD_SIZE; + } + + if (efi_va < EFI_VA_END) { + pr_warn(FW_WARN "VA address range overflow!\n"); + return; + } + + /* Do the VA map */ + __map_region(md, efi_va); + md->virt_addr = efi_va; +} + void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, u32 type, u64 attribute) { diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 4c07cca..88073b1 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -34,10 +34,47 @@ mov %rsi, %cr0; \ mov (%rsp), %rsp + /* stolen from gcc */ + .macro FLUSH_TLB_ALL + movq %r15, efi_scratch(%rip) + movq %r14, efi_scratch+8(%rip) + movq %cr4, %r15 + movq %r15, %r14 + andb $0x7f, %r14b + movq %r14, %cr4 + movq %r15, %cr4 + movq efi_scratch+8(%rip), %r14 + movq efi_scratch(%rip), %r15 + .endm + + .macro SWITCH_PGT + cmpb $0, efi_scratch+24(%rip) + je 1f + movq %r15, efi_scratch(%rip) # r15 + # save previous CR3 + movq %cr3, %r15 + movq %r15, efi_scratch+8(%rip) # prev_cr3 + movq efi_scratch+16(%rip), %r15 # EFI pgt + movq %r15, %cr3 + 1: + .endm + + .macro RESTORE_PGT + cmpb $0, efi_scratch+24(%rip) + je 2f + movq efi_scratch+8(%rip), %r15 + movq %r15, %cr3 + movq efi_scratch(%rip), %r15 + FLUSH_TLB_ALL + 2: + .endm + ENTRY(efi_call0) SAVE_XMM subq $32, %rsp + SWITCH_PGT call *%rdi + RESTORE_PGT addq $32, %rsp RESTORE_XMM ret @@ -47,7 +84,9 @@ ENTRY(efi_call1) SAVE_XMM subq $32, %rsp mov %rsi, %rcx + SWITCH_PGT call *%rdi + RESTORE_PGT addq $32, %rsp RESTORE_XMM ret @@ -57,7 +96,9 @@ ENTRY(efi_call2) SAVE_XMM subq $32, %rsp mov %rsi, %rcx + SWITCH_PGT call *%rdi + RESTORE_PGT addq $32, %rsp RESTORE_XMM ret @@ -68,7 +109,9 @@ ENTRY(efi_call3) subq $32, %rsp mov %rcx, %r8 mov %rsi, %rcx + SWITCH_PGT call *%rdi + RESTORE_PGT addq $32, %rsp RESTORE_XMM ret @@ -80,7 +123,9 @@ ENTRY(efi_call4) mov %r8, %r9 mov %rcx, %r8 mov %rsi, %rcx + SWITCH_PGT call *%rdi + RESTORE_PGT addq $32, %rsp RESTORE_XMM ret @@ -93,7 +138,9 @@ ENTRY(efi_call5) mov %r8, %r9 mov %rcx, %r8 mov %rsi, %rcx + SWITCH_PGT call *%rdi + RESTORE_PGT addq $48, %rsp RESTORE_XMM ret @@ -109,8 +156,15 @@ ENTRY(efi_call6) mov %r8, %r9 mov %rcx, %r8 mov %rsi, %rcx + SWITCH_PGT call *%rdi + RESTORE_PGT addq $48, %rsp RESTORE_XMM ret ENDPROC(efi_call6) + + .data +ENTRY(efi_scratch) + .fill 3,8,0 + .byte 0 -- cgit v1.1 From ee41143027706d9f342dfe05487a00b20887fde7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:09 +0100 Subject: x86/efi: Check krealloc return value Check it just in case. We might just as well panic there because runtime won't be functioning anyway. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index b453069..3fac4de 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -860,6 +860,9 @@ void __init efi_enter_virtual_mode(void) new_memmap = krealloc(new_memmap, (count + 1) * memmap.desc_size, GFP_KERNEL); + if (!new_memmap) + goto err_out; + memcpy(new_memmap + (count * memmap.desc_size), md, memmap.desc_size); count++; @@ -914,6 +917,11 @@ void __init efi_enter_virtual_mode(void) EFI_VARIABLE_BOOTSERVICE_ACCESS | EFI_VARIABLE_RUNTIME_ACCESS, 0, NULL); + + return; + + err_out: + pr_err("Error reallocating memory, EFI runtime non-functional!\n"); } /* -- cgit v1.1 From a653f3563c51c7bb7de63d607bef09d3baddaeb8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 11 Nov 2013 14:28:39 -0800 Subject: x86, kaslr: Mix entropy sources together as needed Depending on availability, mix the RDRAND and RDTSC entropy together with XOR. Only when neither is available should the i8254 be used. Update the Kconfig documentation to reflect this. Additionally, since bits used for entropy is masked elsewhere, drop the needless masking in the get_random_long(). Similarly, use the entire TSC, not just the low 32 bits. Finally, to improve the starting entropy, do a simple hashing of a build-time versions string and the boot-time boot_params structure for some additional level of unpredictability. Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/20131111222839.GA28616@www.outflux.net Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 14 +++++--- arch/x86/boot/compressed/aslr.c | 73 +++++++++++++++++++++++++++++++---------- 2 files changed, 65 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 51f4399..596cd9e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1735,13 +1735,17 @@ config RANDOMIZE_BASE deters exploit attempts relying on knowledge of the location of kernel internals. - Entropy is generated using the RDRAND instruction if it - is supported. If not, then RDTSC is used, if supported. If - neither RDRAND nor RDTSC are supported, then no randomness - is introduced. + Entropy is generated using the RDRAND instruction if it is + supported. If RDTSC is supported, it is used as well. If + neither RDRAND nor RDTSC are supported, then randomness is + read from the i8254 timer. The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET, - and aligned according to PHYSICAL_ALIGN. + and aligned according to PHYSICAL_ALIGN. Since the kernel is + built using 2GiB addressing, and PHYSICAL_ALGIN must be at a + minimum of 2MiB, only 10 bits of entropy is theoretically + possible. At best, due to page table layouts, 64-bit can use + 9 bits of entropy and 32-bit uses 8 bits. config RANDOMIZE_BASE_MAX_OFFSET hex "Maximum ASLR offset allowed" diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 0595798..8746487 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -5,6 +5,17 @@ #include #include +#include +#include +#include +#include +#include +#include + +/* Simplified build-specific string for starting entropy. */ +static const char *build_str = UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; + #define I8254_PORT_CONTROL 0x43 #define I8254_PORT_COUNTER0 0x40 #define I8254_CMD_READBACK 0xC0 @@ -25,34 +36,62 @@ static inline u16 i8254(void) return timer; } +static unsigned long rotate_xor(unsigned long hash, const void *area, + size_t size) +{ + size_t i; + unsigned long *ptr = (unsigned long *)area; + + for (i = 0; i < size / sizeof(hash); i++) { + /* Rotate by odd number of bits and XOR. */ + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); + hash ^= ptr[i]; + } + + return hash; +} + +/* Attempt to create a simple but unpredictable starting entropy. */ +static unsigned long get_random_boot(void) +{ + unsigned long hash = 0; + + hash = rotate_xor(hash, build_str, sizeof(build_str)); + hash = rotate_xor(hash, real_mode, sizeof(*real_mode)); + + return hash; +} + static unsigned long get_random_long(void) { - unsigned long random; + unsigned long raw, random = get_random_boot(); + bool use_i8254 = true; + + debug_putstr("KASLR using"); if (has_cpuflag(X86_FEATURE_RDRAND)) { - debug_putstr("KASLR using RDRAND...\n"); - if (rdrand_long(&random)) - return random; + debug_putstr(" RDRAND"); + if (rdrand_long(&raw)) { + random ^= raw; + use_i8254 = false; + } } if (has_cpuflag(X86_FEATURE_TSC)) { - uint32_t raw; + debug_putstr(" RDTSC"); + rdtscll(raw); - debug_putstr("KASLR using RDTSC...\n"); - rdtscl(raw); + random ^= raw; + use_i8254 = false; + } - /* Only use the low bits of rdtsc. */ - random = raw & 0xffff; - } else { - debug_putstr("KASLR using i8254...\n"); - random = i8254(); + if (use_i8254) { + debug_putstr(" i8254"); + random ^= i8254(); } - /* Extend timer bits poorly... */ - random |= (random << 16); -#ifdef CONFIG_X86_64 - random |= (random << 32); -#endif + debug_putstr("...\n"); + return random; } -- cgit v1.1 From e8236c4d9338d52d0f2fcecc0b792ac0542e4ee9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 Nov 2013 22:45:20 -0800 Subject: x86, kaslr: Add a circular multiply for better bit diffusion If we don't have RDRAND (in which case nothing else *should* matter), most sources have a highly biased entropy distribution. Use a circular multiply to diffuse the entropic bits. A circular multiply is a good operation for this: it is cheap on standard hardware and because it is symmetric (unlike an ordinary multiply) it doesn't introduce its own bias. Cc: Kees Cook Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/20131111222839.GA28616@www.outflux.net --- arch/x86/boot/compressed/aslr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 8746487..38a07cc 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -64,6 +64,11 @@ static unsigned long get_random_boot(void) static unsigned long get_random_long(void) { +#ifdef CONFIG_X86_64 + const unsigned long mix_const = 0x5d6008cbf3848dd3UL; +#else + const unsigned long mix_const = 0x3f39e593UL; +#endif unsigned long raw, random = get_random_boot(); bool use_i8254 = true; @@ -90,6 +95,12 @@ static unsigned long get_random_long(void) random ^= i8254(); } + /* Circular multiply for better bit diffusion */ + asm("mul %3" + : "=a" (random), "=d" (raw) + : "a" (random), "rm" (mix_const)); + random += raw; + debug_putstr("...\n"); return random; -- cgit v1.1 From 327f7d72454aecdc7a4a1c847a291a3f224b730f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Nov 2013 08:56:07 -0800 Subject: x86, kaslr: Use char array to gain sizeof sanity The build_str needs to be char [] not char * for the sizeof() to report the string length. Reported-by: Mathias Krause Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/20131112165607.GA5921@www.outflux.net Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/aslr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 38a07cc..84be175 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -13,7 +13,7 @@ #include /* Simplified build-specific string for starting entropy. */ -static const char *build_str = UTS_RELEASE " (" LINUX_COMPILE_BY "@" +static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; #define I8254_PORT_CONTROL 0x43 -- cgit v1.1 From f4cb1cc18f364d761d5614eb6293cccc6647f259 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Sat, 16 Nov 2013 12:37:01 -0800 Subject: x86-64, copy_user: Remove zero byte check before copy user buffer. Operation of rep movsb instruction handles zero byte copy. As pointed out by Linus, there is no need to check zero size in kernel. Removing this redundant check saves a few cycles in copy user functions. Reported-by: Linus Torvalds Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1384634221-6006-1-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/lib/copy_user_64.S | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index a30ca15..ffe4eb9 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -236,8 +236,6 @@ ENDPROC(copy_user_generic_unrolled) ENTRY(copy_user_generic_string) CFI_STARTPROC ASM_STAC - andl %edx,%edx - jz 4f cmpl $8,%edx jb 2f /* less than 8 bytes, go to byte copy loop */ ALIGN_DESTINATION @@ -249,7 +247,7 @@ ENTRY(copy_user_generic_string) 2: movl %edx,%ecx 3: rep movsb -4: xorl %eax,%eax + xorl %eax,%eax ASM_CLAC ret @@ -279,12 +277,10 @@ ENDPROC(copy_user_generic_string) ENTRY(copy_user_enhanced_fast_string) CFI_STARTPROC ASM_STAC - andl %edx,%edx - jz 2f movl %edx,%ecx 1: rep movsb -2: xorl %eax,%eax + xorl %eax,%eax ASM_CLAC ret -- cgit v1.1 From 5305ca10e7f44333e46c1ce57fb87306cb437891 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 15 Nov 2013 14:14:00 -0800 Subject: x86/mm: Unify pte_to_pgoff() and pgoff_to_pte() helpers Use unified pte_bitop() helper to manipulate bits in pte/pgoff bitfields and convert pte_to_pgoff()/pgoff_to_pte() to inlines. Signed-off-by: Cyrill Gorcunov Signed-off-by: Andrew Morton Cc: Pavel Emelyanov Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable-2level.h | 100 ++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 3bf2dd0..0d193e2 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -55,6 +55,13 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +/* Bit manipulation helper on pte/pgoff entry */ +static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift, + unsigned long mask, unsigned int leftshift) +{ + return ((value >> rightshift) & mask) << leftshift; +} + #ifdef CONFIG_MEM_SOFT_DIRTY /* @@ -71,31 +78,34 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) #define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) -#define pte_to_pgoff(pte) \ - ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ - & ((1U << PTE_FILE_BITS1) - 1))) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ - & ((1U << PTE_FILE_BITS2) - 1)) \ - << (PTE_FILE_BITS1)) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ - & ((1U << PTE_FILE_BITS3) - 1)) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) - -#define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = \ - ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ - + ((((off) >> PTE_FILE_BITS1) \ - & ((1U << PTE_FILE_BITS2) - 1)) \ - << PTE_FILE_SHIFT2) \ - + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - & ((1U << PTE_FILE_BITS3) - 1)) \ - << PTE_FILE_SHIFT3) \ - + ((((off) >> \ - (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ - << PTE_FILE_SHIFT4) \ - + _PAGE_FILE }) +#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) +#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) +#define PTE_FILE_MASK3 ((1U << PTE_FILE_BITS3) - 1) + +#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) +#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) +#define PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) + +static __always_inline pgoff_t pte_to_pgoff(pte_t pte) +{ + return (pgoff_t) + (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + + pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + + pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) + + pte_bitop(pte.pte_low, PTE_FILE_SHIFT4, -1UL, PTE_FILE_LSHIFT4)); +} + +static __always_inline pte_t pgoff_to_pte(pgoff_t off) +{ + return (pte_t){ + .pte_low = + pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + + pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + + pte_bitop(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) + + pte_bitop(off, PTE_FILE_LSHIFT4, -1UL, PTE_FILE_SHIFT4) + + _PAGE_FILE, + }; +} #else /* CONFIG_MEM_SOFT_DIRTY */ @@ -115,22 +125,30 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) -#define pte_to_pgoff(pte) \ - ((((pte).pte_low >> PTE_FILE_SHIFT1) \ - & ((1U << PTE_FILE_BITS1) - 1)) \ - + ((((pte).pte_low >> PTE_FILE_SHIFT2) \ - & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \ - + (((pte).pte_low >> PTE_FILE_SHIFT3) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2))) - -#define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = \ - (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ - + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \ - << PTE_FILE_SHIFT2) \ - + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - << PTE_FILE_SHIFT3) \ - + _PAGE_FILE }) +#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) +#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) + +#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) +#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) + +static __always_inline pgoff_t pte_to_pgoff(pte_t pte) +{ + return (pgoff_t) + (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + + pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + + pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, -1UL, PTE_FILE_LSHIFT3)); +} + +static __always_inline pte_t pgoff_to_pte(pgoff_t off) +{ + return (pte_t){ + .pte_low = + pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + + pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + + pte_bitop(off, PTE_FILE_LSHIFT3, -1UL, PTE_FILE_SHIFT3) + + _PAGE_FILE, + }; +} #endif /* CONFIG_MEM_SOFT_DIRTY */ -- cgit v1.1 From fd8526ad14c182605e42b64646344b95befd9f94 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 19 Nov 2013 15:17:50 +0200 Subject: x86/mm: Implement ASLR for hugetlb mappings Matthew noticed that hugetlb mappings don't participate in ASLR on x86-64: % for i in `seq 3`; do > tools/testing/selftests/vm/map_hugetlb | grep address > done Returned address is 0x2aaaaac00000 Returned address is 0x2aaaaac00000 Returned address is 0x2aaaaac00000 /proc/PID/maps entries for the mapping are always the same (except inode number): 2aaaaac00000-2aaabac00000 rw-p 00000000 00:0c 8200 /anon_hugepage (deleted) 2aaaaac00000-2aaabac00000 rw-p 00000000 00:0c 256 /anon_hugepage (deleted) 2aaaaac00000-2aaabac00000 rw-p 00000000 00:0c 7180 /anon_hugepage (deleted) The reason is the generic hugetlb_get_unmapped_area() function which is used on x86-64. It doesn't support randomization and use bottom-up unmapped area lookup, instead of usual top-down on x86-64. x86 has arch-specific hugetlb_get_unmapped_area(), but it's used only on x86-32. Let's use arch-specific hugetlb_get_unmapped_area() on x86-64 too. That adds ASLR and switches hugetlb mappings to use top-down unmapped area lookup: % for i in `seq 3`; do > tools/testing/selftests/vm/map_hugetlb | grep address > done Returned address is 0x7f4f08a00000 Returned address is 0x7fdda4200000 Returned address is 0x7febe0000000 /proc/PID/maps entries: 7f4f08a00000-7f4f18a00000 rw-p 00000000 00:0c 1168 /anon_hugepage (deleted) 7fdda4200000-7fddb4200000 rw-p 00000000 00:0c 7092 /anon_hugepage (deleted) 7febe0000000-7febf0000000 rw-p 00000000 00:0c 7183 /anon_hugepage (deleted) Unmapped area lookup policy for hugetlb mappings is consistent with normal mappings now -- the only difference is alignment requirements for huge pages. libhugetlbfs test-suite didn't detect any regressions with the patch applied (although it shows few failures on my machine regardless the patch). Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Cc: Matthew Wilcox Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Naoya Horiguchi Cc: Linus Torvalds Cc: Andrea Arcangeli Cc: Peter Zijlstra Cc: Mel Gorman Link: http://lkml.kernel.org/r/20131119131750.EA45CE0090@blue.fi.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page.h | 1 + arch/x86/include/asm/page_32.h | 4 ---- arch/x86/mm/hugetlbpage.c | 9 +++------ 3 files changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index c878924..775873d 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -71,6 +71,7 @@ extern bool __virt_addr_valid(unsigned long kaddr); #include #define __HAVE_ARCH_GATE_AREA 1 +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif /* __KERNEL__ */ #endif /* _ASM_X86_PAGE_H */ diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index 4d550d0..904f528 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -5,10 +5,6 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_HUGETLB_PAGE -#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA -#endif - #define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET) #ifdef CONFIG_DEBUG_VIRTUAL extern unsigned long __phys_addr(unsigned long); diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 9d980d8..8c9f647 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -87,9 +87,7 @@ int pmd_huge_support(void) } #endif -/* x86_64 also uses this file */ - -#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA +#ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -99,7 +97,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, info.flags = 0; info.length = len; - info.low_limit = TASK_UNMAPPED_BASE; + info.low_limit = current->mm->mmap_legacy_base; info.high_limit = TASK_SIZE; info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; @@ -172,8 +170,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return hugetlb_get_unmapped_area_topdown(file, addr, len, pgoff, flags); } - -#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ +#endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_X86_64 static __init int setup_hugepagesz(char *opt) -- cgit v1.1 From 661c80192d21269c7fc566f1d547510b0c867677 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 20 Nov 2013 12:50:51 -0800 Subject: x86-64, copy_user: Use leal to produce 32-bit results When we are using lea to produce a 32-bit result, we can use the leal form, rather than using leaq and worry about truncation elsewhere. Make the leal explicit, both to be more obvious and since that is what gcc generates and thus is less likely to trigger obscure gas bugs. Cc: Fenghua Yu Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1384634221-6006-1-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/lib/copy_user_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index ffe4eb9..dee945d 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -186,7 +186,7 @@ ENTRY(copy_user_generic_unrolled) 30: shll $6,%ecx addl %ecx,%edx jmp 60f -40: lea (%rdx,%rcx,8),%rdx +40: leal (%rdx,%rcx,8),%edx jmp 60f 50: movl %ecx,%edx 60: jmp copy_user_handle_tail /* ecx is zerorest also */ @@ -252,7 +252,7 @@ ENTRY(copy_user_generic_string) ret .section .fixup,"ax" -11: lea (%rdx,%rcx,8),%rcx +11: leal (%rdx,%rcx,8),%ecx 12: movl %ecx,%edx /* ecx is zerorest also */ jmp copy_user_handle_tail .previous -- cgit v1.1 From 4788e5b4b2338f85fa42a712a182d8afd65d7c58 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 12 Nov 2013 17:58:50 +0100 Subject: perf/x86: Add Intel RAPL PMU support This patch adds a new uncore PMU to expose the Intel RAPL energy consumption counters. Up to 3 counters, each counting a particular RAPL event are exposed. The RAPL counters are available on Intel SandyBridge, IvyBridge, Haswell. The server skus add a 3rd counter. The following events are available and exposed in sysfs: - power/energy-cores: power consumption of all cores on socket - power/energy-pkg: power consumption of all cores + LLc cache - power/energy-dram: power consumption of DRAM (servers only) For each event both the unit (Joules) and scale (2^-32 J) is exposed in sysfs for use by perf stat and other tools. The files are: /sys/devices/power/events/energy-*.unit /sys/devices/power/events/energy-*.scale The RAPL PMU is uncore by nature and is implemented such that it only works in system-wide mode. Measuring only one CPU per socket is sufficient. The /sys/devices/power/cpumask file can be used by tools to figure out which CPUs to monitor by default. For instance, on a 2-socket system, 2 CPUs (one on each socket) will be shown. All the counters measure in the same unit (exposed via sysfs). The perf_events API exposes all RAPL counters as 64-bit integers counting in unit of 1/2^32 Joules (about 0.23 nJ). User level tools must convert the counts by multiplying them by 2^-32 to obtain Joules. The reason for this is that the kernel avoids doing floating point math whenever possible because it is expensive (user floating-point state must be saved). The method used avoids kernel floating-point usage. There is no loss of precision. Thanks to PeterZ for suggesting this approach. To convert the raw count in Watt: W = C * 2.3 / (1e10 * time) or ldexp(C, -32). RAPL PMU is a new standalone PMU which registers with the perf_event core subsystem. The PMU type (attr->type) is dynamically allocated and is available from /sys/device/power/type. Sampling is not supported by the RAPL PMU. There is no privilege level filtering either. Signed-off-by: Stephane Eranian Reviewed-by: Maria Dimakopoulou Reviewed-by: Andi Kleen Signed-off-by: Peter Zijlstra Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: zheng.z.yan@intel.com Cc: bp@alien8.de Link: http://lkml.kernel.org/r/1384275531-10892-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 591 ++++++++++++++++++++++++++++ 2 files changed, 592 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/perf_event_intel_rapl.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 47b56a7..6359506 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -36,7 +36,7 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o endif diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c new file mode 100644 index 0000000..cfcd386 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -0,0 +1,591 @@ +/* + * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters + * Copyright (C) 2013 Google, Inc., Stephane Eranian + * + * Intel RAPL interface is specified in the IA-32 Manual Vol3b + * section 14.7.1 (September 2013) + * + * RAPL provides more controls than just reporting energy consumption + * however here we only expose the 3 energy consumption free running + * counters (pp0, pkg, dram). + * + * Each of those counters increments in a power unit defined by the + * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules + * but it can vary. + * + * Counter to rapl events mappings: + * + * pp0 counter: consumption of all physical cores (power plane 0) + * event: rapl_energy_cores + * perf code: 0x1 + * + * pkg counter: consumption of the whole processor package + * event: rapl_energy_pkg + * perf code: 0x2 + * + * dram counter: consumption of the dram domain (servers only) + * event: rapl_energy_dram + * perf code: 0x3 + * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * + * The events only support system-wide mode counting. There is no + * sampling support because it does not make sense and is not + * supported by the RAPL hardware. + * + * Because we want to avoid floating-point operations in the kernel, + * the events are all reported in fixed point arithmetic (32.32). + * Tools must adjust the counts to convert them to Watts using + * the duration of the measurement. Tools may use a function such as + * ldexp(raw_count, -32); + */ +#include +#include +#include +#include +#include "perf_event.h" + +/* + * RAPL energy status counters + */ +#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ +#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ +#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ +#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ +#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ +#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ + +/* Clients have PP0, PKG */ +#define RAPL_IDX_CLN (1<config + * any other bit is reserved + */ +#define RAPL_EVENT_MASK 0xFFULL + +#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __rapl_##_var##_show, NULL) + +#define RAPL_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \ + .config = _config, \ +} + +#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ + +struct rapl_pmu { + spinlock_t lock; + int hw_unit; /* 1/2^hw_unit Joule */ + int n_active; /* number of active events */ + struct list_head active_list; + struct pmu *pmu; /* pointer to rapl_pmu_class */ +}; + +static struct pmu rapl_pmu_class; +static cpumask_t rapl_cpu_mask; +static int rapl_cntr_mask; + +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); + +static inline u64 rapl_read_counter(struct perf_event *event) +{ + u64 raw; + rdmsrl(event->hw.event_base, raw); + return raw; +} + +static inline u64 rapl_scale(u64 v) +{ + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). + * Watts = Joules/Time delta + */ + return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit); +} + +static u64 rapl_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + s64 delta, sdelta; + int shift = RAPL_CNTR_WIDTH; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + rdmsrl(event->hw.event_base, new_raw_count); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) { + cpu_relax(); + goto again; + } + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + sdelta = rapl_scale(delta); + + local64_add(sdelta, &event->count); + + return new_raw_count; +} + +static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + struct perf_event *event) +{ + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + list_add_tail(&event->active_entry, &pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + + pmu->n_active++; +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + __rapl_pmu_event_start(pmu, event); + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { + WARN_ON_ONCE(pmu->n_active <= 0); + pmu->n_active--; + + list_del(&event->active_entry); + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + /* check if update of sw counter is necessary */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + rapl_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } + + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __rapl_pmu_event_start(pmu, event); + + spin_unlock_irqrestore(&pmu->lock, flags); + + return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ + rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int rapl_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int bit, msr, ret = 0; + + /* only look at RAPL events */ + if (event->attr.type != rapl_pmu_class.type) + return -ENOENT; + + /* check only supported bits are set */ + if (event->attr.config & ~RAPL_EVENT_MASK) + return -EINVAL; + + /* + * check event is known (determines counter) + */ + switch (cfg) { + case INTEL_RAPL_PP0: + bit = RAPL_IDX_PP0_NRG_STAT; + msr = MSR_PP0_ENERGY_STATUS; + break; + case INTEL_RAPL_PKG: + bit = RAPL_IDX_PKG_NRG_STAT; + msr = MSR_PKG_ENERGY_STATUS; + break; + case INTEL_RAPL_RAM: + bit = RAPL_IDX_RAM_NRG_STAT; + msr = MSR_DRAM_ENERGY_STATUS; + break; + default: + return -EINVAL; + } + /* check event supported */ + if (!(rapl_cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* must be done before validate_group */ + event->hw.event_base = msr; + event->hw.config = cfg; + event->hw.idx = bit; + + return ret; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ + rapl_event_update(event); +} + +static ssize_t rapl_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask); + + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); + +static struct attribute *rapl_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_attr_group = { + .attrs = rapl_pmu_attrs, +}; + +EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); +EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); + +EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); +EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); +EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); + +/* + * we compute in 0.23 nJ increments regardless of MSR + */ +EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); + +static struct attribute *rapl_events_srv_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute *rapl_events_cln_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + NULL, +}; + +static struct attribute_group rapl_pmu_events_group = { + .name = "events", + .attrs = NULL, /* patched at runtime */ +}; + +DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +static struct attribute *rapl_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_format_group = { + .name = "format", + .attrs = rapl_formats_attr, +}; + +const struct attribute_group *rapl_attr_groups[] = { + &rapl_pmu_attr_group, + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +}; + +static struct pmu rapl_pmu_class = { + .attr_groups = rapl_attr_groups, + .task_ctx_nr = perf_invalid_context, /* system-wide only */ + .event_init = rapl_pmu_event_init, + .add = rapl_pmu_event_add, /* must have */ + .del = rapl_pmu_event_del, /* must have */ + .start = rapl_pmu_event_start, + .stop = rapl_pmu_event_stop, + .read = rapl_pmu_event_read, +}; + +static void rapl_cpu_exit(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int i, phys_id = topology_physical_package_id(cpu); + int target = -1; + + /* find a new cpu on same package */ + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + /* + * clear cpu from cpumask + * if was set in cpumask and still some cpu on package, + * then move to new cpu + */ + if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &rapl_cpu_mask); + + WARN_ON(cpumask_empty(&rapl_cpu_mask)); + /* + * migrate events and context to new cpu + */ + if (target >= 0) + perf_pmu_migrate_context(pmu->pmu, cpu, target); +} + +static void rapl_cpu_init(int cpu) +{ + int i, phys_id = topology_physical_package_id(cpu); + + /* check if phys_is is already covered */ + for_each_cpu(i, &rapl_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + /* was not found, so add it */ + cpumask_set_cpu(cpu, &rapl_cpu_mask); +} + +static int rapl_cpu_prepare(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int phys_id = topology_physical_package_id(cpu); + + if (pmu) + return 0; + + if (phys_id < 0) + return -1; + + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -1; + + spin_lock_init(&pmu->lock); + + INIT_LIST_HEAD(&pmu->active_list); + + /* + * grab power unit as: 1/2^unit Joules + * + * we cache in local PMU instance + */ + rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit); + pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL; + pmu->pmu = &rapl_pmu_class; + + /* set RAPL pmu for this cpu for now */ + per_cpu(rapl_pmu, cpu) = pmu; + per_cpu(rapl_pmu_to_free, cpu) = NULL; + + return 0; +} + +static void rapl_cpu_kfree(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); + + kfree(pmu); + + per_cpu(rapl_pmu_to_free, cpu) = NULL; +} + +static int rapl_cpu_dying(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + + if (!pmu) + return 0; + + per_cpu(rapl_pmu, cpu) = NULL; + + per_cpu(rapl_pmu_to_free, cpu) = pmu; + + return 0; +} + +static int rapl_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + rapl_cpu_prepare(cpu); + break; + case CPU_STARTING: + rapl_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + rapl_cpu_dying(cpu); + break; + case CPU_ONLINE: + case CPU_DEAD: + rapl_cpu_kfree(cpu); + break; + case CPU_DOWN_PREPARE: + rapl_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static const struct x86_cpu_id rapl_cpu_match[] = { + [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, + [1] = {}, +}; + +static int __init rapl_pmu_init(void) +{ + struct rapl_pmu *pmu; + int cpu, ret; + + /* + * check for Intel processor family 6 + */ + if (!x86_match_cpu(rapl_cpu_match)) + return 0; + + /* check supported CPU */ + switch (boot_cpu_data.x86_model) { + case 42: /* Sandy Bridge */ + case 58: /* Ivy Bridge */ + case 60: /* Haswell */ + rapl_cntr_mask = RAPL_IDX_CLN; + rapl_pmu_events_group.attrs = rapl_events_cln_attr; + break; + case 45: /* Sandy Bridge-EP */ + case 62: /* IvyTown */ + rapl_cntr_mask = RAPL_IDX_SRV; + rapl_pmu_events_group.attrs = rapl_events_srv_attr; + break; + + default: + /* unsupported */ + return 0; + } + get_online_cpus(); + + for_each_online_cpu(cpu) { + rapl_cpu_prepare(cpu); + rapl_cpu_init(cpu); + } + + perf_cpu_notifier(rapl_cpu_notifier); + + ret = perf_pmu_register(&rapl_pmu_class, "power", -1); + if (WARN_ON(ret)) { + pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); + put_online_cpus(); + return -1; + } + + pmu = __get_cpu_var(rapl_pmu); + + pr_info("RAPL PMU detected, hw unit 2^-%d Joules," + " API unit is 2^-32 Joules," + " %d fixed counters\n", + pmu->hw_unit, + hweight32(rapl_cntr_mask)); + + put_online_cpus(); + + return 0; +} +device_initcall(rapl_pmu_init); -- cgit v1.1 From 65661f96d3b32f4b28fef26d21be81d7e173b965 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 12 Nov 2013 17:58:51 +0100 Subject: perf/x86: Add RAPL hrtimer support The RAPL PMU counters do not interrupt on overflow. Therefore, the kernel needs to poll the counters to avoid missing an overflow. This patch adds the hrtimer code to do this. The timer interval is calculated at boot time based on the power unit used by the HW. There is one hrtimer per-cpu to handle the case of multiple simultaneous use across cores on the same package + hotplug CPU. Thanks to Maria Dimakopoulou for her contributions to this patch especially on the math aspects. Signed-off-by: Stephane Eranian Reviewed-by: Maria Dimakopoulou Reviewed-by: Andi Kleen [ Applied 32-bit build fix. ] Signed-off-by: Peter Zijlstra Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: zheng.z.yan@intel.com Cc: bp@alien8.de Cc: maria.n.dimakopoulou@gmail.com Link: http://lkml.kernel.org/r/1384275531-10892-5-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 74 ++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index cfcd386..bf8e4a7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -96,6 +96,8 @@ struct rapl_pmu { int n_active; /* number of active events */ struct list_head active_list; struct pmu *pmu; /* pointer to rapl_pmu_class */ + ktime_t timer_interval; /* in ktime_t unit */ + struct hrtimer hrtimer; }; static struct pmu rapl_pmu_class; @@ -158,6 +160,48 @@ again: return new_raw_count; } +static void rapl_start_hrtimer(struct rapl_pmu *pmu) +{ + __hrtimer_start_range_ns(&pmu->hrtimer, + pmu->timer_interval, 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void rapl_stop_hrtimer(struct rapl_pmu *pmu) +{ + hrtimer_cancel(&pmu->hrtimer); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct perf_event *event; + unsigned long flags; + + if (!pmu->n_active) + return HRTIMER_NORESTART; + + spin_lock_irqsave(&pmu->lock, flags); + + list_for_each_entry(event, &pmu->active_list, active_entry) { + rapl_event_update(event); + } + + spin_unlock_irqrestore(&pmu->lock, flags); + + hrtimer_forward_now(hrtimer, pmu->timer_interval); + + return HRTIMER_RESTART; +} + +static void rapl_hrtimer_init(struct rapl_pmu *pmu) +{ + struct hrtimer *hr = &pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; +} + static void __rapl_pmu_event_start(struct rapl_pmu *pmu, struct perf_event *event) { @@ -171,6 +215,8 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, local64_set(&event->hw.prev_count, rapl_read_counter(event)); pmu->n_active++; + if (pmu->n_active == 1) + rapl_start_hrtimer(pmu); } static void rapl_pmu_event_start(struct perf_event *event, int mode) @@ -195,6 +241,8 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) if (!(hwc->state & PERF_HES_STOPPED)) { WARN_ON_ONCE(pmu->n_active <= 0); pmu->n_active--; + if (pmu->n_active == 0) + rapl_stop_hrtimer(pmu); list_del(&event->active_entry); @@ -423,6 +471,9 @@ static void rapl_cpu_exit(int cpu) */ if (target >= 0) perf_pmu_migrate_context(pmu->pmu, cpu, target); + + /* cancel overflow polling timer for CPU */ + rapl_stop_hrtimer(pmu); } static void rapl_cpu_init(int cpu) @@ -442,6 +493,7 @@ static int rapl_cpu_prepare(int cpu) { struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); int phys_id = topology_physical_package_id(cpu); + u64 ms; if (pmu) return 0; @@ -466,6 +518,22 @@ static int rapl_cpu_prepare(int cpu) pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL; pmu->pmu = &rapl_pmu_class; + /* + * use reference of 200W for scaling the timeout + * to avoid missing counter overflows. + * 200W = 200 Joules/sec + * divide interval by 2 to avoid lockstep (2 * 100) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + if (pmu->hw_unit < 32) + ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1)); + else + ms = 2; + + pmu->timer_interval = ms_to_ktime(ms); + + rapl_hrtimer_init(pmu); + /* set RAPL pmu for this cpu for now */ per_cpu(rapl_pmu, cpu) = pmu; per_cpu(rapl_pmu_to_free, cpu) = NULL; @@ -580,9 +648,11 @@ static int __init rapl_pmu_init(void) pr_info("RAPL PMU detected, hw unit 2^-%d Joules," " API unit is 2^-32 Joules," - " %d fixed counters\n", + " %d fixed counters" + " %llu ms ovfl timer\n", pmu->hw_unit, - hweight32(rapl_cntr_mask)); + hweight32(rapl_cntr_mask), + ktime_to_ms(pmu->timer_interval)); put_online_cpus(); -- cgit v1.1 From 853d9b18f1e861d37e9b271742329f8c1176eabe Mon Sep 17 00:00:00 2001 From: Levente Kurusa Date: Fri, 29 Nov 2013 21:28:48 +0100 Subject: x86, mce: Call put_device on device_register failure This patch adds a call to put_device() when the device_register() call has failed. This is required so that the last reference to the device is given up. Signed-off-by: Levente Kurusa Link: http://lkml.kernel.org/r/5298F900.9000208@linux.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b3218cd..a389c1d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2272,8 +2272,10 @@ static int mce_device_create(unsigned int cpu) dev->release = &mce_device_release; err = device_register(dev); - if (err) + if (err) { + put_device(dev); return err; + } for (i = 0; mce_device_attrs[i]; i++) { err = device_create_file(dev, mce_device_attrs[i]); -- cgit v1.1 From 191f57c137bcce0e3e9313acb77b2f114d15afbb Mon Sep 17 00:00:00 2001 From: Qiaowei Ren Date: Sat, 7 Dec 2013 08:20:57 +0800 Subject: x86, cpufeature: Define the Intel MPX feature flag Define the Intel MPX (Memory Protection Extensions) CPU feature flag in the cpufeature list. Signed-off-by: Qiaowei Ren Link: http://lkml.kernel.org/r/1386375658-2191-2-git-send-email-qiaowei.ren@intel.com Signed-off-by: Xudong Hao Signed-off-by: Liu Jinsong Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 89270b43..e099f95 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -216,6 +216,7 @@ #define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ #define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */ #define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */ -- cgit v1.1 From e7d820a5e549b3eb6c3f9467507566565646a669 Mon Sep 17 00:00:00 2001 From: Qiaowei Ren Date: Thu, 5 Dec 2013 17:15:34 +0800 Subject: x86, xsave: Support eager-only xsave features, add MPX support Some features, like Intel MPX, work only if the kernel uses eagerfpu model. So we should force eagerfpu on unless the user has explicitly disabled it. Add definitions for Intel MPX and add it to the supported list. [ hpa: renamed XSTATE_FLEXIBLE to XSTATE_LAZY and added comments ] Signed-off-by: Qiaowei Ren Link: http://lkml.kernel.org/r/9E0BE1322F2F2246BD820DA9FC397ADE014A6115@SHSMSX102.ccr.corp.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor.h | 23 +++++++++++++++++++++++ arch/x86/include/asm/xsave.h | 14 ++++++++++---- arch/x86/kernel/xsave.c | 10 ++++++++++ 3 files changed, 43 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7b034a4..b7845a1 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -370,6 +370,26 @@ struct ymmh_struct { u32 ymmh_space[64]; }; +struct lwp_struct { + u64 lwpcb_addr; + u32 flags; + u32 buf_head_offset; + u64 buf_base; + u32 buf_size; + u32 filters; + u64 saved_event_record[4]; + u32 event_counter[16]; +}; + +struct bndregs_struct { + u64 bndregs[8]; +} __packed; + +struct bndcsr_struct { + u64 cfg_reg_u; + u64 status_reg; +} __packed; + struct xsave_hdr_struct { u64 xstate_bv; u64 reserved1[2]; @@ -380,6 +400,9 @@ struct xsave_struct { struct i387_fxsave_struct i387; struct xsave_hdr_struct xsave_hdr; struct ymmh_struct ymmh; + struct lwp_struct lwp; + struct bndregs_struct bndregs; + struct bndcsr_struct bndcsr; /* new processor state extensions will go here */ } __attribute__ ((packed, aligned (64))); diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 0415cda..5547389 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -9,6 +9,8 @@ #define XSTATE_FP 0x1 #define XSTATE_SSE 0x2 #define XSTATE_YMM 0x4 +#define XSTATE_BNDREGS 0x8 +#define XSTATE_BNDCSR 0x10 #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) @@ -20,10 +22,14 @@ #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) -/* - * These are the features that the OS can handle currently. - */ -#define XCNTXT_MASK (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) +/* Supported features which support lazy state saving */ +#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) + +/* Supported features which require eager state saving */ +#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR) + +/* All currently supported features */ +#define XCNTXT_MASK (XSTATE_LAZY | XSTATE_EAGER) #ifdef CONFIG_X86_64 #define REX_PREFIX "0x48, " diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 422fd82..a4b451c 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -562,6 +562,16 @@ static void __init xstate_enable_boot_cpu(void) if (cpu_has_xsaveopt && eagerfpu != DISABLE) eagerfpu = ENABLE; + if (pcntxt_mask & XSTATE_EAGER) { + if (eagerfpu == DISABLE) { + pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n", + pcntxt_mask & XSTATE_EAGER); + pcntxt_mask &= ~XSTATE_EAGER; + } else { + eagerfpu = ENABLE; + } + } + pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", pcntxt_mask, xstate_size); } -- cgit v1.1 From d8af4ce490e92adfa123ae79899332bd829903e7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Dec 2013 14:46:42 +0100 Subject: x86/traps: Clean up error exception handler definitions So I was reading the exception handler generation code and got a real headache looking at the unstructured mess that our DO_ERROR*() generation code is today. Make it more readable. Cc: Thomas Gleixner Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/n/tip-kuabysiykvUJpgus35lhnhvs@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b857ed8..57409f6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -211,21 +211,17 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ exception_exit(prev_state); \ } -DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, - regs->ip) -DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) -DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, - regs->ip) -DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", - coprocessor_segment_overrun) -DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) +DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip ) +DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow ) +DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds ) +DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip ) +DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun ) +DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS ) +DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present ) #ifdef CONFIG_X86_32 -DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) +DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment ) #endif -DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, - BUS_ADRALN, 0) +DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0 ) #ifdef CONFIG_X86_64 /* Runs on IST stack */ -- cgit v1.1 From f09174c501f8bb259788cc36d5a7aa5b2831fb5e Mon Sep 17 00:00:00 2001 From: Qiaowei Ren Date: Sat, 14 Dec 2013 14:25:02 +0800 Subject: x86: add user_atomic_cmpxchg_inatomic at uaccess.h This patch adds user_atomic_cmpxchg_inatomic() to use CMPXCHG instruction against a user space address. This generalizes the already existing futex_atomic_cmpxchg_inatomic() so it can be used in other contexts. This will be used in the upcoming support for Intel MPX (Memory Protection Extensions.) [ hpa: replaced #ifdef inside a macro with IS_ENABLED() ] Signed-off-by: Qiaowei Ren Link: http://lkml.kernel.org/r/1387002303-6620-1-git-send-email-qiaowei.ren@intel.com Signed-off-by: H. Peter Anvin Cc: Peter Zijlstra --- arch/x86/include/asm/uaccess.h | 92 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8ec57c0..48ff838 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -525,6 +525,98 @@ extern __must_check long strnlen_user(const char __user *str, long n); unsigned long __must_check clear_user(void __user *mem, unsigned long len); unsigned long __must_check __clear_user(void __user *mem, unsigned long len); +extern void __cmpxchg_wrong_size(void) + __compiletime_error("Bad argument size for cmpxchg"); + +#define __user_atomic_cmpxchg_inatomic(uval, ptr, old, new, size) \ +({ \ + int __ret = 0; \ + __typeof__(ptr) __uval = (uval); \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ + switch (size) { \ + case 1: \ + { \ + asm volatile("\t" ASM_STAC "\n" \ + "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \ + "2:\t" ASM_CLAC "\n" \ + "\t.section .fixup, \"ax\"\n" \ + "3:\tmov %3, %0\n" \ + "\tjmp 2b\n" \ + "\t.previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ + : "i" (-EFAULT), "q" (__new), "1" (__old) \ + : "memory" \ + ); \ + break; \ + } \ + case 2: \ + { \ + asm volatile("\t" ASM_STAC "\n" \ + "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \ + "2:\t" ASM_CLAC "\n" \ + "\t.section .fixup, \"ax\"\n" \ + "3:\tmov %3, %0\n" \ + "\tjmp 2b\n" \ + "\t.previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ + : "i" (-EFAULT), "r" (__new), "1" (__old) \ + : "memory" \ + ); \ + break; \ + } \ + case 4: \ + { \ + asm volatile("\t" ASM_STAC "\n" \ + "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \ + "2:\t" ASM_CLAC "\n" \ + "\t.section .fixup, \"ax\"\n" \ + "3:\tmov %3, %0\n" \ + "\tjmp 2b\n" \ + "\t.previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ + : "i" (-EFAULT), "r" (__new), "1" (__old) \ + : "memory" \ + ); \ + break; \ + } \ + case 8: \ + { \ + if (!IS_ENABLED(CONFIG_X86_64)) \ + __cmpxchg_wrong_size(); \ + \ + asm volatile("\t" ASM_STAC "\n" \ + "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \ + "2:\t" ASM_CLAC "\n" \ + "\t.section .fixup, \"ax\"\n" \ + "3:\tmov %3, %0\n" \ + "\tjmp 2b\n" \ + "\t.previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \ + : "i" (-EFAULT), "r" (__new), "1" (__old) \ + : "memory" \ + ); \ + break; \ + } \ + default: \ + __cmpxchg_wrong_size(); \ + } \ + *__uval = __old; \ + __ret; \ +}) + +#define user_atomic_cmpxchg_inatomic(uval, ptr, old, new) \ +({ \ + access_ok(VERIFY_WRITE, (ptr), sizeof(*(ptr))) ? \ + __user_atomic_cmpxchg_inatomic((uval), (ptr), \ + (old), (new), sizeof(*(ptr))) : \ + -EFAULT; \ +}) + /* * movsl can be slow when source and dest are not both 8-byte aligned */ -- cgit v1.1 From 0ee3b6f87d4d748d5362cb47ff33fa1553805cb4 Mon Sep 17 00:00:00 2001 From: Qiaowei Ren Date: Sat, 14 Dec 2013 14:25:03 +0800 Subject: x86: replace futex_atomic_cmpxchg_inatomic() with user_atomic_cmpxchg_inatomic futex_atomic_cmpxchg_inatomic() is simply the 32-bit implementation of user_atomic_cmpxchg_inatomic(), which in turn is simply a generalization of the original code in futex_atomic_cmpxchg_inatomic(). Use the newly generalized user_atomic_cmpxchg_inatomic() as the futex implementation, too. [ hpa: retain the inline in futex.h rather than changing it to a macro ] Signed-off-by: Qiaowei Ren Link: http://lkml.kernel.org/r/1387002303-6620-2-git-send-email-qiaowei.ren@intel.com Signed-off-by: H. Peter Anvin Cc: Peter Zijlstra --- arch/x86/include/asm/futex.h | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index be27ba1..b4c1f54 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -110,26 +110,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { - int ret = 0; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - - asm volatile("\t" ASM_STAC "\n" - "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" - "2:\t" ASM_CLAC "\n" - "\t.section .fixup, \"ax\"\n" - "3:\tmov %3, %0\n" - "\tjmp 2b\n" - "\t.previous\n" - _ASM_EXTABLE(1b, 3b) - : "+r" (ret), "=a" (oldval), "+m" (*uaddr) - : "i" (-EFAULT), "r" (newval), "1" (oldval) - : "memory" - ); - - *uval = oldval; - return ret; + return user_atomic_cmpxchg_inatomic(uval, uaddr, oldval, newval); } #endif -- cgit v1.1 From 7fd565e27547c913b83b46d94662103be81a88ec Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 16 Dec 2013 21:20:23 +0100 Subject: perf/x86: enable Haswell Celeron RAPL support Enable RAPL support for Haswell Celeron (model 69). Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: ak@linux.intel.com Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: zheng.z.yan@intel.com Cc: bp@alien8.de Cc: vincent.weaver@maine.edu Cc: maria.n.dimakopoulou@gmail.com Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1387225224-27799-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index bf8e4a7..0e3754e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -615,6 +615,7 @@ static int __init rapl_pmu_init(void) case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ case 60: /* Haswell */ + case 69: /* Haswell-Celeron */ rapl_cntr_mask = RAPL_IDX_CLN; rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; -- cgit v1.1 From 7306006f103567fcf595d23dd741da0b8642f4c5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 18 Dec 2013 15:52:13 -0800 Subject: x86, realmode: Pointer walk cleanups, pull out invariant use of __pa() The pointer arithmetic in this function was really bizarre, where in fact all we really wanted was a simple pointer array walk. Use the much more idiomatic construction for that (*ptr++). Factor an invariant use of __pa() out of the relocation loop. At least on 64 bits it seems gcc isn't capable of doing that automatically. Change the scope of a couple of variables to make it extra obvious that they are extremely local temp variables. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-rd908t9c8kvcojdabtmm94mb@git.kernel.org --- arch/x86/realmode/init.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index a44f457..bad628a 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -29,12 +29,10 @@ void __init reserve_real_mode(void) void __init setup_real_mode(void) { u16 real_mode_seg; - u32 *rel; + const u32 *rel; u32 count; - u32 *ptr; - u16 *seg; - int i; unsigned char *base; + unsigned long phys_base; struct trampoline_header *trampoline_header; size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); #ifdef CONFIG_X86_64 @@ -46,23 +44,23 @@ void __init setup_real_mode(void) memcpy(base, real_mode_blob, size); - real_mode_seg = __pa(base) >> 4; + phys_base = __pa(base); + real_mode_seg = phys_base >> 4; + rel = (u32 *) real_mode_relocs; /* 16-bit segment relocations. */ - count = rel[0]; - rel = &rel[1]; - for (i = 0; i < count; i++) { - seg = (u16 *) (base + rel[i]); + count = *rel++; + while (count--) { + u16 *seg = (u16 *) (base + *rel++); *seg = real_mode_seg; } /* 32-bit linear relocations. */ - count = rel[i]; - rel = &rel[i + 1]; - for (i = 0; i < count; i++) { - ptr = (u32 *) (base + rel[i]); - *ptr += __pa(base); + count = *rel++; + while (count--) { + u32 *ptr = (u32 *) (base + *rel++); + *ptr += phys_base; } /* Must be perfomed *after* relocation. */ -- cgit v1.1 From f3d815cb854b2f6262ade56a4d91a1ed3f1e50c4 Mon Sep 17 00:00:00 2001 From: Lans Zhang Date: Fri, 6 Dec 2013 12:18:30 +0800 Subject: x86/mm/numa: Fix 32-bit kernel NUMA boot When booting a 32-bit x86 kernel on a NUMA machine, node data cannot be allocated from local node if the account of memory for node 0 covers the low memory space entirely: [ 0.000000] Initmem setup node 0 [mem 0x00000000-0x83fffffff] [ 0.000000] NODE_DATA [mem 0x367ed000-0x367edfff] [ 0.000000] Initmem setup node 1 [mem 0x840000000-0xfffffffff] [ 0.000000] Cannot find 4096 bytes in node 1 [ 0.000000] 64664MB HIGHMEM available. [ 0.000000] 871MB LOWMEM available. To fix this issue, node data is allowed to be allocated from other nodes if the memory of local node is still not mapped. The expected result looks like this: [ 0.000000] Initmem setup node 0 [mem 0x00000000-0x83fffffff] [ 0.000000] NODE_DATA [mem 0x367ed000-0x367edfff] [ 0.000000] Initmem setup node 1 [mem 0x840000000-0xfffffffff] [ 0.000000] NODE_DATA [mem 0x367ec000-0x367ecfff] [ 0.000000] NODE_DATA(1) on node 0 [ 0.000000] 64664MB HIGHMEM available. [ 0.000000] 871MB LOWMEM available. Signed-off-by: Lans Zhang Cc: Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1386303510-18574-1-git-send-email-jia.zhang@windriver.com Signed-off-by: Ingo Molnar --- arch/x86/mm/numa.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 24aec58..c85da7b 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -211,9 +211,13 @@ static void __init setup_node_data(int nid, u64 start, u64 end) */ nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); if (!nd_pa) { - pr_err("Cannot find %zu bytes in node %d\n", - nd_size, nid); - return; + nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES, + MEMBLOCK_ALLOC_ACCESSIBLE); + if (!nd_pa) { + pr_err("Cannot find %zu bytes in node %d\n", + nd_size, nid); + return; + } } nd = __va(nd_pa); -- cgit v1.1 From 16824255394f55adf31b9a96a9965d8c15bdac4c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Dec 2013 15:08:36 +0100 Subject: x86, acpi, idle: Restructure the mwait idle routines People seem to delight in writing wrong and broken mwait idle routines; collapse the lot. This leaves mwait_play_dead() the sole remaining user of __mwait() and new __mwait() users are probably doing it wrong. Also remove __sti_mwait() as its unused. Cc: Arjan van de Ven Cc: Jacob Jun Pan Cc: Mike Galbraith Cc: Len Brown Cc: Rui Zhang Acked-by: Rafael Wysocki Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131212141654.616820819@infradead.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mwait.h | 40 ++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/processor.h | 23 ----------------------- arch/x86/kernel/acpi/cstate.c | 23 ----------------------- 3 files changed, 40 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 2f366d0..361b02e 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_MWAIT_H #define _ASM_X86_MWAIT_H +#include + #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 @@ -13,4 +15,42 @@ #define MWAIT_ECX_INTERRUPT_BREAK 0x1 +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + * + * New with Core Duo processors, MWAIT can take some hints based on CPU + * capability. + */ +static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +{ + if (!current_set_polling_and_test()) { + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)¤t_thread_info()->flags); + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + if (!need_resched()) + __mwait(eax, ecx); + } + __current_clr_polling(); +} + #endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7b034a4..24821f5 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -700,29 +700,6 @@ static inline void sync_core(void) #endif } -static inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) -{ - /* "monitor %eax, %ecx, %edx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc8;" - :: "a" (eax), "c" (ecx), "d"(edx)); -} - -static inline void __mwait(unsigned long eax, unsigned long ecx) -{ - /* "mwait %eax, %ecx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); -} - -static inline void __sti_mwait(unsigned long eax, unsigned long ecx) -{ - trace_hardirqs_on(); - /* "mwait %eax, %ecx;" */ - asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); -} - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void init_amd_e400_c1e_mask(void); diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index d2b7f27..e69182f 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); -/* - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, - * which can obviate IPI to trigger checking of need_resched. - * We execute MONITOR against need_resched and enter optimized wait state - * through MWAIT. Whenever someone changes need_resched, we would be woken - * up from MWAIT (without an IPI). - * - * New with Core Duo processors, MWAIT can take some hints based on CPU - * capability. - */ -void mwait_idle_with_hints(unsigned long ax, unsigned long cx) -{ - if (!need_resched()) { - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) - clflush((void *)¤t_thread_info()->flags); - - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __mwait(ax, cx); - } -} - void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); -- cgit v1.1 From 7e98b71920464b8d15fa95c74366416cd3c88861 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 19 Dec 2013 11:58:16 -0800 Subject: x86, idle: Use static_cpu_has() for CLFLUSH workaround, add barriers Use static_cpu_has() to conditionalize the CLFLUSH workaround, and add memory barriers around it since the documentation is explicit that CLFLUSH is only ordered with respect to MFENCE. Signed-off-by: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Len Brown Link: http://lkml.kernel.org/r/CA%2B55aFzGxcML7j8CEvQPYzh0W81uVoAAVmGctMOUZ7CZ1yYd2A@mail.gmail.com --- arch/x86/include/asm/mwait.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 361b02e..19b71c4 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -43,8 +43,11 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { if (!current_set_polling_and_test()) { - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) + if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) { + mb(); clflush((void *)¤t_thread_info()->flags); + mb(); + } __monitor((void *)¤t_thread_info()->flags, 0, 0); if (!need_resched()) -- cgit v1.1 From 7d590cca7cd2cce4ed7c47d221d6f90566653ba8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 19 Dec 2013 12:30:03 -0800 Subject: x86, idle: Add memory barriers around clflush in mwait_play_dead() For consistency with mwait_idle_with_hints(). Not sure they help, but they really won't hurt... Signed-off-by: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Len Brown Link: http://lkml.kernel.org/r/CA%2B55aFzGxcML7j8CEvQPYzh0W81uVoAAVmGctMOUZ7CZ1yYd2A@mail.gmail.com --- arch/x86/kernel/smpboot.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 85dc05a..f5252c4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1417,7 +1417,9 @@ static inline void mwait_play_dead(void) * The WBINVD is insufficient due to the spurious-wakeup * case where we return around the loop. */ + mb(); clflush(mwait_ptr); + mb(); __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); -- cgit v1.1 From 19952a92037e752f9d3bbbad552d596f9a56e146 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Dec 2013 11:35:58 -0800 Subject: stackprotector: Unify the HAVE_CC_STACKPROTECTOR logic between architectures Instead of duplicating the CC_STACKPROTECTOR Kconfig and Makefile logic in each architecture, switch to using HAVE_CC_STACKPROTECTOR and keep everything in one place. This retains the x86-specific bug verification scripts. Signed-off-by: Kees Cook Cc: Arjan van de Ven Cc: Michal Marek Cc: Russell King Cc: Ralf Baechle Cc: Paul Mundt Cc: James Hogan Cc: Stephen Rothwell Cc: Shawn Guo Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mips@linux-mips.org Cc: linux-arch@vger.kernel.org Link: http://lkml.kernel.org/r/1387481759-14535-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 17 +---------------- arch/x86/Makefile | 8 +++----- 2 files changed, 4 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0952ecd..838e7c3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -125,6 +125,7 @@ config X86 select RTC_LIB select HAVE_DEBUG_STACKOVERFLOW select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 + select HAVE_CC_STACKPROTECTOR config INSTRUCTION_DECODER def_bool y @@ -1617,22 +1618,6 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. -config CC_STACKPROTECTOR - bool "Enable -fstack-protector buffer overflow detection" - ---help--- - This option turns on the -fstack-protector GCC feature. This - feature puts, at the beginning of functions, a canary value on - the stack just before the return address, and validates - the value just before actually returning. Stack based buffer - overflows (that need to overwrite this return address) now also - overwrite the canary, which gets detected and the attack is then - neutralized via a kernel panic. - - This feature requires gcc version 4.2 or above, or a distribution - gcc with the feature backported. Older versions are automatically - detected and for those versions, this configuration option is - ignored. (and a warning is printed during bootup) - source kernel/Kconfig.hz config KEXEC diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 57d0215..13b22e0 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -89,13 +89,11 @@ else KBUILD_CFLAGS += -maccumulate-outgoing-args endif +# Make sure compiler does not have buggy stack-protector support. ifdef CONFIG_CC_STACKPROTECTOR cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh - ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y) - stackp-y := -fstack-protector - KBUILD_CFLAGS += $(stackp-y) - else - $(warning stack protector enabled but no compiler support) + ifneq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y) + $(warning stack-protector enabled but compiler support broken) endif endif -- cgit v1.1 From b70fedc15892de8bc78f711d7821dd0916cc5508 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 16 Dec 2013 16:09:51 -0800 Subject: x86, x32: Use __kernel_long_t/__kernel_ulong_t in x86-64 stat.h Both x32 and x86-64 use the same stat system call interface. But x32 long is 32-bit. This patch changes x86 uapi to use __kernel_long_t/__kernel_ulong_t in x86-64 stat. Signed-off-by: H.J. Lu Link: http://lkml.kernel.org/r/CAMe9rOquPtWEro0GQ=Z95pZJ=c7GGkSHynjN4FbiB4p445x-Ng@mail.gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/include/uapi/asm/stat.h | 42 +++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/stat.h b/arch/x86/include/uapi/asm/stat.h index 7b3ddc3..bc03eb5 100644 --- a/arch/x86/include/uapi/asm/stat.h +++ b/arch/x86/include/uapi/asm/stat.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_STAT_H #define _ASM_X86_STAT_H +#include + #define STAT_HAVE_NSEC 1 #ifdef __i386__ @@ -78,26 +80,26 @@ struct stat64 { #else /* __i386__ */ struct stat { - unsigned long st_dev; - unsigned long st_ino; - unsigned long st_nlink; - - unsigned int st_mode; - unsigned int st_uid; - unsigned int st_gid; - unsigned int __pad0; - unsigned long st_rdev; - long st_size; - long st_blksize; - long st_blocks; /* Number 512-byte blocks allocated. */ - - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; - long __unused[3]; + __kernel_ulong_t st_dev; + __kernel_ulong_t st_ino; + __kernel_ulong_t st_nlink; + + unsigned int st_mode; + unsigned int st_uid; + unsigned int st_gid; + unsigned int __pad0; + __kernel_ulong_t st_rdev; + __kernel_long_t st_size; + __kernel_long_t st_blksize; + __kernel_long_t st_blocks; /* Number 512-byte blocks allocated. */ + + __kernel_ulong_t st_atime; + __kernel_ulong_t st_atime_nsec; + __kernel_ulong_t st_mtime; + __kernel_ulong_t st_mtime_nsec; + __kernel_ulong_t st_ctime; + __kernel_ulong_t st_ctime_nsec; + __kernel_long_t __unused[3]; }; /* We don't need to memset the whole thing just to initialize the padding */ -- cgit v1.1 From addccbb264e5e0e5762f4893f6df24afad327c8c Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Mon, 25 Nov 2013 02:15:00 -0500 Subject: ACPI, APEI, GHES: Do not report only correctable errors with SCI Currently SCI is employed to handle corrected errors - memory corrected errors, more specifically but in fact SCI still can be used to handle any errors, e.g. uncorrected or even fatal ones if enabled by the BIOS. Enable logging for those kinds of errors too. Signed-off-by: Chen, Gong Acked-by: Naveen N. Rao Cc: Tony Luck Link: http://lkml.kernel.org/r/1385363701-12387-1-git-send-email-gong.chen@linux.intel.com [ Boris: massage commit message, rename function arg. ] Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce-apei.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index de8b60a..a1aef95 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -33,22 +33,28 @@ #include #include #include +#include #include #include "mce-internal.h" -void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) +void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { struct mce m; - /* Only corrected MC is reported */ - if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA)) + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; mce_setup(&m); m.bank = 1; - /* Fake a memory read corrected error with unknown channel */ + /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + + if (severity >= GHES_SEV_RECOVERABLE) + m.status |= MCI_STATUS_UC; + if (severity >= GHES_SEV_PANIC) + m.status |= MCI_STATUS_PCC; + m.addr = mem_err->physical_addr; mce_log(&m); mce_notify_irq(); -- cgit v1.1 From 2da6e57cce14a1c3b0692d6f877b72e185110e2e Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:13 +0800 Subject: x86/efi: Remove unused variables in __map_region() variables size and end is useless in this function, thus remove them. Reported-by: Toshi Kani Tested-by: Toshi Kani Signed-off-by: Dave Young Acked-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi_64.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index bf286c3..c5a6491 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -148,15 +148,11 @@ void efi_setup_page_tables(void) static void __init __map_region(efi_memory_desc_t *md, u64 va) { pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); - unsigned long pf = 0, size; - u64 end; + unsigned long pf = 0; if (!(md->attribute & EFI_MEMORY_WB)) pf |= _PAGE_PCD; - size = md->num_pages << PAGE_SHIFT; - end = va + size; - if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf)) pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", md->phys_addr, va); -- cgit v1.1 From 3b2664964bc886ae9d5127c8d3708b1acc0626d2 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:14 +0800 Subject: x86/efi: Add a wrapper function efi_map_region_fixed() Kexec kernel will use saved runtime virtual mapping, so add a new function efi_map_region_fixed() for directly mapping a md to md->virt. The md is passed in from 1st kernel, the virtual addr is saved in md->virt_addr. Signed-off-by: Dave Young Acked-by: Borislav Petkov Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 1 + arch/x86/platform/efi/efi_32.c | 2 ++ arch/x86/platform/efi/efi_64.c | 10 ++++++++++ 3 files changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 89a05b0..9fbaeb2 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -128,6 +128,7 @@ extern void efi_call_phys_epilog(void); extern void efi_unmap_memmap(void); extern void efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); +extern void __init efi_map_region_fixed(efi_memory_desc_t *md); extern void efi_sync_low_kernel_mappings(void); extern void efi_setup_page_tables(void); extern void __init old_map_region(efi_memory_desc_t *md); diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index e94557c..7b3ec6e 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -47,6 +47,8 @@ void __init efi_map_region(efi_memory_desc_t *md) old_map_region(md); } +void __init efi_map_region_fixed(efi_memory_desc_t *md) {} + void efi_call_phys_prelog(void) { struct desc_ptr gdt_descr; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index c5a6491..ff08cb1 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -199,6 +199,16 @@ void __init efi_map_region(efi_memory_desc_t *md) md->virt_addr = efi_va; } +/* + * kexec kernel will use efi_map_region_fixed to map efi runtime memory ranges. + * md->virt_addr is the original virtual address which had been mapped in kexec + * 1st kernel. + */ +void __init efi_map_region_fixed(efi_memory_desc_t *md) +{ + __map_region(md, md->virt_addr); +} + void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, u32 type, u64 attribute) { -- cgit v1.1 From a7f84f03f660d93574ac88835d056c0d6468aebe Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:15 +0800 Subject: x86/efi: Fix off-by-one bug in EFI Boot Services reservation Current code check boot service region with kernel text region by: start+size >= __pa_symbol(_text) The end of the above region should be start + size - 1 instead. I see this problem in ovmf + Fedora 19 grub boot: text start: 1000000 md start: 800000 md size: 800000 Signed-off-by: Dave Young Acked-by: Borislav Petkov Acked-by: Toshi Kani Tested-by: Toshi Kani Cc: stable@vger.kernel.org Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f8ec4da..15e3b5e 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -438,7 +438,7 @@ void __init efi_reserve_boot_services(void) * - Not within any part of the kernel * - Not the bios reserved area */ - if ((start+size >= __pa_symbol(_text) + if ((start + size > __pa_symbol(_text) && start <= __pa_symbol(_end)) || !e820_all_mapped(start, start+size, E820_RAM) || memblock_is_region_reserved(start, size)) { -- cgit v1.1 From 481f75c043cf44ec11c7fbdbbf37d43463f1e719 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:16 +0800 Subject: x86/efi: Cleanup efi_enter_virtual_mode() function Add two small functions: efi_merge_regions() and efi_map_regions(), efi_enter_virtual_mode() calls them instead of embedding two long for loop. Signed-off-by: Dave Young Acked-by: Borislav Petkov Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 132 ++++++++++++++++++++++++++------------------ 1 file changed, 79 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 15e3b5e..4694632 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -773,44 +773,12 @@ void __init old_map_region(efi_memory_desc_t *md) (unsigned long long)md->phys_addr); } -/* - * This function will switch the EFI runtime services to virtual mode. - * Essentially, we look through the EFI memmap and map every region that - * has the runtime attribute bit set in its memory descriptor into the - * ->trampoline_pgd page table using a top-down VA allocation scheme. - * - * The old method which used to update that memory descriptor with the - * virtual address obtained from ioremap() is still supported when the - * kernel is booted with efi=old_map on its command line. Same old - * method enabled the runtime services to be called without having to - * thunk back into physical mode for every invocation. - * - * The new method does a pagetable switch in a preemption-safe manner - * so that we're in a different address space when calling a runtime - * function. For function arguments passing we do copy the PGDs of the - * kernel page table into ->trampoline_pgd prior to each call. - */ -void __init efi_enter_virtual_mode(void) +/* Merge contiguous regions of the same type and attribute */ +static void __init efi_merge_regions(void) { + void *p; efi_memory_desc_t *md, *prev_md = NULL; - void *p, *new_memmap = NULL; - unsigned long size; - efi_status_t status; - u64 end, systab; - int count = 0; - - efi.systab = NULL; - /* - * We don't do virtual mode, since we don't do runtime services, on - * non-native EFI - */ - if (!efi_is_native()) { - efi_unmap_memmap(); - return; - } - - /* Merge contiguous regions of the same type and attribute */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { u64 prev_size; md = p; @@ -835,8 +803,31 @@ void __init efi_enter_virtual_mode(void) continue; } prev_md = md; + } +} + +static void __init get_systab_virt_addr(efi_memory_desc_t *md) +{ + unsigned long size; + u64 end, systab; + size = md->num_pages << EFI_PAGE_SHIFT; + end = md->phys_addr + size; + systab = (u64)(unsigned long)efi_phys.systab; + if (md->phys_addr <= systab && systab < end) { + systab += md->virt_addr - md->phys_addr; + efi.systab = (efi_system_table_t *)(unsigned long)systab; } +} + +/* + * Map efi memory ranges for runtime serivce and update new_memmap with virtual + * addresses. + */ +static void * __init efi_map_regions(int *count) +{ + efi_memory_desc_t *md; + void *p, *tmp, *new_memmap = NULL; for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; @@ -849,26 +840,64 @@ void __init efi_enter_virtual_mode(void) } efi_map_region(md); + get_systab_virt_addr(md); + + tmp = krealloc(new_memmap, (*count + 1) * memmap.desc_size, + GFP_KERNEL); + if (!tmp) + goto out_krealloc; + new_memmap = tmp; + memcpy(new_memmap + (*count * memmap.desc_size), md, + memmap.desc_size); + (*count)++; + } - size = md->num_pages << EFI_PAGE_SHIFT; - end = md->phys_addr + size; + return new_memmap; +out_krealloc: + kfree(new_memmap); + return NULL; +} + +/* + * This function will switch the EFI runtime services to virtual mode. + * Essentially, we look through the EFI memmap and map every region that + * has the runtime attribute bit set in its memory descriptor into the + * ->trampoline_pgd page table using a top-down VA allocation scheme. + * + * The old method which used to update that memory descriptor with the + * virtual address obtained from ioremap() is still supported when the + * kernel is booted with efi=old_map on its command line. Same old + * method enabled the runtime services to be called without having to + * thunk back into physical mode for every invocation. + * + * The new method does a pagetable switch in a preemption-safe manner + * so that we're in a different address space when calling a runtime + * function. For function arguments passing we do copy the PGDs of the + * kernel page table into ->trampoline_pgd prior to each call. + */ +void __init efi_enter_virtual_mode(void) +{ + efi_status_t status; + void *new_memmap = NULL; + int count = 0; - systab = (u64) (unsigned long) efi_phys.systab; - if (md->phys_addr <= systab && systab < end) { - systab += md->virt_addr - md->phys_addr; + efi.systab = NULL; - efi.systab = (efi_system_table_t *) (unsigned long) systab; - } + /* + * We don't do virtual mode, since we don't do runtime services, on + * non-native EFI + */ + if (!efi_is_native()) { + efi_unmap_memmap(); + return; + } - new_memmap = krealloc(new_memmap, - (count + 1) * memmap.desc_size, - GFP_KERNEL); - if (!new_memmap) - goto err_out; + efi_merge_regions(); - memcpy(new_memmap + (count * memmap.desc_size), md, - memmap.desc_size); - count++; + new_memmap = efi_map_regions(&count); + if (!new_memmap) { + pr_err("Error reallocating memory, EFI runtime non-functional!\n"); + return; } BUG_ON(!efi.systab); @@ -922,9 +951,6 @@ void __init efi_enter_virtual_mode(void) 0, NULL); return; - - err_out: - pr_err("Error reallocating memory, EFI runtime non-functional!\n"); } /* -- cgit v1.1 From a0998eb15afeffbf52a2c2829318f67df9ac57b8 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:17 +0800 Subject: efi: Export more EFI table variables to sysfs Export fw_vendor, runtime and config table physical addresses to /sys/firmware/efi/{fw_vendor,runtime,config_table} because kexec kernels need them. From EFI spec these 3 variables will be updated to virtual address after entering virtual mode. But kernel startup code will need the physical address. Signed-off-by: Dave Young Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 4694632..2859107 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -653,6 +653,10 @@ void __init efi_init(void) set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility); + efi.config_table = (unsigned long)efi.systab->tables; + efi.fw_vendor = (unsigned long)efi.systab->fw_vendor; + efi.runtime = (unsigned long)efi.systab->runtime; + /* * Show what we know for posterity */ -- cgit v1.1 From 926172d46038d7610b6b8d84e40db727cefb482d Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:18 +0800 Subject: efi: Export EFI runtime memory mapping to sysfs kexec kernel will need exactly same mapping for EFI runtime memory ranges. Thus here export the runtime ranges mapping to sysfs, kexec-tools will assemble them and pass to 2nd kernel via setup_data. Introducing a new directory /sys/firmware/efi/runtime-map just like /sys/firmware/memmap. Containing below attribute in each file of that directory: attribute num_pages phys_addr type virt_addr Signed-off-by: Dave Young Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 46 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 2859107..74fe7a7 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -76,6 +76,9 @@ static __initdata efi_config_table_type_t arch_tables[] = { {NULL_GUID, NULL, NULL}, }; +static void *efi_runtime_map; +static int nr_efi_runtime_map; + /* * Returns 1 if 'facility' is enabled, 0 otherwise. */ @@ -824,6 +827,39 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md) } } +static int __init save_runtime_map(void) +{ + efi_memory_desc_t *md; + void *tmp, *p, *q = NULL; + int count = 0; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + + if (!(md->attribute & EFI_MEMORY_RUNTIME) || + (md->type == EFI_BOOT_SERVICES_CODE) || + (md->type == EFI_BOOT_SERVICES_DATA)) + continue; + tmp = krealloc(q, (count + 1) * memmap.desc_size, GFP_KERNEL); + if (!tmp) + goto out; + q = tmp; + + memcpy(q + count * memmap.desc_size, md, memmap.desc_size); + count++; + } + + efi_runtime_map = q; + nr_efi_runtime_map = count; + efi_runtime_map_setup(efi_runtime_map, nr_efi_runtime_map, + boot_params.efi_info.efi_memdesc_size); + + return 0; +out: + kfree(q); + return -ENOMEM; +} + /* * Map efi memory ranges for runtime serivce and update new_memmap with virtual * addresses. @@ -849,7 +885,7 @@ static void * __init efi_map_regions(int *count) tmp = krealloc(new_memmap, (*count + 1) * memmap.desc_size, GFP_KERNEL); if (!tmp) - goto out_krealloc; + goto out; new_memmap = tmp; memcpy(new_memmap + (*count * memmap.desc_size), md, memmap.desc_size); @@ -857,7 +893,7 @@ static void * __init efi_map_regions(int *count) } return new_memmap; -out_krealloc: +out: kfree(new_memmap); return NULL; } @@ -883,7 +919,7 @@ void __init efi_enter_virtual_mode(void) { efi_status_t status; void *new_memmap = NULL; - int count = 0; + int err, count = 0; efi.systab = NULL; @@ -904,6 +940,10 @@ void __init efi_enter_virtual_mode(void) return; } + err = save_runtime_map(); + if (err) + pr_err("Error saving runtime map, efi runtime on kexec non-functional!!\n"); + BUG_ON(!efi.systab); efi_setup_page_tables(); -- cgit v1.1 From c5fe5d80680e2949ffe102180f5fc6cefc0d145f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 27 Dec 2013 15:30:58 -0800 Subject: x86: Replace assembly access_ok() with a C variant It turns out that the assembly variant doesn't actually produce that good code, presumably partly because it creates a long dependency chain with no scheduling, and partly because we cannot get a flags result out of gcc (which could be fixed with asm goto, but it turns out not to be worth it.) The C code allows gcc to schedule and generate multiple (easily predictable) branches, and as a side benefit we can really optimize the case where the size is constant. Link: http://lkml.kernel.org/r/CA%2B55aFzPBdbfKovMT8Edr4SmE2_=%2BOKJFac9XW2awegogTkVTA@mail.gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uaccess.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8ec57c0..84ecf1d 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -40,22 +40,28 @@ /* * Test whether a block of memory is a valid user space address. * Returns 0 if the range is valid, nonzero otherwise. - * - * This is equivalent to the following test: - * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64) - * - * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... */ +static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit) +{ + /* + * If we have used "sizeof()" for the size, + * we know it won't overflow the limit (but + * it might overflow the 'addr', so it's + * important to subtract the size from the + * limit, not add it to the address). + */ + if (__builtin_constant_p(size)) + return addr > limit - size; + + /* Arbitrary sizes? Be careful about overflow */ + addr += size; + return (addr < size) || (addr > limit); +} #define __range_not_ok(addr, size, limit) \ ({ \ - unsigned long flag, roksum; \ __chk_user_ptr(addr); \ - asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \ - : "=&r" (flag), "=r" (roksum) \ - : "1" (addr), "g" ((long)(size)), \ - "rm" (limit)); \ - flag; \ + __chk_range_not_ok((unsigned long __force)(addr), size, limit); \ }) /** -- cgit v1.1 From a740576a4abf933de8f50787f24f24456cebd761 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 27 Dec 2013 16:52:47 -0800 Subject: x86: Slightly tweak the access_ok() C variant for better code gcc can under very specific circumstances realize that the code sequence: foo += bar; if (foo < bar) ... ... is equivalent to a carry out from the addition. Tweak the implementation of access_ok() (specifically __chk_range_not_ok()) to make it more likely that gcc will make that connection. It isn't fool-proof (sometimes gcc seems to think it can make better code with lea, and ends up with a second comparison), still, but it seems to be able to connect the two more frequently this way. Cc: Linus Torvalds Link: http://lkml.kernel.org/r/CA%2B55aFzPBdbfKovMT8Edr4SmE2_=%2BOKJFac9XW2awegogTkVTA@mail.gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uaccess.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 84ecf1d..6f1bb74 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -41,7 +41,7 @@ * Test whether a block of memory is a valid user space address. * Returns 0 if the range is valid, nonzero otherwise. */ -static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit) +static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit) { /* * If we have used "sizeof()" for the size, @@ -55,7 +55,9 @@ static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, uns /* Arbitrary sizes? Be careful about overflow */ addr += size; - return (addr < size) || (addr > limit); + if (addr < size) + return true; + return addr > limit; } #define __range_not_ok(addr, size, limit) \ @@ -84,7 +86,7 @@ static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, uns * this function, memory access functions may still return -EFAULT. */ #define access_ok(type, addr, size) \ - (likely(__range_not_ok(addr, size, user_addr_max()) == 0)) + likely(!__range_not_ok(addr, size, user_addr_max())) /* * The exception table consists of pairs of addresses relative to the -- cgit v1.1 From 1fec0533693cd74f2d1a46edd29449cfee429df0 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:19 +0800 Subject: x86/efi: Pass necessary EFI data for kexec via setup_data Add a new setup_data type SETUP_EFI for kexec use. Passing the saved fw_vendor, runtime, config tables and EFI runtime mappings. When entering virtual mode, directly mapping the EFI runtime regions which we passed in previously. And skip the step to call SetVirtualAddressMap(). Specially for HP z420 workstation we need save the smbios physical address. The kernel boot sequence proceeds in the following order. Step 2 requires efi.smbios to be the physical address. However, I found that on HP z420 EFI system table has a virtual address of SMBIOS in step 1. Hence, we need set it back to the physical address with the smbios in efi_setup_data. (When it is still the physical address, it simply sets the same value.) 1. efi_init() - Set efi.smbios from EFI system table 2. dmi_scan_machine() - Temporary map efi.smbios to access SMBIOS table 3. efi_enter_virtual_mode() - Map EFI ranges Tested on ovmf+qemu, lenovo thinkpad, a dell laptop and an HP z420 workstation. Signed-off-by: Dave Young Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 12 +++ arch/x86/include/uapi/asm/bootparam.h | 1 + arch/x86/kernel/setup.c | 3 + arch/x86/platform/efi/efi.c | 158 ++++++++++++++++++++++++++++------ arch/x86/platform/efi/efi_32.c | 1 + arch/x86/platform/efi/efi_64.c | 6 ++ 6 files changed, 156 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 9fbaeb2..4d1ba80 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -133,6 +133,18 @@ extern void efi_sync_low_kernel_mappings(void); extern void efi_setup_page_tables(void); extern void __init old_map_region(efi_memory_desc_t *md); +struct efi_setup_data { + u64 fw_vendor; + u64 runtime; + u64 tables; + u64 smbios; + u64 reserved[8]; +}; + +extern u64 efi_setup; +extern u32 efi_data_len; +extern void parse_efi_setup(u64 phys_addr, u32 data_len); + #ifdef CONFIG_EFI static inline bool efi_is_native(void) diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 9c3733c..64fe421 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -6,6 +6,7 @@ #define SETUP_E820_EXT 1 #define SETUP_DTB 2 #define SETUP_PCI 3 +#define SETUP_EFI 4 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cb233bc..24536f7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -447,6 +447,9 @@ static void __init parse_setup_data(void) case SETUP_DTB: add_dtb(pa_data); break; + case SETUP_EFI: + parse_efi_setup(pa_data, data_len); + break; default: break; } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 74fe7a7..9965ff4 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -78,6 +78,8 @@ static __initdata efi_config_table_type_t arch_tables[] = { static void *efi_runtime_map; static int nr_efi_runtime_map; +u64 efi_setup; /* efi setup_data physical address */ +u32 efi_data_len; /* efi setup_data payload length */ /* * Returns 1 if 'facility' is enabled, 0 otherwise. @@ -115,7 +117,6 @@ static int __init setup_storage_paranoia(char *arg) } early_param("efi_no_storage_paranoia", setup_storage_paranoia); - static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) { unsigned long flags; @@ -494,18 +495,27 @@ static int __init efi_systab_init(void *phys) { if (efi_enabled(EFI_64BIT)) { efi_system_table_64_t *systab64; + struct efi_setup_data *data = NULL; u64 tmp = 0; + if (efi_setup) { + data = early_memremap(efi_setup, sizeof(*data)); + if (!data) + return -ENOMEM; + } systab64 = early_ioremap((unsigned long)phys, sizeof(*systab64)); if (systab64 == NULL) { pr_err("Couldn't map the system table!\n"); + if (data) + early_iounmap(data, sizeof(*data)); return -ENOMEM; } efi_systab.hdr = systab64->hdr; - efi_systab.fw_vendor = systab64->fw_vendor; - tmp |= systab64->fw_vendor; + efi_systab.fw_vendor = data ? (unsigned long)data->fw_vendor : + systab64->fw_vendor; + tmp |= data ? data->fw_vendor : systab64->fw_vendor; efi_systab.fw_revision = systab64->fw_revision; efi_systab.con_in_handle = systab64->con_in_handle; tmp |= systab64->con_in_handle; @@ -519,15 +529,20 @@ static int __init efi_systab_init(void *phys) tmp |= systab64->stderr_handle; efi_systab.stderr = systab64->stderr; tmp |= systab64->stderr; - efi_systab.runtime = (void *)(unsigned long)systab64->runtime; - tmp |= systab64->runtime; + efi_systab.runtime = data ? + (void *)(unsigned long)data->runtime : + (void *)(unsigned long)systab64->runtime; + tmp |= data ? data->runtime : systab64->runtime; efi_systab.boottime = (void *)(unsigned long)systab64->boottime; tmp |= systab64->boottime; efi_systab.nr_tables = systab64->nr_tables; - efi_systab.tables = systab64->tables; - tmp |= systab64->tables; + efi_systab.tables = data ? (unsigned long)data->tables : + systab64->tables; + tmp |= data ? data->tables : systab64->tables; early_iounmap(systab64, sizeof(*systab64)); + if (data) + early_iounmap(data, sizeof(*data)); #ifdef CONFIG_X86_32 if (tmp >> 32) { pr_err("EFI data located above 4GB, disabling EFI.\n"); @@ -631,6 +646,71 @@ static int __init efi_memmap_init(void) return 0; } +/* + * A number of config table entries get remapped to virtual addresses + * after entering EFI virtual mode. However, the kexec kernel requires + * their physical addresses therefore we pass them via setup_data and + * correct those entries to their respective physical addresses here. + * + * Currently only handles smbios which is necessary for some firmware + * implementation. + */ +static int __init efi_reuse_config(u64 tables, int nr_tables) +{ + int i, sz, ret = 0; + void *p, *tablep; + struct efi_setup_data *data; + + if (!efi_setup) + return 0; + + if (!efi_enabled(EFI_64BIT)) + return 0; + + data = early_memremap(efi_setup, sizeof(*data)); + if (!data) { + ret = -ENOMEM; + goto out; + } + + if (!data->smbios) + goto out_memremap; + + sz = sizeof(efi_config_table_64_t); + + p = tablep = early_memremap(tables, nr_tables * sz); + if (!p) { + pr_err("Could not map Configuration table!\n"); + ret = -ENOMEM; + goto out_memremap; + } + + for (i = 0; i < efi.systab->nr_tables; i++) { + efi_guid_t guid; + + guid = ((efi_config_table_64_t *)p)->guid; + + if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID)) + ((efi_config_table_64_t *)p)->table = data->smbios; + p += sz; + } + early_iounmap(tablep, nr_tables * sz); + +out_memremap: + early_iounmap(data, sizeof(*data)); +out: + return ret; +} + +static void get_nr_runtime_map(void) +{ + if (!efi_setup) + return; + + nr_efi_runtime_map = (efi_data_len - sizeof(struct efi_setup_data)) / + sizeof(efi_memory_desc_t); +} + void __init efi_init(void) { efi_char16_t *c16; @@ -638,6 +718,7 @@ void __init efi_init(void) int i = 0; void *tmp; + get_nr_runtime_map(); #ifdef CONFIG_X86_32 if (boot_params.efi_info.efi_systab_hi || boot_params.efi_info.efi_memmap_hi) { @@ -676,6 +757,9 @@ void __init efi_init(void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); + if (efi_reuse_config(efi.systab->tables, efi.systab->nr_tables)) + return; + if (efi_config_init(arch_tables)) return; @@ -861,6 +945,23 @@ out: } /* + * Map efi regions which were passed via setup_data. The virt_addr is a fixed + * addr which was used in first kernel of a kexec boot. + */ +static void __init efi_map_regions_fixed(void) +{ + void *p; + efi_memory_desc_t *md; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + efi_map_region_fixed(md); /* FIXME: add error handling */ + get_systab_virt_addr(md); + } + +} + +/* * Map efi memory ranges for runtime serivce and update new_memmap with virtual * addresses. */ @@ -914,6 +1015,10 @@ out: * so that we're in a different address space when calling a runtime * function. For function arguments passing we do copy the PGDs of the * kernel page table into ->trampoline_pgd prior to each call. + * + * Specially for kexec boot, efi runtime maps in previous kernel should + * be passed in via setup_data. In that case runtime ranges will be mapped + * to the same virtual addresses as the first kernel. */ void __init efi_enter_virtual_mode(void) { @@ -932,12 +1037,15 @@ void __init efi_enter_virtual_mode(void) return; } - efi_merge_regions(); - - new_memmap = efi_map_regions(&count); - if (!new_memmap) { - pr_err("Error reallocating memory, EFI runtime non-functional!\n"); - return; + if (efi_setup) { + efi_map_regions_fixed(); + } else { + efi_merge_regions(); + new_memmap = efi_map_regions(&count); + if (!new_memmap) { + pr_err("Error reallocating memory, EFI runtime non-functional!\n"); + return; + } } err = save_runtime_map(); @@ -949,16 +1057,18 @@ void __init efi_enter_virtual_mode(void) efi_setup_page_tables(); efi_sync_low_kernel_mappings(); - status = phys_efi_set_virtual_address_map( - memmap.desc_size * count, - memmap.desc_size, - memmap.desc_version, - (efi_memory_desc_t *)__pa(new_memmap)); - - if (status != EFI_SUCCESS) { - pr_alert("Unable to switch EFI into virtual mode " - "(status=%lx)!\n", status); - panic("EFI call to SetVirtualAddressMap() failed!"); + if (!efi_setup) { + status = phys_efi_set_virtual_address_map( + memmap.desc_size * count, + memmap.desc_size, + memmap.desc_version, + (efi_memory_desc_t *)__pa(new_memmap)); + + if (status != EFI_SUCCESS) { + pr_alert("Unable to switch EFI into virtual mode (status=%lx)!\n", + status); + panic("EFI call to SetVirtualAddressMap() failed!"); + } } /* @@ -993,8 +1103,6 @@ void __init efi_enter_virtual_mode(void) EFI_VARIABLE_BOOTSERVICE_ACCESS | EFI_VARIABLE_RUNTIME_ACCESS, 0, NULL); - - return; } /* diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 7b3ec6e..249b183 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -48,6 +48,7 @@ void __init efi_map_region(efi_memory_desc_t *md) } void __init efi_map_region_fixed(efi_memory_desc_t *md) {} +void __init parse_efi_setup(u64 phys_addr, u32 data_len) {} void efi_call_phys_prelog(void) { diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ff08cb1..324b651 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -228,3 +228,9 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, return (void __iomem *)__va(phys_addr); } + +void __init parse_efi_setup(u64 phys_addr, u32 data_len) +{ + efi_setup = phys_addr + sizeof(struct setup_data); + efi_data_len = data_len - sizeof(struct setup_data); +} -- cgit v1.1 From 456a29ddada79198c5965300e04103c40c481f62 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:20 +0800 Subject: x86: Add xloadflags bit for EFI runtime support on kexec Old kexec-tools can not load new kernels. The reason is kexec-tools does not fill efi_info in x86 setup header previously, thus EFI failed to initialize. In new kexec-tools it will by default to fill efi_info and pass other EFI required infomation to 2nd kernel so kexec kernel EFI initialization can succeed finally. To prevent from breaking userspace, add a new xloadflags bit so kexec-tools can check the flag and switch to old logic. Signed-off-by: Dave Young Acked-by: Borislav Petkov Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/boot/header.S | 9 ++++++++- arch/x86/include/uapi/asm/bootparam.h | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 9ec06a1..ec3b8ba 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -391,7 +391,14 @@ xloadflags: #else # define XLF23 0 #endif - .word XLF0 | XLF1 | XLF23 + +#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC) +# define XLF4 XLF_EFI_KEXEC +#else +# define XLF4 0 +#endif + + .word XLF0 | XLF1 | XLF23 | XLF4 cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, #added with boot protocol diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 64fe421..225b098 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -24,6 +24,7 @@ #define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1) #define XLF_EFI_HANDOVER_32 (1<<2) #define XLF_EFI_HANDOVER_64 (1<<3) +#define XLF_EFI_KEXEC (1<<4) #ifndef __ASSEMBLY__ -- cgit v1.1 From 5039e316dde3fb71c79e95e97c5bca8e4724d8f2 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:21 +0800 Subject: x86: Export x86 boot_params to sysfs kexec-tools use boot_params for getting the 1st kernel hardware_subarch, the kexec kernel EFI runtime support also needs to read the old efi_info from boot_params. Currently it exists in debugfs which is not a good place for such infomation. Per HPA, we should avoid "sploit debugfs". In this patch /sys/kernel/boot_params are exported, also the setup_data is exported as a subdirectory. kexec-tools is using debugfs for hardware_subarch for a long time now so we're not removing it yet. Structure is like below: /sys/kernel/boot_params |__ data /* boot_params in binary*/ |__ setup_data | |__ 0 /* the first setup_data node */ | | |__ data /* setup_data node 0 in binary*/ | | |__ type /* setup_data type of setup_data node 0, hex string */ [snip] |__ version /* boot protocal version (in hex, "0x" prefixed)*/ Signed-off-by: Dave Young Acked-by: Borislav Petkov Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/ksysfs.c | 339 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 340 insertions(+) create mode 100644 arch/x86/kernel/ksysfs.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9b0a34e..510cca5 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-y += syscall_$(BITS).o obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o +obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c new file mode 100644 index 0000000..eb53d15 --- /dev/null +++ b/arch/x86/kernel/ksysfs.c @@ -0,0 +1,339 @@ +/* + * Architecture specific sysfs attributes in /sys/kernel + * + * Copyright (C) 2007, Intel Corp. + * Huang Ying + * Copyright (C) 2013, 2013 Red Hat, Inc. + * Dave Young + * + * This file is released under the GPLv2 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +static ssize_t version_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "0x%04x\n", boot_params.hdr.version); +} + +static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version); + +static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + memcpy(buf, (void *)&boot_params + off, count); + return count; +} + +static struct bin_attribute boot_params_data_attr = { + .attr = { + .name = "data", + .mode = S_IRUGO, + }, + .read = boot_params_data_read, + .size = sizeof(boot_params), +}; + +static struct attribute *boot_params_version_attrs[] = { + &boot_params_version_attr.attr, + NULL, +}; + +static struct bin_attribute *boot_params_data_attrs[] = { + &boot_params_data_attr, + NULL, +}; + +static struct attribute_group boot_params_attr_group = { + .attrs = boot_params_version_attrs, + .bin_attrs = boot_params_data_attrs, +}; + +static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr) +{ + const char *name; + + name = kobject_name(kobj); + return kstrtoint(name, 10, nr); +} + +static int get_setup_data_paddr(int nr, u64 *paddr) +{ + int i = 0; + struct setup_data *data; + u64 pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + if (nr == i) { + *paddr = pa_data; + return 0; + } + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) + return -ENOMEM; + + pa_data = data->next; + iounmap(data); + i++; + } + return -EINVAL; +} + +static int __init get_setup_data_size(int nr, size_t *size) +{ + int i = 0; + struct setup_data *data; + u64 pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) + return -ENOMEM; + if (nr == i) { + *size = data->len; + iounmap(data); + return 0; + } + + pa_data = data->next; + iounmap(data); + i++; + } + return -EINVAL; +} + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int nr, ret; + u64 paddr; + struct setup_data *data; + + ret = kobj_to_setup_data_nr(kobj, &nr); + if (ret) + return ret; + + ret = get_setup_data_paddr(nr, &paddr); + if (ret) + return ret; + data = ioremap_cache(paddr, sizeof(*data)); + if (!data) + return -ENOMEM; + + ret = sprintf(buf, "0x%x\n", data->type); + iounmap(data); + return ret; +} + +static ssize_t setup_data_data_read(struct file *fp, + struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, + loff_t off, size_t count) +{ + int nr, ret = 0; + u64 paddr; + struct setup_data *data; + void *p; + + ret = kobj_to_setup_data_nr(kobj, &nr); + if (ret) + return ret; + + ret = get_setup_data_paddr(nr, &paddr); + if (ret) + return ret; + data = ioremap_cache(paddr, sizeof(*data)); + if (!data) + return -ENOMEM; + + if (off > data->len) { + ret = -EINVAL; + goto out; + } + + if (count > data->len - off) + count = data->len - off; + + if (!count) + goto out; + + ret = count; + p = ioremap_cache(paddr + sizeof(*data), data->len); + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(buf, p + off, count); + iounmap(p); +out: + iounmap(data); + return ret; +} + +static struct kobj_attribute type_attr = __ATTR_RO(type); + +static struct bin_attribute data_attr = { + .attr = { + .name = "data", + .mode = S_IRUGO, + }, + .read = setup_data_data_read, +}; + +static struct attribute *setup_data_type_attrs[] = { + &type_attr.attr, + NULL, +}; + +static struct bin_attribute *setup_data_data_attrs[] = { + &data_attr, + NULL, +}; + +static struct attribute_group setup_data_attr_group = { + .attrs = setup_data_type_attrs, + .bin_attrs = setup_data_data_attrs, +}; + +static int __init create_setup_data_node(struct kobject *parent, + struct kobject **kobjp, int nr) +{ + int ret = 0; + size_t size; + struct kobject *kobj; + char name[16]; /* should be enough for setup_data nodes numbers */ + snprintf(name, 16, "%d", nr); + + kobj = kobject_create_and_add(name, parent); + if (!kobj) + return -ENOMEM; + + ret = get_setup_data_size(nr, &size); + if (ret) + goto out_kobj; + + data_attr.size = size; + ret = sysfs_create_group(kobj, &setup_data_attr_group); + if (ret) + goto out_kobj; + *kobjp = kobj; + + return 0; +out_kobj: + kobject_put(kobj); + return ret; +} + +static void __init cleanup_setup_data_node(struct kobject *kobj) +{ + sysfs_remove_group(kobj, &setup_data_attr_group); + kobject_put(kobj); +} + +static int __init get_setup_data_total_num(u64 pa_data, int *nr) +{ + int ret = 0; + struct setup_data *data; + + *nr = 0; + while (pa_data) { + *nr += 1; + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) { + ret = -ENOMEM; + goto out; + } + pa_data = data->next; + iounmap(data); + } + +out: + return ret; +} + +static int __init create_setup_data_nodes(struct kobject *parent) +{ + struct kobject *setup_data_kobj, **kobjp; + u64 pa_data; + int i, j, nr, ret = 0; + + pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return 0; + + setup_data_kobj = kobject_create_and_add("setup_data", parent); + if (!setup_data_kobj) { + ret = -ENOMEM; + goto out; + } + + ret = get_setup_data_total_num(pa_data, &nr); + if (ret) + goto out_setup_data_kobj; + + kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL); + if (!kobjp) { + ret = -ENOMEM; + goto out_setup_data_kobj; + } + + for (i = 0; i < nr; i++) { + ret = create_setup_data_node(setup_data_kobj, kobjp + i, i); + if (ret) + goto out_clean_nodes; + } + + kfree(kobjp); + return 0; + +out_clean_nodes: + for (j = i - 1; j > 0; j--) + cleanup_setup_data_node(*(kobjp + j)); + kfree(kobjp); +out_setup_data_kobj: + kobject_put(setup_data_kobj); +out: + return ret; +} + +static int __init boot_params_ksysfs_init(void) +{ + int ret; + struct kobject *boot_params_kobj; + + boot_params_kobj = kobject_create_and_add("boot_params", + kernel_kobj); + if (!boot_params_kobj) { + ret = -ENOMEM; + goto out; + } + + ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group); + if (ret) + goto out_boot_params_kobj; + + ret = create_setup_data_nodes(boot_params_kobj); + if (ret) + goto out_create_group; + + return 0; +out_create_group: + sysfs_remove_group(boot_params_kobj, &boot_params_attr_group); +out_boot_params_kobj: + kobject_put(boot_params_kobj); +out: + return ret; +} + +arch_initcall(boot_params_ksysfs_init); -- cgit v1.1 From 77ea8c948953a90401e436e7c05973b2d5529804 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:22 +0800 Subject: x86: Reserve setup_data ranges late after parsing memmap cmdline Currently e820_reserve_setup_data() is called before parsing early params, it works in normal case. But for memmap=exactmap, the final memory ranges are created after parsing memmap= cmdline params, so the previous e820_reserve_setup_data() has no effect. For example, setup_data ranges will still be marked as normal system ram, thus when later sysfs driver ioremap them kernel will warn about mapping normal ram. This patch fix it by moving the e820_reserve_setup_data() callback after parsing early params so they can be set as reserved ranges and later ioremap will be fine with it. Signed-off-by: Dave Young Acked-by: Borislav Petkov Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 24536f7..be4b456 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -927,8 +927,6 @@ void __init setup_arch(char **cmdline_p) iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); - /* update the e820_saved too */ - e820_reserve_setup_data(); copy_edd(); @@ -990,6 +988,8 @@ void __init setup_arch(char **cmdline_p) early_dump_pci_devices(); #endif + /* update the e820_saved too */ + e820_reserve_setup_data(); finish_e820_parsing(); if (efi_enabled(EFI_BOOT)) -- cgit v1.1 From 518548abd61808ea1e31614ccbdae34d3c32dfa4 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Sat, 21 Dec 2013 16:09:46 +0000 Subject: x86/efi: Delete superfluous global variables There's no need to save the runtime map details in global variables, the values are only required to pass to efi_runtime_map_setup(). And because 'nr_efi_runtime_map' isn't needed, get_nr_runtime_map() can be deleted along with 'efi_data_len'. Cc: Dave Young Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 18 +----------------- arch/x86/platform/efi/efi_64.c | 1 - 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 9965ff4..7ed3ecf 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -76,10 +76,7 @@ static __initdata efi_config_table_type_t arch_tables[] = { {NULL_GUID, NULL, NULL}, }; -static void *efi_runtime_map; -static int nr_efi_runtime_map; u64 efi_setup; /* efi setup_data physical address */ -u32 efi_data_len; /* efi setup_data payload length */ /* * Returns 1 if 'facility' is enabled, 0 otherwise. @@ -702,15 +699,6 @@ out: return ret; } -static void get_nr_runtime_map(void) -{ - if (!efi_setup) - return; - - nr_efi_runtime_map = (efi_data_len - sizeof(struct efi_setup_data)) / - sizeof(efi_memory_desc_t); -} - void __init efi_init(void) { efi_char16_t *c16; @@ -718,7 +706,6 @@ void __init efi_init(void) int i = 0; void *tmp; - get_nr_runtime_map(); #ifdef CONFIG_X86_32 if (boot_params.efi_info.efi_systab_hi || boot_params.efi_info.efi_memmap_hi) { @@ -933,10 +920,7 @@ static int __init save_runtime_map(void) count++; } - efi_runtime_map = q; - nr_efi_runtime_map = count; - efi_runtime_map_setup(efi_runtime_map, nr_efi_runtime_map, - boot_params.efi_info.efi_memdesc_size); + efi_runtime_map_setup(q, count, memmap.desc_size); return 0; out: diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 324b651..6284f15 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -232,5 +232,4 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, void __init parse_efi_setup(u64 phys_addr, u32 data_len) { efi_setup = phys_addr + sizeof(struct setup_data); - efi_data_len = data_len - sizeof(struct setup_data); } -- cgit v1.1 From 41a34cec2e0df7798ea322ed1480bc3d3facdc8e Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 3 Jan 2014 11:54:31 +0800 Subject: x86: ksysfs.c build fix kbuild test robot report below error for randconfig: arch/x86/kernel/ksysfs.c: In function 'get_setup_data_paddr': arch/x86/kernel/ksysfs.c:81:3: error: implicit declaration of function 'ioremap_cache' [-Werror=implicit-function-declaration] arch/x86/kernel/ksysfs.c:86:3: error: implicit declaration of function 'iounmap' [-Werror=implicit-function-declaration] Fix it by including in ksysfs.c Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- arch/x86/kernel/ksysfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index eb53d15..c2bedae 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -17,6 +17,7 @@ #include #include +#include #include static ssize_t version_show(struct kobject *kobj, -- cgit v1.1 From 5c12af0c41e3417e1939095325920463b5f8e726 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 3 Jan 2014 11:56:49 +0800 Subject: x86/efi: parse_efi_setup() build fix In case without CONFIG_EFI, there will be below build error: arch/x86/built-in.o: In function `setup_arch': (.init.text+0x9dc): undefined reference to `parse_efi_setup' Thus fix it by adding blank inline function in asm/efi.h Also remove an unused declaration for variable efi_data_len. Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 4d1ba80..3b978c4 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -142,8 +142,6 @@ struct efi_setup_data { }; extern u64 efi_setup; -extern u32 efi_data_len; -extern void parse_efi_setup(u64 phys_addr, u32 data_len); #ifdef CONFIG_EFI @@ -153,7 +151,7 @@ static inline bool efi_is_native(void) } extern struct console early_efi_console; - +extern void parse_efi_setup(u64 phys_addr, u32 data_len); #else /* * IF EFI is not configured, have the EFI calls return -ENOSYS. @@ -165,6 +163,7 @@ extern struct console early_efi_console; #define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS) #define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) #define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) +static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} #endif /* CONFIG_EFI */ #endif /* _ASM_X86_EFI_H */ -- cgit v1.1 From dd360393f4d948eb518372316e52101cf3b44212 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 23 Dec 2013 14:16:58 +0200 Subject: x86, cpu: Detect more TLB configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Intel Software Developer’s Manual covers few more TLB configurations exposed as CPUID 2 descriptors: 61H Instruction TLB: 4 KByte pages, fully associative, 48 entries 63H Data TLB: 1 GByte pages, 4-way set associative, 4 entries 76H Instruction TLB: 2M/4M pages, fully associative, 8 entries B5H Instruction TLB: 4KByte pages, 8-way set associative, 64 entries B6H Instruction TLB: 4KByte pages, 8-way set associative, 128 entries C1H Shared 2nd-Level TLB: 4 KByte/2MByte pages, 8-way associative, 1024 entries C2H DTLB DTLB: 2 MByte/$MByte pages, 4-way associative, 16 entries Let's detect them as well. Signed-off-by: Kirill A. Shutemov Link: http://lkml.kernel.org/r/1387801018-14499-1-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/common.c | 7 ++++--- arch/x86/kernel/cpu/intel.c | 26 ++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7b034a4..1dd6260 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -72,6 +72,7 @@ extern u16 __read_mostly tlb_lli_4m[NR_INFO]; extern u16 __read_mostly tlb_lld_4k[NR_INFO]; extern u16 __read_mostly tlb_lld_2m[NR_INFO]; extern u16 __read_mostly tlb_lld_4m[NR_INFO]; +extern u16 __read_mostly tlb_lld_1g[NR_INFO]; extern s8 __read_mostly tlb_flushall_shift; /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6abc172..24b6fd1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -472,6 +472,7 @@ u16 __read_mostly tlb_lli_4m[NR_INFO]; u16 __read_mostly tlb_lld_4k[NR_INFO]; u16 __read_mostly tlb_lld_2m[NR_INFO]; u16 __read_mostly tlb_lld_4m[NR_INFO]; +u16 __read_mostly tlb_lld_1g[NR_INFO]; /* * tlb_flushall_shift shows the balance point in replacing cr3 write @@ -486,13 +487,13 @@ void cpu_detect_tlb(struct cpuinfo_x86 *c) if (this_cpu->c_detect_tlb) this_cpu->c_detect_tlb(c); - printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ - "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ + printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" + "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n" "tlb_flushall_shift: %d\n", tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], - tlb_flushall_shift); + tlb_lld_1g[ENTRIES], tlb_flushall_shift); } void detect_ht(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ea04b34..5eb7ea5 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -506,6 +506,7 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) #define TLB_DATA0_2M_4M 0x23 #define STLB_4K 0x41 +#define STLB_4K_2M 0x42 static const struct _tlb_table intel_tlb_table[] = { { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, @@ -526,13 +527,20 @@ static const struct _tlb_table intel_tlb_table[] = { { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, + { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" }, + { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, + { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set ssociative" }, + { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set ssociative" }, { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, + { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, + { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" }, { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, { 0x00, 0, 0 } }; @@ -558,6 +566,20 @@ static void intel_tlb_lookup(const unsigned char desc) if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; break; + case STLB_4K_2M: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; case TLB_INST_ALL: if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; @@ -603,6 +625,10 @@ static void intel_tlb_lookup(const unsigned char desc) if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; break; + case TLB_DATA_1G: + if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; + break; } } -- cgit v1.1 From df90ca969035d3f6c95044e272f75bf417b14245 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Jan 2014 16:45:00 -0500 Subject: x86, sparse: Do not force removal of __user when calling copy_to/from_user_nocheck() Commit ff47ab4ff3cdd "x86: Add 1/2/4/8 byte optimization to 64bit __copy_{from,to}_user_inatomic" added a "_nocheck" call in between the copy_to/from_user() and copy_user_generic(). As both the normal and nocheck versions of theses calls use the proper __user annotation, a typecast to remove it should not be added. This causes sparse to spin out the following warnings: arch/x86/include/asm/uaccess_64.h:207:47: warning: incorrect type in argument 2 (different address spaces) arch/x86/include/asm/uaccess_64.h:207:47: expected void const [noderef] *src arch/x86/include/asm/uaccess_64.h:207:47: got void const * arch/x86/include/asm/uaccess_64.h:207:47: warning: incorrect type in argument 2 (different address spaces) arch/x86/include/asm/uaccess_64.h:207:47: expected void const [noderef] *src arch/x86/include/asm/uaccess_64.h:207:47: got void const * arch/x86/include/asm/uaccess_64.h:207:47: warning: incorrect type in argument 2 (different address spaces) arch/x86/include/asm/uaccess_64.h:207:47: expected void const [noderef] *src arch/x86/include/asm/uaccess_64.h:207:47: got void const * arch/x86/include/asm/uaccess_64.h:207:47: warning: incorrect type in argument 2 (different address spaces) arch/x86/include/asm/uaccess_64.h:207:47: expected void const [noderef] *src arch/x86/include/asm/uaccess_64.h:207:47: got void const * Cc: Andi Kleen Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/r/20140103164500.5f6478f5@gandalf.local.home Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uaccess_64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 190413d..12a26b9 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -204,13 +204,13 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) static __must_check __always_inline int __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size) { - return __copy_from_user_nocheck(dst, (__force const void *)src, size); + return __copy_from_user_nocheck(dst, src, size); } static __must_check __always_inline int __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) { - return __copy_to_user_nocheck((__force void *)dst, src, size); + return __copy_to_user_nocheck(dst, src, size); } extern long __copy_user_nocache(void *dst, const void __user *src, -- cgit v1.1 From e24df921af484e9b7095ee06d236dabdb3bd9c1c Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 3 Jan 2014 22:30:42 +0000 Subject: x86, boot: Use .code16 instead of .code16gcc This reverts commit 28b48688 ("x86, boot: use .code16gcc instead of .code16"). Versions of binutils older than 2.16 are already not working, so this workaround is no longer necessary either. At the same time, some of the transformations that .code16gcc does can be *extremely* counterintuitive to a human programmer. [ hpa: folded ret -> retl and call -> calll fixes from followup patch ] Signed-off-by: David Woodhouse Link: http://lkml.kernel.org/r/1388788242.2391.75.camel@shinybook.infradead.org Signed-off-by: H. Peter Anvin --- arch/x86/boot/bioscall.S | 2 +- arch/x86/boot/copy.S | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S index 1dfbf64..5077937 100644 --- a/arch/x86/boot/bioscall.S +++ b/arch/x86/boot/bioscall.S @@ -13,7 +13,7 @@ * touching registers they shouldn't be. */ - .code16gcc + .code16 .text .globl intcall .type intcall, @function diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S index 11f272c..1eb7d29 100644 --- a/arch/x86/boot/copy.S +++ b/arch/x86/boot/copy.S @@ -14,7 +14,7 @@ * Memory copy routines */ - .code16gcc + .code16 .text GLOBAL(memcpy) @@ -30,7 +30,7 @@ GLOBAL(memcpy) rep; movsb popw %di popw %si - ret + retl ENDPROC(memcpy) GLOBAL(memset) @@ -45,25 +45,25 @@ GLOBAL(memset) andw $3, %cx rep; stosb popw %di - ret + retl ENDPROC(memset) GLOBAL(copy_from_fs) pushw %ds pushw %fs popw %ds - call memcpy + calll memcpy popw %ds - ret + retl ENDPROC(copy_from_fs) GLOBAL(copy_to_fs) pushw %es pushw %fs popw %es - call memcpy + calll memcpy popw %es - ret + retl ENDPROC(copy_to_fs) #if 0 /* Not currently used, but can be enabled as needed */ @@ -71,17 +71,17 @@ GLOBAL(copy_from_gs) pushw %ds pushw %gs popw %ds - call memcpy + calll memcpy popw %ds - ret + retl ENDPROC(copy_from_gs) GLOBAL(copy_to_gs) pushw %es pushw %gs popw %es - call memcpy + calll memcpy popw %es - ret + retl ENDPROC(copy_to_gs) #endif -- cgit v1.1 From 6f34152f54bcf63a49ea701643c81f5b2168fec8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 4 Jan 2014 14:29:08 -0800 Subject: x86, boot: Move intcall() to the .inittext section The .inittext section tries to aggregate all functions which are needed to get a message out in the case of a load failure. However, putchar() uses intcall(), so intcall() should be in the .inittext section. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-twxm8igouzbmsklmf6lfyq0w@git.kernel.org --- arch/x86/boot/bioscall.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S index 5077937..d401b4a 100644 --- a/arch/x86/boot/bioscall.S +++ b/arch/x86/boot/bioscall.S @@ -1,6 +1,6 @@ /* ----------------------------------------------------------------------- * - * Copyright 2009 Intel Corporation; author H. Peter Anvin + * Copyright 2009-2014 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2 or (at your @@ -14,7 +14,7 @@ */ .code16 - .text + .section ".inittext","ax" .globl intcall .type intcall, @function intcall: -- cgit v1.1 From 663b55b9b39fa9c848cca273ca4e12bf29b32c71 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 6 Jan 2014 19:20:26 -0500 Subject: x86: Delete non-required instances of include None of these files are actually using any __init type directives and hence don't need to include . Most are just a left over from __devinit and __cpuinit removal, or simply due to code getting copied from one driver to the next. [ hpa: undid incorrect removal from arch/x86/kernel/head_32.S ] Signed-off-by: Paul Gortmaker Link: http://lkml.kernel.org/r/1389054026-12947-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 1 - arch/x86/include/asm/mpspec.h | 1 - arch/x86/include/asm/processor.h | 1 - arch/x86/include/asm/ptrace.h | 1 - arch/x86/include/asm/smp.h | 1 - arch/x86/include/asm/timer.h | 1 - arch/x86/kernel/apic/apic_flat_64.c | 1 - arch/x86/kernel/apic/apic_noop.c | 1 - arch/x86/kernel/apic/ipi.c | 1 - arch/x86/kernel/apic/summit_32.c | 1 - arch/x86/kernel/apic/x2apic_cluster.c | 1 - arch/x86/kernel/apic/x2apic_phys.c | 1 - arch/x86/kernel/cpu/amd.c | 1 - arch/x86/kernel/cpu/centaur.c | 1 - arch/x86/kernel/cpu/cyrix.c | 1 - arch/x86/kernel/cpu/intel.c | 1 - arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 - arch/x86/kernel/cpu/mcheck/p5.c | 1 - arch/x86/kernel/cpu/mcheck/winchip.c | 1 - arch/x86/kernel/cpu/transmeta.c | 1 - arch/x86/kernel/cpu/umc.c | 1 - arch/x86/kernel/crash.c | 1 - arch/x86/kernel/doublefault.c | 1 - arch/x86/kernel/hw_breakpoint.c | 1 - arch/x86/kernel/kgdb.c | 1 - arch/x86/kernel/machine_kexec_32.c | 1 - arch/x86/kernel/pci-nommu.c | 1 - arch/x86/kernel/process_32.c | 1 - arch/x86/kernel/tsc_sync.c | 1 - arch/x86/lib/delay.c | 1 - arch/x86/mm/kmmio.c | 1 - arch/x86/mm/pageattr-test.c | 1 - arch/x86/pci/fixup.c | 1 - arch/x86/platform/intel-mid/early_printk_intel_mid.c | 1 - arch/x86/platform/iris/iris.c | 1 - arch/x86/realmode/rm/reboot.S | 1 - arch/x86/realmode/rm/trampoline_32.S | 1 - arch/x86/realmode/rm/trampoline_64.S | 1 - arch/x86/um/vdso/vdso.S | 1 - arch/x86/vdso/vdso.S | 1 - arch/x86/vdso/vdsox32.S | 1 - 41 files changed, 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c696a86..6e4ce2d 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -118,7 +118,6 @@ extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); #include -#include #include extern int mce_p5_enabled; diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 3142a94..3e6b492 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_MPSPEC_H #define _ASM_X86_MPSPEC_H -#include #include #include diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7b034a4..8ade617 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -27,7 +27,6 @@ struct mm_struct; #include #include #include -#include #include #include diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 942a086..14fd6fd 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -60,7 +60,6 @@ struct pt_regs { #endif /* !__i386__ */ -#include #ifdef CONFIG_PARAVIRT #include #endif diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4137890..8cd27e0 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -2,7 +2,6 @@ #define _ASM_X86_SMP_H #ifndef __ASSEMBLY__ #include -#include #include /* diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 34baa0e..a6f3e77 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -1,6 +1,5 @@ #ifndef _ASM_X86_TIMER_H #define _ASM_X86_TIMER_H -#include #include #include #include diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 00c77cf..5d5b9eb 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index e145f28..191ce75 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 7434d85..6207156 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -1,6 +1,5 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 77c95c0..00146f9 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -29,7 +29,6 @@ #define pr_fmt(fmt) "summit: %s: " fmt, __func__ #include -#include #include #include diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 140e29d..cac85ee 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -3,7 +3,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 562a76d..de231e3 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -3,7 +3,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bca023b..39bc78d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1,5 +1,4 @@ #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 8d5652d..8779eda 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,6 +1,5 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index d0969c7..aaf152e 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -1,4 +1,3 @@ -#include #include #include #include diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index dc1ec0d..53f5d3c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,4 +1,3 @@ -#include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 4cfe045..fb6156f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -6,7 +6,6 @@ */ #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 1c044b1..a304298 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index e9a701a..7dc5564 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index aa0430d..3fa0e5a 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,6 +1,5 @@ #include #include -#include #include #include #include "cpu.h" diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index 75c5ad5..ef9c2a0 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -1,5 +1,4 @@ #include -#include #include #include "cpu.h" diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 18677a9..a57902e 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -7,7 +7,6 @@ * */ -#include #include #include #include diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 5d3fe8d..f6dfd93 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -1,6 +1,5 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index f66ff16..a67b47c 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 836f832..7ec1d5f 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 5b19e4d..1667b1d 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 871be4a..da15918 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6f1236c..0de43e9 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index adfdf56..2648848 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -16,7 +16,6 @@ */ #include #include -#include #include #include #include diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 7c3bee6..39d6a3d 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index e5d5e2c..637ab34 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index d0b1773..461bc82 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index b046e07..bca9e85 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/platform/intel-mid/early_printk_intel_mid.c b/arch/x86/platform/intel-mid/early_printk_intel_mid.c index 4f702f5..e0bd082 100644 --- a/arch/x86/platform/intel-mid/early_printk_intel_mid.c +++ b/arch/x86/platform/intel-mid/early_printk_intel_mid.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c index e6cb80f..4d171e8 100644 --- a/arch/x86/platform/iris/iris.c +++ b/arch/x86/platform/iris/iris.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S index f932ea6..d66c607 100644 --- a/arch/x86/realmode/rm/reboot.S +++ b/arch/x86/realmode/rm/reboot.S @@ -1,5 +1,4 @@ #include -#include #include #include #include diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index c1b2791..48ddd76 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -20,7 +20,6 @@ */ #include -#include #include #include #include "realmode.h" diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index bb360dc..dac7b20 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -25,7 +25,6 @@ */ #include -#include #include #include #include diff --git a/arch/x86/um/vdso/vdso.S b/arch/x86/um/vdso/vdso.S index 1cb468a..4b4bd4c 100644 --- a/arch/x86/um/vdso/vdso.S +++ b/arch/x86/um/vdso/vdso.S @@ -1,4 +1,3 @@ -#include __INITDATA diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S index 01f5e3b..1e13eb8 100644 --- a/arch/x86/vdso/vdso.S +++ b/arch/x86/vdso/vdso.S @@ -1,6 +1,5 @@ #include #include -#include __PAGE_ALIGNED_DATA diff --git a/arch/x86/vdso/vdsox32.S b/arch/x86/vdso/vdsox32.S index d6b9a7f..295f1c7 100644 --- a/arch/x86/vdso/vdsox32.S +++ b/arch/x86/vdso/vdsox32.S @@ -1,6 +1,5 @@ #include #include -#include __PAGE_ALIGNED_DATA -- cgit v1.1 From 46184415368a6095d5da33991c5e011f1084353d Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 8 Jan 2014 13:27:51 -0800 Subject: arch: x86: New MailBox support driver for Intel SOC's Current Intel SOC cores use a MailBox Interface (MBI) to provide access to configuration registers on devices (called units) connected to the system fabric. This is a support driver that implements access to this interface on those platforms that can enumerate the device using PCI. Initial support is for BayTrail, for which port definitons are provided. This is a requirement for implementing platform specific features (e.g. RAPL driver requires this to perform platform specific power management using the registers in PUNIT). Dependant modules should select IOSF_MBI in their respective Kconfig configuraiton. Serialized access is handled by all exported routines with spinlocks. The API includes 3 functions for access to unit registers: int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr) int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr) int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) port: indicating the unit being accessed opcode: the read or write port specific opcode offset: the register offset within the port mdr: the register data to be read, written, or modified mask: bit locations in mdr to change Returns nonzero on error Note: GPU code handles access to the GFX unit. Therefore access to that unit with this driver is disallowed to avoid conflicts. Signed-off-by: David E. Box Link: http://lkml.kernel.org/r/1389216471-734-1-git-send-email-david.e.box@linux.intel.com Signed-off-by: H. Peter Anvin Cc: Rafael J. Wysocki Cc: Matthew Garrett --- arch/x86/Kconfig | 8 ++ arch/x86/include/asm/iosf_mbi.h | 90 ++++++++++++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/iosf_mbi.c | 226 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 325 insertions(+) create mode 100644 arch/x86/include/asm/iosf_mbi.h create mode 100644 arch/x86/kernel/iosf_mbi.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0952ecd..ca5959a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2393,6 +2393,14 @@ config X86_DMA_REMAP bool depends on STA2X11 +config IOSF_MBI + bool + depends on PCI + ---help--- + To be selected by modules requiring access to the Intel OnChip System + Fabric (IOSF) Sideband MailBox Interface (MBI). For MBI platforms + enumerable by PCI. + source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h new file mode 100644 index 0000000..8e71c79 --- /dev/null +++ b/arch/x86/include/asm/iosf_mbi.h @@ -0,0 +1,90 @@ +/* + * iosf_mbi.h: Intel OnChip System Fabric MailBox access support + */ + +#ifndef IOSF_MBI_SYMS_H +#define IOSF_MBI_SYMS_H + +#define MBI_MCR_OFFSET 0xD0 +#define MBI_MDR_OFFSET 0xD4 +#define MBI_MCRX_OFFSET 0xD8 + +#define MBI_RD_MASK 0xFEFFFFFF +#define MBI_WR_MASK 0X01000000 + +#define MBI_MASK_HI 0xFFFFFF00 +#define MBI_MASK_LO 0x000000FF +#define MBI_ENABLE 0xF0 + +/* Baytrail available units */ +#define BT_MBI_UNIT_AUNIT 0x00 +#define BT_MBI_UNIT_SMC 0x01 +#define BT_MBI_UNIT_CPU 0x02 +#define BT_MBI_UNIT_BUNIT 0x03 +#define BT_MBI_UNIT_PMC 0x04 +#define BT_MBI_UNIT_GFX 0x06 +#define BT_MBI_UNIT_SMI 0x0C +#define BT_MBI_UNIT_USB 0x43 +#define BT_MBI_UNIT_SATA 0xA3 +#define BT_MBI_UNIT_PCIE 0xA6 + +/* Baytrail read/write opcodes */ +#define BT_MBI_AUNIT_READ 0x10 +#define BT_MBI_AUNIT_WRITE 0x11 +#define BT_MBI_SMC_READ 0x10 +#define BT_MBI_SMC_WRITE 0x11 +#define BT_MBI_CPU_READ 0x10 +#define BT_MBI_CPU_WRITE 0x11 +#define BT_MBI_BUNIT_READ 0x10 +#define BT_MBI_BUNIT_WRITE 0x11 +#define BT_MBI_PMC_READ 0x06 +#define BT_MBI_PMC_WRITE 0x07 +#define BT_MBI_GFX_READ 0x00 +#define BT_MBI_GFX_WRITE 0x01 +#define BT_MBI_SMIO_READ 0x06 +#define BT_MBI_SMIO_WRITE 0x07 +#define BT_MBI_USB_READ 0x06 +#define BT_MBI_USB_WRITE 0x07 +#define BT_MBI_SATA_READ 0x00 +#define BT_MBI_SATA_WRITE 0x01 +#define BT_MBI_PCIE_READ 0x00 +#define BT_MBI_PCIE_WRITE 0x01 + +/** + * iosf_mbi_read() - MailBox Interface read command + * @port: port indicating subunit being accessed + * @opcode: port specific read or write opcode + * @offset: register address offset + * @mdr: register data to be read + * + * Locking is handled by spinlock - cannot sleep. + * Return: Nonzero on error + */ +int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr); + +/** + * iosf_mbi_write() - MailBox unmasked write command + * @port: port indicating subunit being accessed + * @opcode: port specific read or write opcode + * @offset: register address offset + * @mdr: register data to be written + * + * Locking is handled by spinlock - cannot sleep. + * Return: Nonzero on error + */ +int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr); + +/** + * iosf_mbi_modify() - MailBox masked write command + * @port: port indicating subunit being accessed + * @opcode: port specific read or write opcode + * @offset: register address offset + * @mdr: register data being modified + * @mask: mask indicating bits in mdr to be modified + * + * Locking is handled by spinlock - cannot sleep. + * Return: Nonzero on error + */ +int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask); + +#endif /* IOSF_MBI_SYMS_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9b0a34e..dbe9bd6 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -111,6 +111,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o +obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o ### # 64 bit specific files diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c new file mode 100644 index 0000000..c3aae66 --- /dev/null +++ b/arch/x86/kernel/iosf_mbi.c @@ -0,0 +1,226 @@ +/* + * IOSF-SB MailBox Interface Driver + * Copyright (c) 2013, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * + * The IOSF-SB is a fabric bus available on Atom based SOC's that uses a + * mailbox interface (MBI) to communicate with mutiple devices. This + * driver implements access to this interface for those platforms that can + * enumerate the device using PCI. + */ + +#include +#include +#include +#include + +#include + +static DEFINE_SPINLOCK(iosf_mbi_lock); + +static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) +{ + return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE; +} + +static struct pci_dev *mbi_pdev; /* one mbi device */ + +static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr) +{ + int result; + + if (!mbi_pdev) + return -ENODEV; + + if (mcrx) { + result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET, + mcrx); + if (result < 0) + goto fail_read; + } + + result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr); + if (result < 0) + goto fail_read; + + result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr); + if (result < 0) + goto fail_read; + + return 0; + +fail_read: + dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result); + return result; +} + +static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr) +{ + int result; + + if (!mbi_pdev) + return -ENODEV; + + result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr); + if (result < 0) + goto fail_write; + + if (mcrx) { + result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET, + mcrx); + if (result < 0) + goto fail_write; + } + + result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr); + if (result < 0) + goto fail_write; + + return 0; + +fail_write: + dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result); + return result; +} + +int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr) +{ + u32 mcr, mcrx; + unsigned long flags; + int ret; + + /*Access to the GFX unit is handled by GPU code */ + if (port == BT_MBI_UNIT_GFX) { + WARN_ON(1); + return -EPERM; + } + + mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO); + mcrx = offset & MBI_MASK_HI; + + spin_lock_irqsave(&iosf_mbi_lock, flags); + ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr); + spin_unlock_irqrestore(&iosf_mbi_lock, flags); + + return ret; +} +EXPORT_SYMBOL(iosf_mbi_read); + +int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr) +{ + u32 mcr, mcrx; + unsigned long flags; + int ret; + + /*Access to the GFX unit is handled by GPU code */ + if (port == BT_MBI_UNIT_GFX) { + WARN_ON(1); + return -EPERM; + } + + mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO); + mcrx = offset & MBI_MASK_HI; + + spin_lock_irqsave(&iosf_mbi_lock, flags); + ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr); + spin_unlock_irqrestore(&iosf_mbi_lock, flags); + + return ret; +} +EXPORT_SYMBOL(iosf_mbi_write); + +int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) +{ + u32 mcr, mcrx; + u32 value; + unsigned long flags; + int ret; + + /*Access to the GFX unit is handled by GPU code */ + if (port == BT_MBI_UNIT_GFX) { + WARN_ON(1); + return -EPERM; + } + + mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO); + mcrx = offset & MBI_MASK_HI; + + spin_lock_irqsave(&iosf_mbi_lock, flags); + + /* Read current mdr value */ + ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value); + if (ret < 0) { + spin_unlock_irqrestore(&iosf_mbi_lock, flags); + return ret; + } + + /* Apply mask */ + value &= ~mask; + mdr &= mask; + value |= mdr; + + /* Write back */ + ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value); + + spin_unlock_irqrestore(&iosf_mbi_lock, flags); + + return ret; +} +EXPORT_SYMBOL(iosf_mbi_modify); + +static int iosf_mbi_probe(struct pci_dev *pdev, + const struct pci_device_id *unused) +{ + int ret; + + ret = pci_enable_device(pdev); + if (ret < 0) { + dev_err(&pdev->dev, "error: could not enable device\n"); + return ret; + } + + mbi_pdev = pci_dev_get(pdev); + return 0; +} + +static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) }, + { 0, }, +}; +MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids); + +static struct pci_driver iosf_mbi_pci_driver = { + .name = "iosf_mbi_pci", + .probe = iosf_mbi_probe, + .id_table = iosf_mbi_pci_ids, +}; + +static int __init iosf_mbi_init(void) +{ + return pci_register_driver(&iosf_mbi_pci_driver); +} + +static void __exit iosf_mbi_exit(void) +{ + pci_unregister_driver(&iosf_mbi_pci_driver); + if (mbi_pdev) { + pci_dev_put(mbi_pdev); + mbi_pdev = NULL; + } +} + +module_init(iosf_mbi_init); +module_exit(iosf_mbi_exit); + +MODULE_AUTHOR("David E. Box "); +MODULE_DESCRIPTION("IOSF Mailbox Interface accessor"); +MODULE_LICENSE("GPL v2"); -- cgit v1.1 From 0c3351d451ae2fa438d5d1ed719fc43354fbffbb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 2 Jan 2014 15:11:13 -0800 Subject: seqlock: Use raw_ prefix instead of _no_lockdep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linus disliked the _no_lockdep() naming, so instead use the more-consistent raw_* prefix to the non-lockdep enabled seqcount methods. This also adds raw_ methods for the write operations as well, which will be utilized in a following patch. Acked-by: Linus Torvalds Reviewed-by: Stephen Boyd Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra Cc: Krzysztof HaÅ‚asa Cc: Uwe Kleine-König Cc: Willy Tarreau Link: http://lkml.kernel.org/r/1388704274-5278-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/vdso/vclock_gettime.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 2ada505..eb5d7a5 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -178,7 +178,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts) ts->tv_nsec = 0; do { - seq = read_seqcount_begin_no_lockdep(>od->seq); + seq = raw_read_seqcount_begin(>od->seq); mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ns = gtod->wall_time_snsec; @@ -198,7 +198,7 @@ notrace static int do_monotonic(struct timespec *ts) ts->tv_nsec = 0; do { - seq = read_seqcount_begin_no_lockdep(>od->seq); + seq = raw_read_seqcount_begin(>od->seq); mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->monotonic_time_sec; ns = gtod->monotonic_time_snsec; @@ -214,7 +214,7 @@ notrace static int do_realtime_coarse(struct timespec *ts) { unsigned long seq; do { - seq = read_seqcount_begin_no_lockdep(>od->seq); + seq = raw_read_seqcount_begin(>od->seq); ts->tv_sec = gtod->wall_time_coarse.tv_sec; ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -225,7 +225,7 @@ notrace static int do_monotonic_coarse(struct timespec *ts) { unsigned long seq; do { - seq = read_seqcount_begin_no_lockdep(>od->seq); + seq = raw_read_seqcount_begin(>od->seq); ts->tv_sec = gtod->monotonic_time_coarse.tv_sec; ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec; } while (unlikely(read_seqcount_retry(>od->seq, seq))); -- cgit v1.1 From f228c5b882602697a1adb50d61ff688b0df1eced Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 8 Jan 2014 11:15:53 +0100 Subject: perf/x86/intel: Add Intel RAPL PP1 energy counter support This patch adds support for the Intel RAPL energy counter PP1 (Power Plane 1). On client processors, it usually corresponds to the energy consumption of the builtin graphic card. That is why the sysfs event is called energy-gpu. New event: - name: power/energy-gpu/ - code: event=0x4 - unit: 2^-32 Joules On processors without graphics, this should count 0. The patch only enables this event on client processors. Reviewed-by: Maria Dimakopoulou Signed-off-by: Stephane Eranian Cc: ak@linux.intel.com Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: zheng.z.yan@intel.com Cc: bp@alien8.de Cc: vincent.weaver@maine.edu Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389176153-3128-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 31 ++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 0e3754e..5ad35ad 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -27,6 +27,10 @@ * event: rapl_energy_dram * perf code: 0x3 * + * dram counter: consumption of the builtin-gpu domain (client only) + * event: rapl_energy_gpu + * perf code: 0x4 + * * We manage those counters as free running (read-only). They may be * use simultaneously by other tools, such as turbostat. * @@ -55,10 +59,13 @@ #define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ #define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ #define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ +#define RAPL_IDX_PP1_NRG_STAT 3 /* DRAM */ +#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ /* Clients have PP0, PKG */ #define RAPL_IDX_CLN (1< Date: Wed, 6 Nov 2013 14:57:36 +0100 Subject: arch: Introduce smp_load_acquire(), smp_store_release() A number of situations currently require the heavyweight smp_mb(), even though there is no need to order prior stores against later loads. Many architectures have much cheaper ways to handle these situations, but the Linux kernel currently has no portable way to make use of them. This commit therefore supplies smp_load_acquire() and smp_store_release() to remedy this situation. The new smp_load_acquire() primitive orders the specified load against any subsequent reads or writes, while the new smp_store_release() primitive orders the specifed store against any prior reads or writes. These primitives allow array-based circular FIFOs to be implemented without an smp_mb(), and also allow a theoretical hole in rcu_assign_pointer() to be closed at no additional expense on most architectures. In addition, the RCU experience transitioning from explicit smp_read_barrier_depends() and smp_wmb() to rcu_dereference() and rcu_assign_pointer(), respectively resulted in substantial improvements in readability. It therefore seems likely that replacing other explicit barriers with smp_load_acquire() and smp_store_release() will provide similar benefits. It appears that roughly half of the explicit barriers in core kernel code might be so replaced. [Changelog by PaulMck] Reviewed-by: "Paul E. McKenney" Signed-off-by: Peter Zijlstra Acked-by: Will Deacon Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Cc: Michael Ellerman Cc: Michael Neuling Cc: Russell King Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Victor Kaplansky Cc: Tony Luck Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/20131213150640.908486364@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/barrier.h | 43 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index c6cd358..04a4890 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -92,12 +92,53 @@ #endif #define smp_read_barrier_depends() read_barrier_depends() #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) -#else +#else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() #define smp_read_barrier_depends() do { } while (0) #define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif /* SMP */ + +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) + +/* + * For either of these options x86 doesn't have a strong TSO memory + * model and we should fall back to full barriers. + */ + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + +#else /* regular x86 TSO memory ordering */ + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* -- cgit v1.1 From 9345005f4eed805308193658d12e4e7e9c261e74 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Sun, 5 Jan 2014 11:10:52 -0500 Subject: x86/irq: Fix do_IRQ() interrupt warning for cpu hotplug retriggered irqs During heavy CPU-hotplug operations the following spurious kernel warnings can trigger: do_IRQ: No ... irq handler for vector (irq -1) [ See: https://bugzilla.kernel.org/show_bug.cgi?id=64831 ] When downing a cpu it is possible that there are unhandled irqs left in the APIC IRR register. The following code path shows how the problem can occur: 1. CPU 5 is to go down. 2. cpu_disable() on CPU 5 executes with interrupt flag cleared by local_irq_save() via stop_machine(). 3. IRQ 12 asserts on CPU 5, setting IRR but not ISR because interrupt flag is cleared (CPU unabled to handle the irq) 4. IRQs are migrated off of CPU 5, and the vectors' irqs are set to -1. 5. stop_machine() finishes cpu_disable() 6. cpu_die() for CPU 5 executes in normal context. 7. CPU 5 attempts to handle IRQ 12 because the IRR is set for IRQ 12. The code attempts to find the vector's IRQ and cannot because it has been set to -1. 8. do_IRQ() warning displays warning about CPU 5 IRQ 12. I added a debug printk to output which CPU & vector was retriggered and discovered that that we are getting bogus events. I see a 100% correlation between this debug printk in fixup_irqs() and the do_IRQ() warning. This patchset resolves this by adding definitions for VECTOR_UNDEFINED(-1) and VECTOR_RETRIGGERED(-2) and modifying the code to use them. Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=64831 Signed-off-by: Prarit Bhargava Reviewed-by: Rui Wang Cc: Michel Lespinasse Cc: Seiji Aguchi Cc: Yang Zhang Cc: Paul Gortmaker Cc: janet.morgan@Intel.com Cc: tony.luck@Intel.com Cc: ruiv.wang@gmail.com Link: http://lkml.kernel.org/r/1388938252-16627-1-git-send-email-prarit@redhat.com [ Cleaned up the code a bit. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 3 +++ arch/x86/kernel/apic/io_apic.c | 18 +++++++++--------- arch/x86/kernel/irq.c | 19 +++++++++++++------ arch/x86/kernel/irqinit.c | 4 ++-- 4 files changed, 27 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index cba45d9..67d69b8 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -191,6 +191,9 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); #define trace_interrupt interrupt #endif +#define VECTOR_UNDEFINED -1 +#define VECTOR_RETRIGGERED -2 + typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); extern void setup_vector_irq(int cpu); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e63a5bd..6df0b66 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1142,9 +1142,10 @@ next: if (test_bit(vector, used_vectors)) goto next; - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) - if (per_cpu(vector_irq, new_cpu)[vector] != -1) + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { + if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED) goto next; + } /* Found one! */ current_vector = vector; current_offset = offset; @@ -1183,7 +1184,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) vector = cfg->vector; for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) - per_cpu(vector_irq, cpu)[vector] = -1; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; cfg->vector = 0; cpumask_clear(cfg->domain); @@ -1191,11 +1192,10 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) if (likely(!cfg->move_in_progress)) return; for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; - vector++) { + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) continue; - per_cpu(vector_irq, cpu)[vector] = -1; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; break; } } @@ -1228,12 +1228,12 @@ void __setup_vector_irq(int cpu) /* Mark the free vectors */ for (vector = 0; vector < NR_VECTORS; ++vector) { irq = per_cpu(vector_irq, cpu)[vector]; - if (irq < 0) + if (irq <= VECTOR_UNDEFINED) continue; cfg = irq_cfg(irq); if (!cpumask_test_cpu(cpu, cfg->domain)) - per_cpu(vector_irq, cpu)[vector] = -1; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; } raw_spin_unlock(&vector_lock); } @@ -2208,7 +2208,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) struct irq_cfg *cfg; irq = __this_cpu_read(vector_irq[vector]); - if (irq == -1) + if (irq <= VECTOR_UNDEFINED) continue; desc = irq_to_desc(irq); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 22d0687..884d875 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -193,9 +193,13 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) if (!handle_irq(irq, regs)) { ack_APIC_irq(); - if (printk_ratelimit()) - pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", - __func__, smp_processor_id(), vector, irq); + if (irq != VECTOR_RETRIGGERED) { + pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), + vector, irq); + } else { + __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); + } } irq_exit(); @@ -344,7 +348,7 @@ void fixup_irqs(void) for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irr; - if (__this_cpu_read(vector_irq[vector]) < 0) + if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED) continue; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -355,11 +359,14 @@ void fixup_irqs(void) data = irq_desc_get_irq_data(desc); chip = irq_data_get_irq_chip(data); raw_spin_lock(&desc->lock); - if (chip->irq_retrigger) + if (chip->irq_retrigger) { chip->irq_retrigger(data); + __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED); + } raw_spin_unlock(&desc->lock); } - __this_cpu_write(vector_irq[vector], -1); + if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED) + __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); } } #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a2a1fbc..7f50156 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -52,7 +52,7 @@ static struct irqaction irq2 = { }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... NR_VECTORS - 1] = -1, + [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED, }; int vector_used_by_percpu_irq(unsigned int vector) @@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector) int cpu; for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] != -1) + if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED) return 1; } -- cgit v1.1 From 4f75d8412792777a314ac5c1393a9ed43d695fd1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Dec 2013 18:05:02 +0100 Subject: x86, mce: Fix mce_start_timer semantics So mce_start_timer() has a 'cpu' argument which is supposed to mean to start a timer on that cpu. However, the code currently starts a timer on the *current* cpu the function runs on and causes the sanity-check in mce_timer_fn to fire: WARNING: CPU: 0 PID: 0 at arch/x86/kernel/cpu/mcheck/mce.c:1286 mce_timer_fn because it is running on the wrong cpu. This was triggered by Prarit Bhargava by offlining all the cpus in succession. Then, we were fiddling with the CMCI storm settings when starting the timer whereas there's no need for that - if there's storm happening on this newly restarted cpu, we're going to be in normal CMCI mode initially and then when the CMCI interrupt starts firing, we're going to go to the polling mode with the timer real soon. Signed-off-by: Borislav Petkov Tested-by: Prarit Bhargava Cc: Tony Luck Reviewed-by: Chen, Gong Link: http://lkml.kernel.org/r/1387722156-5511-1-git-send-email-prarit@redhat.com --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a389c1d..4d5419b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1638,15 +1638,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) static void mce_start_timer(unsigned int cpu, struct timer_list *t) { - unsigned long iv = mce_adjust_timer(check_interval * HZ); - - __this_cpu_write(mce_next_interval, iv); + unsigned long iv = check_interval * HZ; if (mca_cfg.ignore_ce || !iv) return; + per_cpu(mce_next_interval, cpu) = iv; + t->expires = round_jiffies(jiffies + iv); - add_timer_on(t, smp_processor_id()); + add_timer_on(t, cpu); } static void __mcheck_cpu_init_timer(void) -- cgit v1.1 From 106553aa4ead528b1bab37fb3cc47004a4451acc Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 12 Jan 2014 15:52:21 +0100 Subject: um, x86: Fix vDSO build Commit 663b55b9b39 ("x86: Delete non-required instances of include ") broke the UML build. arch/x86/um/vdso/vdso.S: Assembler messages: arch/x86/um/vdso/vdso.S:2: Error: no such instruction: `__initdata' arch/x86/um/vdso/vdso.S:9: Error: no such instruction: `__finit' UML's vDSO needs linux/init.h. Reported-by: Ingo Molnar Signed-off-by: Richard Weinberger Cc: user-mode-linux-devel@lists.sourceforge.net Cc: paul.gortmaker@windriver.com Link: http://lkml.kernel.org/r/1389538341-31383-1-git-send-email-richard@nod.at Signed-off-by: Ingo Molnar --- arch/x86/um/vdso/vdso.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/um/vdso/vdso.S b/arch/x86/um/vdso/vdso.S index 4b4bd4c..1cb468a 100644 --- a/arch/x86/um/vdso/vdso.S +++ b/arch/x86/um/vdso/vdso.S @@ -1,3 +1,4 @@ +#include __INITDATA -- cgit v1.1 From d50dde5a10f305253cbc3855307f608f8a3c5f73 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 7 Nov 2013 14:43:36 +0100 Subject: sched: Add new scheduler syscalls to support an extended scheduling parameters ABI Add the syscalls needed for supporting scheduling algorithms with extended scheduling parameters (e.g., SCHED_DEADLINE). In general, it makes possible to specify a periodic/sporadic task, that executes for a given amount of runtime at each instance, and is scheduled according to the urgency of their own timing constraints, i.e.: - a (maximum/typical) instance execution time, - a minimum interval between consecutive instances, - a time constraint by which each instance must be completed. Thus, both the data structure that holds the scheduling parameters of the tasks and the system calls dealing with it must be extended. Unfortunately, modifying the existing struct sched_param would break the ABI and result in potentially serious compatibility issues with legacy binaries. For these reasons, this patch: - defines the new struct sched_attr, containing all the fields that are necessary for specifying a task in the computational model described above; - defines and implements the new scheduling related syscalls that manipulate it, i.e., sched_setattr() and sched_getattr(). Syscalls are introduced for x86 (32 and 64 bits) and ARM only, as a proof of concept and for developing and testing purposes. Making them available on other architectures is straightforward. Since no "user" for these new parameters is introduced in this patch, the implementation of the new system calls is just identical to their already existing counterpart. Future patches that implement scheduling policies able to exploit the new data structure must also take care of modifying the sched_*attr() calls accordingly with their own purposes. Signed-off-by: Dario Faggioli [ Rewrote to use sched_attr. ] Signed-off-by: Juri Lelli [ Removed sched_setscheduler2() for now. ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/syscalls/syscall_32.tbl | 2 ++ arch/x86/syscalls/syscall_64.tbl | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index aabfb83..96bc506 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -357,3 +357,5 @@ 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev 349 i386 kcmp sys_kcmp 350 i386 finit_module sys_finit_module +351 i386 sched_setattr sys_sched_setattr +352 i386 sched_getattr sys_sched_getattr diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 38ae65d..a12bddc 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -320,6 +320,8 @@ 311 64 process_vm_writev sys_process_vm_writev 312 common kcmp sys_kcmp 313 common finit_module sys_finit_module +314 common sched_setattr sys_sched_setattr +315 common sched_getattr sys_sched_getattr # # x32-specific system call numbers start at 512 to avoid cache impact -- cgit v1.1 From 5dd12c2152743747ca9f50ef80281e54cc416dc0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Nov 2013 18:04:39 +0100 Subject: sched/clock, x86: Use mul_u64_u32_shr() for native_sched_clock() Use mul_u64_u32_shr() so that x86_64 can use a single 64x64->128 mul. Before: 0000000000000560 : 560: 44 8b 1d 00 00 00 00 mov 0x0(%rip),%r11d # 567 567: 55 push %rbp 568: 48 89 e5 mov %rsp,%rbp 56b: 45 85 db test %r11d,%r11d 56e: 75 4f jne 5bf 570: 0f 31 rdtsc 572: 89 c0 mov %eax,%eax 574: 48 c1 e2 20 shl $0x20,%rdx 578: 48 c7 c1 00 00 00 00 mov $0x0,%rcx 57f: 48 09 c2 or %rax,%rdx 582: 48 c7 c7 00 00 00 00 mov $0x0,%rdi 589: 65 8b 04 25 00 00 00 mov %gs:0x0,%eax 590: 00 591: 48 98 cltq 593: 48 8b 34 c5 00 00 00 mov 0x0(,%rax,8),%rsi 59a: 00 59b: 48 89 d0 mov %rdx,%rax 59e: 81 e2 ff 03 00 00 and $0x3ff,%edx 5a4: 48 c1 e8 0a shr $0xa,%rax 5a8: 48 0f af 14 0e imul (%rsi,%rcx,1),%rdx 5ad: 48 0f af 04 0e imul (%rsi,%rcx,1),%rax 5b2: 5d pop %rbp 5b3: 48 03 04 3e add (%rsi,%rdi,1),%rax 5b7: 48 c1 ea 0a shr $0xa,%rdx 5bb: 48 01 d0 add %rdx,%rax 5be: c3 retq After: 0000000000000550 : 550: 8b 3d 00 00 00 00 mov 0x0(%rip),%edi # 556 556: 55 push %rbp 557: 48 89 e5 mov %rsp,%rbp 55a: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp 55e: 85 ff test %edi,%edi 560: 75 2c jne 58e 562: 0f 31 rdtsc 564: 89 c0 mov %eax,%eax 566: 48 c1 e2 20 shl $0x20,%rdx 56a: 48 09 c2 or %rax,%rdx 56d: 65 48 8b 04 25 00 00 mov %gs:0x0,%rax 574: 00 00 576: 89 c0 mov %eax,%eax 578: 48 f7 e2 mul %rdx 57b: 65 48 8b 0c 25 00 00 mov %gs:0x0,%rcx 582: 00 00 584: c9 leaveq 585: 48 0f ac d0 0a shrd $0xa,%rdx,%rax 58a: 48 01 c8 add %rcx,%rax 58d: c3 retq MAINLINE POST sched_clock_stable: 1 1 (cold) sched_clock: 329841 331312 (cold) local_clock: 301773 310296 (warm) sched_clock: 38375 38247 (warm) local_clock: 100371 102713 (warm) rdtsc: 27340 27289 sched_clock_stable: 0 0 (cold) sched_clock: 382634 372706 (cold) local_clock: 396890 399275 (warm) sched_clock: 38194 38124 (warm) local_clock: 143452 148698 (warm) rdtsc: 27345 27365 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-piu203ses5y1g36bnyw2n16x@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 34baa0e..10a78c0 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -4,6 +4,7 @@ #include #include #include +#include #define TICK_SIZE (tick_nsec / 1000) @@ -57,10 +58,8 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); static inline unsigned long long __cycles_2_ns(unsigned long long cyc) { - int cpu = smp_processor_id(); - unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), - (1UL << CYC2NS_SCALE_FACTOR)); + unsigned long long ns = this_cpu_read(cyc2ns_offset); + ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); return ns; } -- cgit v1.1 From 57c67da274f3fab38e08d2c9edf08b89e1d9c71d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Nov 2013 15:39:25 +0100 Subject: sched/clock, x86: Move some cyc2ns() code around There are no __cycles_2_ns() users outside of arch/x86/kernel/tsc.c, so move it there. There are no cycles_2_ns() users. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-01lslnavfgo3kmbo4532zlcj@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 59 ----------------------- arch/x86/kernel/tsc.c | 112 +++++++++++++++++++++++-------------------- 2 files changed, 61 insertions(+), 110 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 10a78c0..b4c6676 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -13,66 +13,7 @@ extern int recalibrate_cpu_khz(void); extern int no_timer_check; -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - * - * In: - * - * ns = cycles * cyc2ns_scale / SC - * - * Although we may still have enough bits to store the value of ns, - * in some cases, we may not have enough bits to store cycles * cyc2ns_scale, - * leading to an incorrect result. - * - * To avoid this, we can decompose 'cycles' into quotient and remainder - * of division by SC. Then, - * - * ns = (quot * SC + rem) * cyc2ns_scale / SC - * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC - * - * - sqazi@google.com - */ - DECLARE_PER_CPU(unsigned long, cyc2ns); DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline unsigned long long __cycles_2_ns(unsigned long long cyc) -{ - unsigned long long ns = this_cpu_read(cyc2ns_offset); - ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); - return ns; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - unsigned long long ns; - unsigned long flags; - - local_irq_save(flags); - ns = __cycles_2_ns(cyc); - local_irq_restore(flags); - - return ns; -} - #endif /* _ASM_X86_TIMER_H */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 930e5d4..b4a04ac 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -38,6 +38,66 @@ static int __read_mostly tsc_unstable; static int __read_mostly tsc_disabled = -1; int tsc_clocksource_reliable; + +/* Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ + +DEFINE_PER_CPU(unsigned long, cyc2ns); +DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + unsigned long long ns = this_cpu_read(cyc2ns_offset); + ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); + return ns; +} + +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +{ + unsigned long long tsc_now, ns_now, *offset; + unsigned long flags, *scale; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + offset = &per_cpu(cyc2ns_offset, cpu); + + rdtscll(tsc_now); + ns_now = cycles_2_ns(tsc_now); + + if (cpu_khz) { + *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + + cpu_khz / 2) / cpu_khz; + *offset = ns_now - mult_frac(tsc_now, *scale, + (1UL << CYC2NS_SCALE_FACTOR)); + } + + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); +} + /* * Scheduler clock - returns current time in nanosec units. */ @@ -62,7 +122,7 @@ u64 native_sched_clock(void) rdtscll(this_offset); /* return the value in ns */ - return __cycles_2_ns(this_offset); + return cycles_2_ns(this_offset); } /* We need to define a real function for sched_clock, to override the @@ -589,56 +649,6 @@ int recalibrate_cpu_khz(void) EXPORT_SYMBOL(recalibrate_cpu_khz); -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -DEFINE_PER_CPU(unsigned long, cyc2ns); -DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); - -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ - unsigned long long tsc_now, ns_now, *offset; - unsigned long flags, *scale; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - scale = &per_cpu(cyc2ns, cpu); - offset = &per_cpu(cyc2ns_offset, cpu); - - rdtscll(tsc_now); - ns_now = __cycles_2_ns(tsc_now); - - if (cpu_khz) { - *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + - cpu_khz / 2) / cpu_khz; - *offset = ns_now - mult_frac(tsc_now, *scale, - (1UL << CYC2NS_SCALE_FACTOR)); - } - - sched_clock_idle_wakeup_event(0); - local_irq_restore(flags); -} - static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) -- cgit v1.1 From c7a730fa4624092e2d1c0cb7b750816e87c32364 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 13 Jan 2014 08:40:20 -0500 Subject: x86/irq: Fix kbuild warning in smp_irq_move_cleanup_interrupt() Fengguang Wu's 0day kernel build service reported the following build warning: arch/x86/kernel/apic/io_apic.c:2211 smp_irq_move_cleanup_interrupt() warn: always true condition '(irq <= -1) => (0-u32max <= (-1))' because irq is defined as an unsigned int instead of an int. Fix this trivial error by redefining irq as a signed int. The remaining consumers of the int are okay. Signed-off-by: Prarit Bhargava Cc: Konrad Rzeszutek Wilk Cc: Sebastian Andrzej Siewior Cc: Joerg Roedel Cc: Fengguang Wu Link: http://lkml.kernel.org/r/1389620420-7110-1-git-send-email-prarit@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6df0b66..a43f068 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2202,7 +2202,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - unsigned int irq; + int irq; unsigned int irr; struct irq_desc *desc; struct irq_cfg *cfg; -- cgit v1.1 From 20d1c86a57762f0a33a78988e3fc8818316badd4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Nov 2013 15:40:29 +0100 Subject: sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs Use a ring-buffer like multi-version object structure which allows always having a coherent object; we use this to avoid having to disable IRQs while reading sched_clock() and avoids a problem when getting an NMI while changing the cyc2ns data. MAINLINE PRE POST sched_clock_stable: 1 1 1 (cold) sched_clock: 329841 331312 257223 (cold) local_clock: 301773 310296 309889 (warm) sched_clock: 38375 38247 25280 (warm) local_clock: 100371 102713 85268 (warm) rdtsc: 27340 27289 24247 sched_clock_stable: 0 0 0 (cold) sched_clock: 382634 372706 301224 (cold) local_clock: 396890 399275 399870 (warm) sched_clock: 38194 38124 25630 (warm) local_clock: 143452 148698 129629 (warm) rdtsc: 27345 27365 24307 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-s567in1e5ekq2nlyhn8f987r@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 23 +++- arch/x86/kernel/cpu/perf_event.c | 14 ++- arch/x86/kernel/tsc.c | 229 +++++++++++++++++++++++++++++++++++---- arch/x86/platform/uv/tlb_uv.c | 66 ++++++----- 4 files changed, 276 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index b4c6676..3de54ef 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -13,7 +13,26 @@ extern int recalibrate_cpu_khz(void); extern int no_timer_check; -DECLARE_PER_CPU(unsigned long, cyc2ns); -DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); +/* + * We use the full linear equation: f(x) = a + b*x, in order to allow + * a continuous function in the face of dynamic freq changes. + * + * Continuity means that when our frequency changes our slope (b); we want to + * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t. + * + * Without an offset (a) the above would not be possible. + * + * See the comment near cycles_2_ns() for details on how we compute (b). + */ +struct cyc2ns_data { + u32 cyc2ns_mul; + u32 cyc2ns_shift; + u64 cyc2ns_offset; + u32 __count; + /* u32 hole */ +}; /* 24 bytes -- do not grow */ + +extern struct cyc2ns_data *cyc2ns_read_begin(void); +extern void cyc2ns_read_end(struct cyc2ns_data *); #endif /* _ASM_X86_TIMER_H */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8e13293..9f97bd0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1883,6 +1883,8 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { + struct cyc2ns_data *data; + userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; @@ -1891,13 +1893,17 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) if (!sched_clock_stable) return; + data = cyc2ns_read_begin(); + userpg->cap_user_time = 1; - userpg->time_mult = this_cpu_read(cyc2ns); - userpg->time_shift = CYC2NS_SCALE_FACTOR; - userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; + userpg->time_mult = data->cyc2ns_mul; + userpg->time_shift = data->cyc2ns_shift; + userpg->time_offset = data->cyc2ns_offset - now; userpg->cap_user_time_zero = 1; - userpg->time_zero = this_cpu_read(cyc2ns_offset); + userpg->time_zero = data->cyc2ns_offset; + + cyc2ns_read_end(data); } /* diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index b4a04ac..92b090b 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -39,7 +39,119 @@ static int __read_mostly tsc_disabled = -1; int tsc_clocksource_reliable; -/* Accelerators for sched_clock() +/* + * Use a ring-buffer like data structure, where a writer advances the head by + * writing a new data entry and a reader advances the tail when it observes a + * new entry. + * + * Writers are made to wait on readers until there's space to write a new + * entry. + * + * This means that we can always use an {offset, mul} pair to compute a ns + * value that is 'roughly' in the right direction, even if we're writing a new + * {offset, mul} pair during the clock read. + * + * The down-side is that we can no longer guarantee strict monotonicity anymore + * (assuming the TSC was that to begin with), because while we compute the + * intersection point of the two clock slopes and make sure the time is + * continuous at the point of switching; we can no longer guarantee a reader is + * strictly before or after the switch point. + * + * It does mean a reader no longer needs to disable IRQs in order to avoid + * CPU-Freq updates messing with his times, and similarly an NMI reader will + * no longer run the risk of hitting half-written state. + */ + +struct cyc2ns { + struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ + struct cyc2ns_data *head; /* 48 + 8 = 56 */ + struct cyc2ns_data *tail; /* 56 + 8 = 64 */ +}; /* exactly fits one cacheline */ + +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); + +struct cyc2ns_data *cyc2ns_read_begin(void) +{ + struct cyc2ns_data *head; + + preempt_disable(); + + head = this_cpu_read(cyc2ns.head); + /* + * Ensure we observe the entry when we observe the pointer to it. + * matches the wmb from cyc2ns_write_end(). + */ + smp_read_barrier_depends(); + head->__count++; + barrier(); + + return head; +} + +void cyc2ns_read_end(struct cyc2ns_data *head) +{ + barrier(); + /* + * If we're the outer most nested read; update the tail pointer + * when we're done. This notifies possible pending writers + * that we've observed the head pointer and that the other + * entry is now free. + */ + if (!--head->__count) { + /* + * x86-TSO does not reorder writes with older reads; + * therefore once this write becomes visible to another + * cpu, we must be finished reading the cyc2ns_data. + * + * matches with cyc2ns_write_begin(). + */ + this_cpu_write(cyc2ns.tail, head); + } + preempt_enable(); +} + +/* + * Begin writing a new @data entry for @cpu. + * + * Assumes some sort of write side lock; currently 'provided' by the assumption + * that cpufreq will call its notifiers sequentially. + */ +static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + struct cyc2ns_data *data = c2n->data; + + if (data == c2n->head) + data++; + + /* XXX send an IPI to @cpu in order to guarantee a read? */ + + /* + * When we observe the tail write from cyc2ns_read_end(), + * the cpu must be done with that entry and its safe + * to start writing to it. + */ + while (c2n->tail == data) + cpu_relax(); + + return data; +} + +static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + /* + * Ensure the @data writes are visible before we publish the + * entry. Matches the data-depencency in cyc2ns_read_begin(). + */ + smp_wmb(); + + ACCESS_ONCE(c2n->head) = data; +} + +/* + * Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: * ns = cycles / (freq / ns_per_sec) @@ -61,49 +173,106 @@ int tsc_clocksource_reliable; * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -DEFINE_PER_CPU(unsigned long, cyc2ns); -DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); - #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ +static void cyc2ns_data_init(struct cyc2ns_data *data) +{ + data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR; + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = 0; + data->__count = 0; +} + +static void cyc2ns_init(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + cyc2ns_data_init(&c2n->data[0]); + cyc2ns_data_init(&c2n->data[1]); + + c2n->head = c2n->data; + c2n->tail = c2n->data; +} + static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - unsigned long long ns = this_cpu_read(cyc2ns_offset); - ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); + struct cyc2ns_data *data, *tail; + unsigned long long ns; + + /* + * See cyc2ns_read_*() for details; replicated in order to avoid + * an extra few instructions that came with the abstraction. + * Notable, it allows us to only do the __count and tail update + * dance when its actually needed. + */ + + preempt_disable(); + data = this_cpu_read(cyc2ns.head); + tail = this_cpu_read(cyc2ns.tail); + + if (likely(data == tail)) { + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + } else { + data->__count++; + + barrier(); + + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + + barrier(); + + if (!--data->__count) + this_cpu_write(cyc2ns.tail, data); + } + preempt_enable(); + return ns; } +/* XXX surely we already have this someplace in the kernel?! */ +#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) + static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - unsigned long long tsc_now, ns_now, *offset; - unsigned long flags, *scale; + unsigned long long tsc_now, ns_now; + struct cyc2ns_data *data; + unsigned long flags; local_irq_save(flags); sched_clock_idle_sleep_event(); - scale = &per_cpu(cyc2ns, cpu); - offset = &per_cpu(cyc2ns_offset, cpu); + if (!cpu_khz) + goto done; + + data = cyc2ns_write_begin(cpu); rdtscll(tsc_now); ns_now = cycles_2_ns(tsc_now); - if (cpu_khz) { - *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + - cpu_khz / 2) / cpu_khz; - *offset = ns_now - mult_frac(tsc_now, *scale, - (1UL << CYC2NS_SCALE_FACTOR)); - } + /* + * Compute a new multiplier as per the above comment and ensure our + * time function is continuous; see the comment near struct + * cyc2ns_data. + */ + data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = ns_now - + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + + cyc2ns_write_end(cpu, data); +done: sched_clock_idle_wakeup_event(0); local_irq_restore(flags); } - /* * Scheduler clock - returns current time in nanosec units. */ u64 native_sched_clock(void) { - u64 this_offset; + u64 tsc_now; /* * Fall back to jiffies if there's no TSC available: @@ -119,10 +288,10 @@ u64 native_sched_clock(void) } /* read the Time Stamp Counter: */ - rdtscll(this_offset); + rdtscll(tsc_now); /* return the value in ns */ - return cycles_2_ns(this_offset); + return cycles_2_ns(tsc_now); } /* We need to define a real function for sched_clock, to override the @@ -678,11 +847,21 @@ void tsc_restore_sched_clock_state(void) local_irq_save(flags); - __this_cpu_write(cyc2ns_offset, 0); + /* + * We're comming out of suspend, there's no concurrency yet; don't + * bother being nice about the RCU stuff, just write to both + * data fields. + */ + + this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); + this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); + offset = cyc2ns_suspend - sched_clock(); - for_each_possible_cpu(cpu) - per_cpu(cyc2ns_offset, cpu) = offset; + for_each_possible_cpu(cpu) { + per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; + per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; + } local_irq_restore(flags); } @@ -1005,8 +1184,10 @@ void __init tsc_init(void) * speed as the bootup CPU. (cpufreq notifiers will fix this * up if their speed diverges) */ - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { + cyc2ns_init(cpu); set_cyc2ns_scale(cpu_khz, cpu); + } if (tsc_disabled > 0) return; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index efe4d72..dfe605a 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -433,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) return; } -static inline unsigned long cycles_2_us(unsigned long long cyc) +/* + * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative + * number, not an absolute. It converts a duration in cycles to a duration in + * ns. + */ +static inline unsigned long long cycles_2_ns(unsigned long long cyc) { + struct cyc2ns_data *data = cyc2ns_read_begin(); unsigned long long ns; - unsigned long us; - int cpu = smp_processor_id(); - ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR; - us = ns / 1000; - return us; + ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + + cyc2ns_read_end(data); + return ns; +} + +/* + * The reverse of the above; converts a duration in ns to a duration in cycles. + */ +static inline unsigned long long ns_2_cycles(unsigned long long ns) +{ + struct cyc2ns_data *data = cyc2ns_read_begin(); + unsigned long long cyc; + + cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul; + + cyc2ns_read_end(data); + return cyc; +} + +static inline unsigned long cycles_2_us(unsigned long long cyc) +{ + return cycles_2_ns(cyc) / NSEC_PER_USEC; +} + +static inline cycles_t sec_2_cycles(unsigned long sec) +{ + return ns_2_cycles(sec * NSEC_PER_SEC); +} + +static inline unsigned long long usec_2_cycles(unsigned long usec) +{ + return ns_2_cycles(usec * NSEC_PER_USEC); } /* @@ -668,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc, bcp, try); } -static inline cycles_t sec_2_cycles(unsigned long sec) -{ - unsigned long ns; - cycles_t cyc; - - ns = sec * 1000000000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - /* * Our retries are blocked by all destination sw ack resources being * in use, and a timeout is pending. In that case hardware immediately @@ -1327,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data) { } -static inline unsigned long long usec_2_cycles(unsigned long microsec) -{ - unsigned long ns; - unsigned long long cyc; - - ns = microsec * 1000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - /* * Display the statistics thru /proc/sgi_uv/ptc_statistics * 'data' points to the cpu number -- cgit v1.1 From 35af99e646c7f7ea46dc2977601e9e71a51dadd5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2013 19:38:42 +0100 Subject: sched/clock, x86: Use a static_key for sched_clock_stable In order to avoid the runtime condition and variable load turn sched_clock_stable into a static_key. Also provide a shorter implementation of local_clock() and cpu_clock(int) when sched_clock_stable==1. MAINLINE PRE POST sched_clock_stable: 1 1 1 (cold) sched_clock: 329841 221876 215295 (cold) local_clock: 301773 234692 220773 (warm) sched_clock: 38375 25602 25659 (warm) local_clock: 100371 33265 27242 (warm) rdtsc: 27340 24214 24208 sched_clock_stable: 0 0 0 (cold) sched_clock: 382634 235941 237019 (cold) local_clock: 396890 297017 294819 (warm) sched_clock: 38194 25233 25609 (warm) local_clock: 143452 71234 71232 (warm) rdtsc: 27345 24245 24243 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-eummbdechzz37mwmpags1gjr@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/tsc.c | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bca023b..8bc79cd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -487,7 +487,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); if (!check_tsc_unstable()) - sched_clock_stable = 1; + set_sched_clock_stable(); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ea04b34..1a439c0 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -93,7 +93,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); if (!check_tsc_unstable()) - sched_clock_stable = 1; + set_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9f97bd0..b886451 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1890,7 +1890,7 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; - if (!sched_clock_stable) + if (!sched_clock_stable()) return; data = cyc2ns_read_begin(); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 92b090b..53c1235 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -822,7 +822,7 @@ static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { - if (!sched_clock_stable) + if (!sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); @@ -842,7 +842,7 @@ void tsc_restore_sched_clock_state(void) unsigned long flags; int cpu; - if (!sched_clock_stable) + if (!sched_clock_stable()) return; local_irq_save(flags); @@ -984,7 +984,7 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - sched_clock_stable = 0; + clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ -- cgit v1.1 From 10b033d434c25a6c9e0f4f4dc2418af1b8236c63 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2013 19:01:40 +0100 Subject: sched/clock, x86: Avoid a runtime condition in native_sched_clock() Use a static_key to avoid touching tsc_disabled and a runtime condition in native_sched_clock() -- less cachelines touched is always better. MAINLINE PRE POST sched_clock_stable: 1 1 1 (cold) sched_clock: 329841 215295 213039 (cold) local_clock: 301773 220773 216084 (warm) sched_clock: 38375 25659 25231 (warm) local_clock: 100371 27242 27601 (warm) rdtsc: 27340 24208 24203 sched_clock_stable: 0 0 0 (cold) sched_clock: 382634 237019 240055 (cold) local_clock: 396890 294819 299942 (warm) sched_clock: 38194 25609 25276 (warm) local_clock: 143452 71232 73232 (warm) rdtsc: 27345 24243 24244 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-hrz87bo37qke25bty6pnfy4b@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 53c1235..6377fb2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -37,6 +38,8 @@ static int __read_mostly tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int __read_mostly tsc_disabled = -1; +static struct static_key __use_tsc = STATIC_KEY_INIT; + int tsc_clocksource_reliable; /* @@ -282,7 +285,7 @@ u64 native_sched_clock(void) * very important for it to be as fast as the platform * can achieve it. ) */ - if (unlikely(tsc_disabled)) { + if (!static_key_false(&__use_tsc)) { /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); } @@ -1193,7 +1196,9 @@ void __init tsc_init(void) return; /* now allow native_sched_clock() to use rdtsc */ + tsc_disabled = 0; + static_key_slow_inc(&__use_tsc); if (!no_sched_irq_time) enable_sched_clock_irqtime(); -- cgit v1.1 From 8cb75e0c4ec9786b81439761eac1d18d4a931af3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Nov 2013 12:22:37 +0100 Subject: sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding With various drivers wanting to inject idle time; we get people calling idle routines outside of the idle loop proper. Therefore we need to be extra careful about not missing TIF_NEED_RESCHED -> PREEMPT_NEED_RESCHED propagations. While looking at this, I also realized there's a small window in the existing idle loop where we can miss TIF_NEED_RESCHED; when it hits right after the tif_need_resched() test at the end of the loop but right before the need_resched() test at the start of the loop. So move preempt_fold_need_resched() out of the loop where we're guaranteed to have TIF_NEED_RESCHED set. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-x9jgh45oeayzajz2mjt0y7d6@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mwait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 19b71c4..1da25a5 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -53,7 +53,7 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) if (!need_resched()) __mwait(eax, ecx); } - __current_clr_polling(); + current_clr_polling(); } #endif /* _ASM_X86_MWAIT_H */ -- cgit v1.1 From 5aa3d718f259007121b9366d36315fb8a2983d3d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Dec 2013 20:50:42 +0100 Subject: x86, ramdisk: Export relocated ramdisk VA The ramdisk can possibly get relocated if the whole image is not mapped. And since we're going over it in the microcode loader and fishing out the relevant microcode patches, we want access it at its new location. Thus, export it. Signed-off-by: Borislav Petkov Tested-by: Aravind Gopalakrishnan --- arch/x86/include/asm/setup.h | 3 ++- arch/x86/kernel/setup.c | 19 ++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 59bcf4e..d62c9f8 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -3,7 +3,6 @@ #include - #define COMMAND_LINE_SIZE 2048 #include @@ -29,6 +28,8 @@ #include #include +extern u64 relocated_ramdisk; + /* Interrupt control for vSMPowered x86_64 systems */ #ifdef CONFIG_X86_64 void vsmp_init(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cb233bc..baefc6d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -295,6 +295,8 @@ static void __init reserve_brk(void) _brk_start = 0; } +u64 relocated_ramdisk; + #ifdef CONFIG_BLK_DEV_INITRD static u64 __init get_ramdisk_image(void) @@ -321,25 +323,24 @@ static void __init relocate_initrd(void) u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); - u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; /* We need to move the initrd down into directly mapped mem */ - ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - area_size, PAGE_SIZE); + relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), + area_size, PAGE_SIZE); - if (!ramdisk_here) + if (!relocated_ramdisk) panic("Cannot find place for new RAMDISK of size %lld\n", - ramdisk_size); + ramdisk_size); /* Note: this includes all the mem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - memblock_reserve(ramdisk_here, area_size); - initrd_start = ramdisk_here + PAGE_OFFSET; + memblock_reserve(relocated_ramdisk, area_size); + initrd_start = relocated_ramdisk + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", - ramdisk_here, ramdisk_here + ramdisk_size - 1); + relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); q = (char *)initrd_start; @@ -363,7 +364,7 @@ static void __init relocate_initrd(void) printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, - ramdisk_here, ramdisk_here + ramdisk_size - 1); + relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); } static void __init early_reserve_initrd(void) -- cgit v1.1 From e1b43e3f13f7157249fb962ccf88b84eb0421fb4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Dec 2013 12:31:31 +0100 Subject: x86, microcode: Share native MSR accessing variants We want to use those in AMD's early loading path too. Also, add a native_wrmsrl variant. Signed-off-by: Borislav Petkov Tested-by: Aravind Gopalakrishnan --- arch/x86/include/asm/microcode.h | 15 +++++++++++++++ arch/x86/kernel/microcode_intel_early.c | 10 ---------- 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index f98bd66..b59827e 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -1,6 +1,21 @@ #ifndef _ASM_X86_MICROCODE_H #define _ASM_X86_MICROCODE_H +#define native_rdmsr(msr, val1, val2) \ +do { \ + u64 __val = native_read_msr((msr)); \ + (void)((val1) = (u32)__val); \ + (void)((val2) = (u32)(__val >> 32)); \ +} while (0) + +#define native_wrmsr(msr, low, high) \ + native_write_msr(msr, low, high) + +#define native_wrmsrl(msr, val) \ + native_write_msr((msr), \ + (u32)((u64)(val)), \ + (u32)((u64)(val) >> 32)) + struct cpu_signature { unsigned int sig; unsigned int pf; diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c index 1575deb..18f7391 100644 --- a/arch/x86/kernel/microcode_intel_early.c +++ b/arch/x86/kernel/microcode_intel_early.c @@ -365,16 +365,6 @@ out: return state; } -#define native_rdmsr(msr, val1, val2) \ -do { \ - u64 __val = native_read_msr((msr)); \ - (void)((val1) = (u32)__val); \ - (void)((val2) = (u32)(__val >> 32)); \ -} while (0) - -#define native_wrmsr(msr, low, high) \ - native_write_msr(msr, low, high); - static int collect_cpu_info_early(struct ucode_cpu_info *uci) { unsigned int val[2]; -- cgit v1.1 From 5335ba5cf475369f88db8e6835764efdcad8ab96 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 29 Nov 2013 14:58:44 +0100 Subject: x86, microcode, AMD: Fix early ucode loading The original idea to use the microcode cache for the APs doesn't pan out because we do memory allocation there very early and with IRQs disabled and we don't want to involve GFP_ATOMIC allocations. Not if it can be helped. Thus, extend the caching of the BSP patch approach to the APs and iterate over the ucode in the initrd instead of using the cache. We still save the relevant patches to it but later, right before we jettison the initrd. While at it, fix early ucode loading on 32-bit too. Signed-off-by: Borislav Petkov Tested-by: Aravind Gopalakrishnan --- arch/x86/include/asm/microcode_amd.h | 7 +- arch/x86/kernel/microcode_amd.c | 13 +- arch/x86/kernel/microcode_amd_early.c | 239 ++++++++++++++++++++++------------ 3 files changed, 170 insertions(+), 89 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 4c01917..b7b10b8 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -61,11 +61,10 @@ extern int __apply_microcode_amd(struct microcode_amd *mc_amd); extern int apply_microcode_amd(int cpu); extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); +#define PATCH_MAX_SIZE PAGE_SIZE +extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; + #ifdef CONFIG_MICROCODE_AMD_EARLY -#ifdef CONFIG_X86_32 -#define MPB_MAX_SIZE PAGE_SIZE -extern u8 amd_bsp_mpb[MPB_MAX_SIZE]; -#endif extern void __init load_ucode_amd_bsp(void); extern void load_ucode_amd_ap(void); extern int __init save_microcode_in_initrd_amd(void); diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index c3d4cc9..4a6ff74 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -182,10 +182,10 @@ int __apply_microcode_amd(struct microcode_amd *mc_amd) { u32 rev, dummy; - wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); + native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); /* verify patch application was successful */ - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); if (rev != mc_amd->hdr.patch_id) return -1; @@ -332,6 +332,9 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; + pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", + __func__, patch->patch_id, proc_id); + /* ... and add to cache. */ update_cache(patch); @@ -390,9 +393,9 @@ enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { struct ucode_patch *p = find_patch(smp_processor_id()); if (p) { - memset(amd_bsp_mpb, 0, MPB_MAX_SIZE); - memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data), - MPB_MAX_SIZE)); + memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); + memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), + PATCH_MAX_SIZE)); } } #endif diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/microcode_amd_early.c index 6073104..8384c0f 100644 --- a/arch/x86/kernel/microcode_amd_early.c +++ b/arch/x86/kernel/microcode_amd_early.c @@ -2,6 +2,7 @@ * Copyright (C) 2013 Advanced Micro Devices, Inc. * * Author: Jacob Shin + * Fixes: Borislav Petkov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -15,10 +16,18 @@ #include #include -static bool ucode_loaded; +/* + * This points to the current valid container of microcode patches which we will + * save from the initrd before jettisoning its contents. + */ +static u8 *container; +static size_t container_size; + static u32 ucode_new_rev; -static unsigned long ucode_offset; -static size_t ucode_size; +u8 amd_ucode_patch[PATCH_MAX_SIZE]; +static u16 this_equiv_id; + +struct cpio_data ucode_cpio; /* * Microcode patch container file is prepended to the initrd in cpio format. @@ -32,9 +41,6 @@ static struct cpio_data __init find_ucode_in_initrd(void) char *path; void *start; size_t size; - unsigned long *uoffset; - size_t *usize; - struct cpio_data cd; #ifdef CONFIG_X86_32 struct boot_params *p; @@ -47,30 +53,50 @@ static struct cpio_data __init find_ucode_in_initrd(void) path = (char *)__pa_nodebug(ucode_path); start = (void *)p->hdr.ramdisk_image; size = p->hdr.ramdisk_size; - uoffset = (unsigned long *)__pa_nodebug(&ucode_offset); - usize = (size_t *)__pa_nodebug(&ucode_size); #else path = ucode_path; start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); size = boot_params.hdr.ramdisk_size; - uoffset = &ucode_offset; - usize = &ucode_size; #endif - cd = find_cpio_data(path, start, size, &offset); - if (!cd.data) - return cd; + return find_cpio_data(path, start, size, &offset); +} - if (*(u32 *)cd.data != UCODE_MAGIC) { - cd.data = NULL; - cd.size = 0; - return cd; - } +static size_t compute_container_size(u8 *data, u32 total_size) +{ + size_t size = 0; + u32 *header = (u32 *)data; - *uoffset = (u8 *)cd.data - (u8 *)start; - *usize = cd.size; + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return size; - return cd; + size = header[2] + CONTAINER_HDR_SZ; + total_size -= size; + data += size; + + while (total_size) { + u16 patch_size; + + header = (u32 *)data; + + if (header[0] != UCODE_UCODE_TYPE) + break; + + /* + * Sanity-check patch size. + */ + patch_size = header[1]; + if (patch_size > PATCH_MAX_SIZE) + break; + + size += patch_size + SECTION_HDR_SIZE; + data += patch_size + SECTION_HDR_SIZE; + total_size -= patch_size + SECTION_HDR_SIZE; + } + + return size; } /* @@ -85,23 +111,22 @@ static struct cpio_data __init find_ucode_in_initrd(void) static void apply_ucode_in_initrd(void *ucode, size_t size) { struct equiv_cpu_entry *eq; + size_t *cont_sz; u32 *header; - u8 *data; + u8 *data, **cont; u16 eq_id = 0; int offset, left; - u32 rev, eax; + u32 rev, eax, ebx, ecx, edx; u32 *new_rev; - unsigned long *uoffset; - size_t *usize; #ifdef CONFIG_X86_32 new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - uoffset = (unsigned long *)__pa_nodebug(&ucode_offset); - usize = (size_t *)__pa_nodebug(&ucode_size); + cont_sz = (size_t *)__pa_nodebug(&container_size); + cont = (u8 **)__pa_nodebug(&container); #else new_rev = &ucode_new_rev; - uoffset = &ucode_offset; - usize = &ucode_size; + cont_sz = &container_size; + cont = &container; #endif data = ucode; @@ -109,23 +134,37 @@ static void apply_ucode_in_initrd(void *ucode, size_t size) header = (u32 *)data; /* find equiv cpu table */ - - if (header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ header[2] == 0) /* size */ return; - eax = cpuid_eax(0x00000001); + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); while (left > 0) { eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); + *cont = data; + + /* Advance past the container header */ offset = header[2] + CONTAINER_HDR_SZ; data += offset; left -= offset; eq_id = find_equiv_id(eq, eax); - if (eq_id) + if (eq_id) { + this_equiv_id = eq_id; + *cont_sz = compute_container_size(*cont, left + offset); + + /* + * truncate how much we need to iterate over in the + * ucode update loop below + */ + left = *cont_sz - offset; break; + } /* * support multiple container files appended together. if this @@ -145,19 +184,18 @@ static void apply_ucode_in_initrd(void *ucode, size_t size) /* mark where the next microcode container file starts */ offset = data - (u8 *)ucode; - *uoffset += offset; - *usize -= offset; ucode = data; } if (!eq_id) { - *usize = 0; + *cont = NULL; + *cont_sz = 0; return; } /* find ucode and update if needed */ - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); while (left > 0) { struct microcode_amd *mc; @@ -168,73 +206,83 @@ static void apply_ucode_in_initrd(void *ucode, size_t size) break; mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); - if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) - if (__apply_microcode_amd(mc) == 0) { + + if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { + + if (!__apply_microcode_amd(mc)) { rev = mc->hdr.patch_id; *new_rev = rev; + + /* save ucode patch */ + memcpy(amd_ucode_patch, mc, + min_t(u32, header[1], PATCH_MAX_SIZE)); } + } offset = header[1] + SECTION_HDR_SIZE; data += offset; left -= offset; } - - /* mark where this microcode container file ends */ - offset = *usize - (data - (u8 *)ucode); - *usize -= offset; - - if (!(*new_rev)) - *usize = 0; } void __init load_ucode_amd_bsp(void) { - struct cpio_data cd = find_ucode_in_initrd(); - if (!cd.data) + struct cpio_data cp; + void **data; + size_t *size; + +#ifdef CONFIG_X86_32 + data = (void **)__pa_nodebug(&ucode_cpio.data); + size = (size_t *)__pa_nodebug(&ucode_cpio.size); +#else + data = &ucode_cpio.data; + size = &ucode_cpio.size; +#endif + + cp = find_ucode_in_initrd(); + if (!cp.data) return; - apply_ucode_in_initrd(cd.data, cd.size); + *data = cp.data; + *size = cp.size; + + apply_ucode_in_initrd(cp.data, cp.size); } #ifdef CONFIG_X86_32 -u8 amd_bsp_mpb[MPB_MAX_SIZE]; - /* * On 32-bit, since AP's early load occurs before paging is turned on, we * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During - * save_microcode_in_initrd_amd() BSP's patch is copied to amd_bsp_mpb, which - * is used upon resume from suspend. + * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, + * which is used upon resume from suspend. */ void load_ucode_amd_ap(void) { struct microcode_amd *mc; - unsigned long *initrd; - unsigned long *uoffset; size_t *usize; - void *ucode; + void **ucode; - mc = (struct microcode_amd *)__pa(amd_bsp_mpb); + mc = (struct microcode_amd *)__pa(amd_ucode_patch); if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { __apply_microcode_amd(mc); return; } - initrd = (unsigned long *)__pa(&initrd_start); - uoffset = (unsigned long *)__pa(&ucode_offset); - usize = (size_t *)__pa(&ucode_size); + ucode = (void *)__pa_nodebug(&container); + usize = (size_t *)__pa_nodebug(&container_size); - if (!*usize || !*initrd) + if (!*ucode || !*usize) return; - ucode = (void *)((unsigned long)__pa(*initrd) + *uoffset); - apply_ucode_in_initrd(ucode, *usize); + apply_ucode_in_initrd(*ucode, *usize); } static void __init collect_cpu_sig_on_bsp(void *arg) { unsigned int cpu = smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + uci->cpu_sig.sig = cpuid_eax(0x00000001); } #else @@ -242,36 +290,54 @@ void load_ucode_amd_ap(void) { unsigned int cpu = smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct equiv_cpu_entry *eq; + struct microcode_amd *mc; u32 rev, eax; + u16 eq_id; + + /* Exit if called on the BSP. */ + if (!cpu) + return; + + if (!container) + return; rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); - eax = cpuid_eax(0x00000001); uci->cpu_sig.rev = rev; uci->cpu_sig.sig = eax; - if (cpu && !ucode_loaded) { - void *ucode; + eax = cpuid_eax(0x00000001); + eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); - if (!ucode_size || !initrd_start) - return; + eq_id = find_equiv_id(eq, eax); + if (!eq_id) + return; + + if (eq_id == this_equiv_id) { + mc = (struct microcode_amd *)amd_ucode_patch; - ucode = (void *)(initrd_start + ucode_offset); - eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); - if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK) + if (mc && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) + ucode_new_rev = mc->hdr.patch_id; + } + + } else { + if (!ucode_cpio.data) return; - ucode_loaded = true; + /* + * AP has a different equivalence ID than BSP, looks like + * mixed-steppings silicon so go through the ucode blob anew. + */ + apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size); } - - apply_microcode_amd(cpu); } #endif int __init save_microcode_in_initrd_amd(void) { enum ucode_state ret; - void *ucode; u32 eax; #ifdef CONFIG_X86_32 @@ -280,22 +346,35 @@ int __init save_microcode_in_initrd_amd(void) if (!uci->cpu_sig.sig) smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); + + /* + * Take into account the fact that the ramdisk might get relocated + * and therefore we need to recompute the container's position in + * virtual memory space. + */ + container = (u8 *)(__va((u32)relocated_ramdisk) + + ((u32)container - boot_params.hdr.ramdisk_image)); #endif if (ucode_new_rev) pr_info("microcode: updated early to new patch_level=0x%08x\n", ucode_new_rev); - if (ucode_loaded || !ucode_size || !initrd_start) - return 0; + if (!container) + return -EINVAL; - ucode = (void *)(initrd_start + ucode_offset); eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); - ret = load_microcode_amd(eax, ucode, ucode_size); + ret = load_microcode_amd(eax, container, container_size); if (ret != UCODE_OK) return -EINVAL; - ucode_loaded = true; + /* + * This will be freed any msec now, stash patches for the current + * family and switch to patch cache for cpu hotplug, etc later. + */ + container = NULL; + container_size = 0; + return 0; } -- cgit v1.1 From bad5fa631fca5466401cd4a48e30cc1f1cb6101e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 1 Dec 2013 18:09:58 +0100 Subject: x86, microcode: Move to a proper location We've grown a bunch of microcode loader files all prefixed with "microcode_". They should be under cpu/ because this is strictly CPU-related functionality so do that and drop the prefix since they're in their own directory now which gives that prefix. :) While at it, drop MICROCODE_INTEL_LIB config item and stash the functionality under CONFIG_MICROCODE_INTEL as it was its only user. Signed-off-by: Borislav Petkov Tested-by: Aravind Gopalakrishnan --- arch/x86/Kconfig | 4 - arch/x86/kernel/Makefile | 9 - arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/microcode/Makefile | 7 + arch/x86/kernel/cpu/microcode/amd.c | 492 +++++++++++++++++ arch/x86/kernel/cpu/microcode/amd_early.c | 380 ++++++++++++++ arch/x86/kernel/cpu/microcode/core.c | 645 +++++++++++++++++++++++ arch/x86/kernel/cpu/microcode/core_early.c | 141 +++++ arch/x86/kernel/cpu/microcode/intel.c | 333 ++++++++++++ arch/x86/kernel/cpu/microcode/intel_early.c | 787 ++++++++++++++++++++++++++++ arch/x86/kernel/cpu/microcode/intel_lib.c | 174 ++++++ arch/x86/kernel/microcode_amd.c | 492 ----------------- arch/x86/kernel/microcode_amd_early.c | 380 -------------- arch/x86/kernel/microcode_core.c | 645 ----------------------- arch/x86/kernel/microcode_core_early.c | 141 ----- arch/x86/kernel/microcode_intel.c | 333 ------------ arch/x86/kernel/microcode_intel_early.c | 787 ---------------------------- arch/x86/kernel/microcode_intel_lib.c | 174 ------ 18 files changed, 2960 insertions(+), 2965 deletions(-) create mode 100644 arch/x86/kernel/cpu/microcode/Makefile create mode 100644 arch/x86/kernel/cpu/microcode/amd.c create mode 100644 arch/x86/kernel/cpu/microcode/amd_early.c create mode 100644 arch/x86/kernel/cpu/microcode/core.c create mode 100644 arch/x86/kernel/cpu/microcode/core_early.c create mode 100644 arch/x86/kernel/cpu/microcode/intel.c create mode 100644 arch/x86/kernel/cpu/microcode/intel_early.c create mode 100644 arch/x86/kernel/cpu/microcode/intel_lib.c delete mode 100644 arch/x86/kernel/microcode_amd.c delete mode 100644 arch/x86/kernel/microcode_amd_early.c delete mode 100644 arch/x86/kernel/microcode_core.c delete mode 100644 arch/x86/kernel/microcode_core_early.c delete mode 100644 arch/x86/kernel/microcode_intel.c delete mode 100644 arch/x86/kernel/microcode_intel_early.c delete mode 100644 arch/x86/kernel/microcode_intel_lib.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0952ecd..01bfb9e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1080,10 +1080,6 @@ config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE -config MICROCODE_INTEL_LIB - def_bool y - depends on MICROCODE_INTEL - config MICROCODE_INTEL_EARLY def_bool n diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9b0a34e..4d0094d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -91,15 +91,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o -obj-$(CONFIG_MICROCODE_EARLY) += microcode_core_early.o -obj-$(CONFIG_MICROCODE_INTEL_EARLY) += microcode_intel_early.o -obj-$(CONFIG_MICROCODE_INTEL_LIB) += microcode_intel_lib.o -microcode-y := microcode_core.o -microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o -microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o -obj-$(CONFIG_MICROCODE_AMD_EARLY) += microcode_amd_early.o -obj-$(CONFIG_MICROCODE) += microcode.o - obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 47b56a7..0710eef 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -42,6 +42,7 @@ endif obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ +obj-$(CONFIG_MICROCODE) += microcode/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile new file mode 100644 index 0000000..285c854 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -0,0 +1,7 @@ +microcode-y := core.o +obj-$(CONFIG_MICROCODE) += microcode.o +microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o +microcode-$(CONFIG_MICROCODE_AMD) += amd.o +obj-$(CONFIG_MICROCODE_EARLY) += core_early.o +obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o +obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c new file mode 100644 index 0000000..4a6ff74 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -0,0 +1,492 @@ +/* + * AMD CPU Microcode Update Driver for Linux + * Copyright (C) 2008-2011 Advanced Micro Devices Inc. + * + * Author: Peter Oruba + * + * Based on work by: + * Tigran Aivazian + * + * Maintainers: + * Andreas Herrmann + * Borislav Petkov + * + * This driver allows to upgrade microcode on F10h AMD + * CPUs and later. + * + * Licensed under the terms of the GNU General Public + * License version 2. See file COPYING for details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_DESCRIPTION("AMD Microcode Update Driver"); +MODULE_AUTHOR("Peter Oruba"); +MODULE_LICENSE("GPL v2"); + +static struct equiv_cpu_entry *equiv_cpu_table; + +struct ucode_patch { + struct list_head plist; + void *data; + u32 patch_id; + u16 equiv_cpu; +}; + +static LIST_HEAD(pcache); + +static u16 __find_equiv_id(unsigned int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + return find_equiv_id(equiv_cpu_table, uci->cpu_sig.sig); +} + +static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu) +{ + int i = 0; + + BUG_ON(!equiv_cpu_table); + + while (equiv_cpu_table[i].equiv_cpu != 0) { + if (equiv_cpu == equiv_cpu_table[i].equiv_cpu) + return equiv_cpu_table[i].installed_cpu; + i++; + } + return 0; +} + +/* + * a small, trivial cache of per-family ucode patches + */ +static struct ucode_patch *cache_find_patch(u16 equiv_cpu) +{ + struct ucode_patch *p; + + list_for_each_entry(p, &pcache, plist) + if (p->equiv_cpu == equiv_cpu) + return p; + return NULL; +} + +static void update_cache(struct ucode_patch *new_patch) +{ + struct ucode_patch *p; + + list_for_each_entry(p, &pcache, plist) { + if (p->equiv_cpu == new_patch->equiv_cpu) { + if (p->patch_id >= new_patch->patch_id) + /* we already have the latest patch */ + return; + + list_replace(&p->plist, &new_patch->plist); + kfree(p->data); + kfree(p); + return; + } + } + /* no patch found, add it */ + list_add_tail(&new_patch->plist, &pcache); +} + +static void free_cache(void) +{ + struct ucode_patch *p, *tmp; + + list_for_each_entry_safe(p, tmp, &pcache, plist) { + __list_del(p->plist.prev, p->plist.next); + kfree(p->data); + kfree(p); + } +} + +static struct ucode_patch *find_patch(unsigned int cpu) +{ + u16 equiv_id; + + equiv_id = __find_equiv_id(cpu); + if (!equiv_id) + return NULL; + + return cache_find_patch(equiv_id); +} + +static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct ucode_patch *p; + + csig->sig = cpuid_eax(0x00000001); + csig->rev = c->microcode; + + /* + * a patch could have been loaded early, set uci->mc so that + * mc_bp_resume() can call apply_microcode() + */ + p = find_patch(cpu); + if (p && (p->patch_id == csig->rev)) + uci->mc = p->data; + + pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); + + return 0; +} + +static unsigned int verify_patch_size(u8 family, u32 patch_size, + unsigned int size) +{ + u32 max_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 +#define F16H_MPB_MAX_SIZE 3458 + + switch (family) { + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + case 0x15: + max_size = F15H_MPB_MAX_SIZE; + break; + case 0x16: + max_size = F16H_MPB_MAX_SIZE; + break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; + } + + if (patch_size > min_t(u32, size, max_size)) { + pr_err("patch size mismatch\n"); + return 0; + } + + return patch_size; +} + +int __apply_microcode_amd(struct microcode_amd *mc_amd) +{ + u32 rev, dummy; + + native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); + + /* verify patch application was successful */ + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev != mc_amd->hdr.patch_id) + return -1; + + return 0; +} + +int apply_microcode_amd(int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct microcode_amd *mc_amd; + struct ucode_cpu_info *uci; + struct ucode_patch *p; + u32 rev, dummy; + + BUG_ON(raw_smp_processor_id() != cpu); + + uci = ucode_cpu_info + cpu; + + p = find_patch(cpu); + if (!p) + return 0; + + mc_amd = p->data; + uci->mc = p->data; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + /* need to apply patch? */ + if (rev >= mc_amd->hdr.patch_id) { + c->microcode = rev; + uci->cpu_sig.rev = rev; + return 0; + } + + if (__apply_microcode_amd(mc_amd)) { + pr_err("CPU%d: update failed for patch_level=0x%08x\n", + cpu, mc_amd->hdr.patch_id); + return -1; + } + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, + mc_amd->hdr.patch_id); + + uci->cpu_sig.rev = mc_amd->hdr.patch_id; + c->microcode = mc_amd->hdr.patch_id; + + return 0; +} + +static int install_equiv_cpu_table(const u8 *buf) +{ + unsigned int *ibuf = (unsigned int *)buf; + unsigned int type = ibuf[1]; + unsigned int size = ibuf[2]; + + if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { + pr_err("empty section/" + "invalid type field in container file section header\n"); + return -EINVAL; + } + + equiv_cpu_table = vmalloc(size); + if (!equiv_cpu_table) { + pr_err("failed to allocate equivalent CPU table\n"); + return -ENOMEM; + } + + memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size); + + /* add header length */ + return size + CONTAINER_HDR_SZ; +} + +static void free_equiv_cpu_table(void) +{ + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; +} + +static void cleanup(void) +{ + free_equiv_cpu_table(); + free_cache(); +} + +/* + * We return the current size even if some of the checks failed so that + * we can skip over the next patch. If we return a negative value, we + * signal a grave error like a memory allocation has failed and the + * driver cannot continue functioning normally. In such cases, we tear + * down everything we've used up so far and exit. + */ +static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) +{ + struct microcode_header_amd *mc_hdr; + struct ucode_patch *patch; + unsigned int patch_size, crnt_size, ret; + u32 proc_fam; + u16 proc_id; + + patch_size = *(u32 *)(fw + 4); + crnt_size = patch_size + SECTION_HDR_SIZE; + mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); + proc_id = mc_hdr->processor_rev_id; + + proc_fam = find_cpu_family_by_equiv_cpu(proc_id); + if (!proc_fam) { + pr_err("No patch family for equiv ID: 0x%04x\n", proc_id); + return crnt_size; + } + + /* check if patch is for the current family */ + proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff); + if (proc_fam != family) + return crnt_size; + + if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { + pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", + mc_hdr->patch_id); + return crnt_size; + } + + ret = verify_patch_size(family, patch_size, leftover); + if (!ret) { + pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id); + return crnt_size; + } + + patch = kzalloc(sizeof(*patch), GFP_KERNEL); + if (!patch) { + pr_err("Patch allocation failure.\n"); + return -EINVAL; + } + + patch->data = kzalloc(patch_size, GFP_KERNEL); + if (!patch->data) { + pr_err("Patch data allocation failure.\n"); + kfree(patch); + return -EINVAL; + } + + /* All looks ok, copy patch... */ + memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); + INIT_LIST_HEAD(&patch->plist); + patch->patch_id = mc_hdr->patch_id; + patch->equiv_cpu = proc_id; + + pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", + __func__, patch->patch_id, proc_id); + + /* ... and add to cache. */ + update_cache(patch); + + return crnt_size; +} + +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, + size_t size) +{ + enum ucode_state ret = UCODE_ERROR; + unsigned int leftover; + u8 *fw = (u8 *)data; + int crnt_size = 0; + int offset; + + offset = install_equiv_cpu_table(data); + if (offset < 0) { + pr_err("failed to create equivalent cpu table\n"); + return ret; + } + fw += offset; + leftover = size - offset; + + if (*(u32 *)fw != UCODE_UCODE_TYPE) { + pr_err("invalid type field in container file section header\n"); + free_equiv_cpu_table(); + return ret; + } + + while (leftover) { + crnt_size = verify_and_add_patch(family, fw, leftover); + if (crnt_size < 0) + return ret; + + fw += crnt_size; + leftover -= crnt_size; + } + + return UCODE_OK; +} + +enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +{ + enum ucode_state ret; + + /* free old equiv table */ + free_equiv_cpu_table(); + + ret = __load_microcode_amd(family, data, size); + + if (ret != UCODE_OK) + cleanup(); + +#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) + /* save BSP's matching patch for early load */ + if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { + struct ucode_patch *p = find_patch(smp_processor_id()); + if (p) { + memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); + memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), + PATCH_MAX_SIZE)); + } + } +#endif + return ret; +} + +/* + * AMD microcode firmware naming convention, up to family 15h they are in + * the legacy file: + * + * amd-ucode/microcode_amd.bin + * + * This legacy file is always smaller than 2K in size. + * + * Beginning with family 15h, they are in family-specific firmware files: + * + * amd-ucode/microcode_amd_fam15h.bin + * amd-ucode/microcode_amd_fam16h.bin + * ... + * + * These might be larger than 2K. + */ +static enum ucode_state request_microcode_amd(int cpu, struct device *device, + bool refresh_fw) +{ + char fw_name[36] = "amd-ucode/microcode_amd.bin"; + struct cpuinfo_x86 *c = &cpu_data(cpu); + enum ucode_state ret = UCODE_NFOUND; + const struct firmware *fw; + + /* reload ucode container only on the boot cpu */ + if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index) + return UCODE_OK; + + if (c->x86 >= 0x15) + snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); + + if (request_firmware(&fw, (const char *)fw_name, device)) { + pr_debug("failed to load file %s\n", fw_name); + goto out; + } + + ret = UCODE_ERROR; + if (*(u32 *)fw->data != UCODE_MAGIC) { + pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data); + goto fw_release; + } + + ret = load_microcode_amd(c->x86, fw->data, fw->size); + + fw_release: + release_firmware(fw); + + out: + return ret; +} + +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + return UCODE_ERROR; +} + +static void microcode_fini_cpu_amd(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + uci->mc = NULL; +} + +static struct microcode_ops microcode_amd_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_amd, + .collect_cpu_info = collect_cpu_info_amd, + .apply_microcode = apply_microcode_amd, + .microcode_fini_cpu = microcode_fini_cpu_amd, +}; + +struct microcode_ops * __init init_amd_microcode(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + pr_warning("AMD CPU family 0x%x not supported\n", c->x86); + return NULL; + } + + return µcode_amd_ops; +} + +void __exit exit_amd_microcode(void) +{ + cleanup(); +} diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c new file mode 100644 index 0000000..8384c0f --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -0,0 +1,380 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Jacob Shin + * Fixes: Borislav Petkov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +#include +#include +#include + +/* + * This points to the current valid container of microcode patches which we will + * save from the initrd before jettisoning its contents. + */ +static u8 *container; +static size_t container_size; + +static u32 ucode_new_rev; +u8 amd_ucode_patch[PATCH_MAX_SIZE]; +static u16 this_equiv_id; + +struct cpio_data ucode_cpio; + +/* + * Microcode patch container file is prepended to the initrd in cpio format. + * See Documentation/x86/early-microcode.txt + */ +static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; + +static struct cpio_data __init find_ucode_in_initrd(void) +{ + long offset = 0; + char *path; + void *start; + size_t size; + +#ifdef CONFIG_X86_32 + struct boot_params *p; + + /* + * On 32-bit, early load occurs before paging is turned on so we need + * to use physical addresses. + */ + p = (struct boot_params *)__pa_nodebug(&boot_params); + path = (char *)__pa_nodebug(ucode_path); + start = (void *)p->hdr.ramdisk_image; + size = p->hdr.ramdisk_size; +#else + path = ucode_path; + start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); + size = boot_params.hdr.ramdisk_size; +#endif + + return find_cpio_data(path, start, size, &offset); +} + +static size_t compute_container_size(u8 *data, u32 total_size) +{ + size_t size = 0; + u32 *header = (u32 *)data; + + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return size; + + size = header[2] + CONTAINER_HDR_SZ; + total_size -= size; + data += size; + + while (total_size) { + u16 patch_size; + + header = (u32 *)data; + + if (header[0] != UCODE_UCODE_TYPE) + break; + + /* + * Sanity-check patch size. + */ + patch_size = header[1]; + if (patch_size > PATCH_MAX_SIZE) + break; + + size += patch_size + SECTION_HDR_SIZE; + data += patch_size + SECTION_HDR_SIZE; + total_size -= patch_size + SECTION_HDR_SIZE; + } + + return size; +} + +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd_amd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + */ +static void apply_ucode_in_initrd(void *ucode, size_t size) +{ + struct equiv_cpu_entry *eq; + size_t *cont_sz; + u32 *header; + u8 *data, **cont; + u16 eq_id = 0; + int offset, left; + u32 rev, eax, ebx, ecx, edx; + u32 *new_rev; + +#ifdef CONFIG_X86_32 + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + cont_sz = (size_t *)__pa_nodebug(&container_size); + cont = (u8 **)__pa_nodebug(&container); +#else + new_rev = &ucode_new_rev; + cont_sz = &container_size; + cont = &container; +#endif + + data = ucode; + left = size; + header = (u32 *)data; + + /* find equiv cpu table */ + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return; + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + while (left > 0) { + eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); + + *cont = data; + + /* Advance past the container header */ + offset = header[2] + CONTAINER_HDR_SZ; + data += offset; + left -= offset; + + eq_id = find_equiv_id(eq, eax); + if (eq_id) { + this_equiv_id = eq_id; + *cont_sz = compute_container_size(*cont, left + offset); + + /* + * truncate how much we need to iterate over in the + * ucode update loop below + */ + left = *cont_sz - offset; + break; + } + + /* + * support multiple container files appended together. if this + * one does not have a matching equivalent cpu entry, we fast + * forward to the next container file. + */ + while (left > 0) { + header = (u32 *)data; + if (header[0] == UCODE_MAGIC && + header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) + break; + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } + + /* mark where the next microcode container file starts */ + offset = data - (u8 *)ucode; + ucode = data; + } + + if (!eq_id) { + *cont = NULL; + *cont_sz = 0; + return; + } + + /* find ucode and update if needed */ + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + + while (left > 0) { + struct microcode_amd *mc; + + header = (u32 *)data; + if (header[0] != UCODE_UCODE_TYPE || /* type */ + header[1] == 0) /* size */ + break; + + mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); + + if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { + + if (!__apply_microcode_amd(mc)) { + rev = mc->hdr.patch_id; + *new_rev = rev; + + /* save ucode patch */ + memcpy(amd_ucode_patch, mc, + min_t(u32, header[1], PATCH_MAX_SIZE)); + } + } + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } +} + +void __init load_ucode_amd_bsp(void) +{ + struct cpio_data cp; + void **data; + size_t *size; + +#ifdef CONFIG_X86_32 + data = (void **)__pa_nodebug(&ucode_cpio.data); + size = (size_t *)__pa_nodebug(&ucode_cpio.size); +#else + data = &ucode_cpio.data; + size = &ucode_cpio.size; +#endif + + cp = find_ucode_in_initrd(); + if (!cp.data) + return; + + *data = cp.data; + *size = cp.size; + + apply_ucode_in_initrd(cp.data, cp.size); +} + +#ifdef CONFIG_X86_32 +/* + * On 32-bit, since AP's early load occurs before paging is turned on, we + * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during + * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During + * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, + * which is used upon resume from suspend. + */ +void load_ucode_amd_ap(void) +{ + struct microcode_amd *mc; + size_t *usize; + void **ucode; + + mc = (struct microcode_amd *)__pa(amd_ucode_patch); + if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { + __apply_microcode_amd(mc); + return; + } + + ucode = (void *)__pa_nodebug(&container); + usize = (size_t *)__pa_nodebug(&container_size); + + if (!*ucode || !*usize) + return; + + apply_ucode_in_initrd(*ucode, *usize); +} + +static void __init collect_cpu_sig_on_bsp(void *arg) +{ + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + uci->cpu_sig.sig = cpuid_eax(0x00000001); +} +#else +void load_ucode_amd_ap(void) +{ + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct equiv_cpu_entry *eq; + struct microcode_amd *mc; + u32 rev, eax; + u16 eq_id; + + /* Exit if called on the BSP. */ + if (!cpu) + return; + + if (!container) + return; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + + uci->cpu_sig.rev = rev; + uci->cpu_sig.sig = eax; + + eax = cpuid_eax(0x00000001); + eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + + eq_id = find_equiv_id(eq, eax); + if (!eq_id) + return; + + if (eq_id == this_equiv_id) { + mc = (struct microcode_amd *)amd_ucode_patch; + + if (mc && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) + ucode_new_rev = mc->hdr.patch_id; + } + + } else { + if (!ucode_cpio.data) + return; + + /* + * AP has a different equivalence ID than BSP, looks like + * mixed-steppings silicon so go through the ucode blob anew. + */ + apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size); + } +} +#endif + +int __init save_microcode_in_initrd_amd(void) +{ + enum ucode_state ret; + u32 eax; + +#ifdef CONFIG_X86_32 + unsigned int bsp = boot_cpu_data.cpu_index; + struct ucode_cpu_info *uci = ucode_cpu_info + bsp; + + if (!uci->cpu_sig.sig) + smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); + + /* + * Take into account the fact that the ramdisk might get relocated + * and therefore we need to recompute the container's position in + * virtual memory space. + */ + container = (u8 *)(__va((u32)relocated_ramdisk) + + ((u32)container - boot_params.hdr.ramdisk_image)); +#endif + if (ucode_new_rev) + pr_info("microcode: updated early to new patch_level=0x%08x\n", + ucode_new_rev); + + if (!container) + return -EINVAL; + + eax = cpuid_eax(0x00000001); + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + + ret = load_microcode_amd(eax, container, container_size); + if (ret != UCODE_OK) + return -EINVAL; + + /* + * This will be freed any msec now, stash patches for the current + * family and switch to patch cache for cpu hotplug, etc later. + */ + container = NULL; + container_size = 0; + + return 0; +} diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c new file mode 100644 index 0000000..15c9876 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -0,0 +1,645 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman for the fix. + * 1.08 11 Dec 2000, Richard Schaal and + * Tigran Aivazian + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick and + * Tigran Aivazian , + * Serialize updates as required on HT processors due to + * speculative nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble , + * Jun Nakajima + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian "); +MODULE_LICENSE("GPL"); + +#define MICROCODE_VERSION "2.00" + +static struct microcode_ops *microcode_ops; + +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - get/put_online_cpus() to synchronize with + * the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. + */ +static DEFINE_MUTEX(microcode_mutex); + +struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +EXPORT_SYMBOL_GPL(ucode_cpu_info); + +/* + * Operations that are run on a target cpu: + */ + +struct cpu_info_ctx { + struct cpu_signature *cpu_sig; + int err; +}; + +static void collect_cpu_info_local(void *arg) +{ + struct cpu_info_ctx *ctx = arg; + + ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), + ctx->cpu_sig); +} + +static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) +{ + struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +static int collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int ret; + + memset(uci, 0, sizeof(*uci)); + + ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); + if (!ret) + uci->valid = 1; + + return ret; +} + +struct apply_microcode_ctx { + int err; +}; + +static void apply_microcode_local(void *arg) +{ + struct apply_microcode_ctx *ctx = arg; + + ctx->err = microcode_ops->apply_microcode(smp_processor_id()); +} + +static int apply_microcode_on_target(int cpu) +{ + struct apply_microcode_ctx ctx = { .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +static int do_microcode_update(const void __user *buf, size_t size) +{ + int error = 0; + int cpu; + + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; + + if (!uci->valid) + continue; + + ustate = microcode_ops->request_microcode_user(cpu, buf, size); + if (ustate == UCODE_ERROR) { + error = -1; + break; + } else if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + } + + return error; +} + +static int microcode_open(struct inode *inode, struct file *file) +{ + return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM; +} + +static ssize_t microcode_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret = -EINVAL; + + if ((len >> PAGE_SHIFT) > totalram_pages) { + pr_err("too much data (max %ld pages)\n", totalram_pages); + return ret; + } + + get_online_cpus(); + mutex_lock(µcode_mutex); + + if (do_microcode_update(buf, len) == 0) + ret = (ssize_t)len; + + if (ret > 0) + perf_check_microcode(); + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + return ret; +} + +static const struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, + .llseek = no_llseek, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .nodename = "cpu/microcode", + .fops = µcode_fops, +}; + +static int __init microcode_dev_init(void) +{ + int error; + + error = misc_register(µcode_dev); + if (error) { + pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); + return error; + } + + return 0; +} + +static void __exit microcode_dev_exit(void) +{ + misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +MODULE_ALIAS("devname:cpu/microcode"); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while (0) +#endif + +/* fake device for request_firmware */ +static struct platform_device *microcode_pdev; + +static int reload_for_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; + int err = 0; + + if (!uci->valid) + return err; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, true); + if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + else + if (ustate == UCODE_ERROR) + err = -EINVAL; + return err; +} + +static ssize_t reload_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + unsigned long val; + int cpu; + ssize_t ret = 0, tmp_ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (val != 1) + return size; + + get_online_cpus(); + mutex_lock(µcode_mutex); + for_each_online_cpu(cpu) { + tmp_ret = reload_for_cpu(cpu); + if (tmp_ret != 0) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + + /* save retval of the first encountered reload error */ + if (!ret) + ret = tmp_ret; + } + if (!ret) + perf_check_microcode(); + mutex_unlock(µcode_mutex); + put_online_cpus(); + + if (!ret) + ret = size; + + return ret; +} + +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); +} + +static ssize_t pf_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); +} + +static DEVICE_ATTR(reload, 0200, NULL, reload_store); +static DEVICE_ATTR(version, 0400, version_show, NULL); +static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); + +static struct attribute *mc_default_attrs[] = { + &dev_attr_version.attr, + &dev_attr_processor_flags.attr, + NULL +}; + +static struct attribute_group mc_attr_group = { + .attrs = mc_default_attrs, + .name = "microcode", +}; + +static void microcode_fini_cpu(int cpu) +{ + microcode_ops->microcode_fini_cpu(cpu); +} + +static enum ucode_state microcode_resume_cpu(int cpu) +{ + pr_debug("CPU%d updated upon resume\n", cpu); + + if (apply_microcode_on_target(cpu)) + return UCODE_ERROR; + + return UCODE_OK; +} + +static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) +{ + enum ucode_state ustate; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci && uci->valid) + return UCODE_OK; + + if (collect_cpu_info(cpu)) + return UCODE_ERROR; + + /* --dimm. Trigger a delayed update? */ + if (system_state != SYSTEM_RUNNING) + return UCODE_NFOUND; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, + refresh_fw); + + if (ustate == UCODE_OK) { + pr_debug("CPU%d updated upon init\n", cpu); + apply_microcode_on_target(cpu); + } + + return ustate; +} + +static enum ucode_state microcode_update_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci->valid) + return microcode_resume_cpu(cpu); + + return microcode_init_cpu(cpu, false); +} + +static int mc_device_add(struct device *dev, struct subsys_interface *sif) +{ + int err, cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("CPU%d added\n", cpu); + + err = sysfs_create_group(&dev->kobj, &mc_attr_group); + if (err) + return err; + + if (microcode_init_cpu(cpu, true) == UCODE_ERROR) + return -EINVAL; + + return err; +} + +static int mc_device_remove(struct device *dev, struct subsys_interface *sif) +{ + int cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("CPU%d removed\n", cpu); + microcode_fini_cpu(cpu); + sysfs_remove_group(&dev->kobj, &mc_attr_group); + return 0; +} + +static struct subsys_interface mc_cpu_interface = { + .name = "microcode", + .subsys = &cpu_subsys, + .add_dev = mc_device_add, + .remove_dev = mc_device_remove, +}; + +/** + * mc_bp_resume - Update boot CPU microcode during resume. + */ +static void mc_bp_resume(void) +{ + int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci->valid && uci->mc) + microcode_ops->apply_microcode(cpu); +} + +static struct syscore_ops mc_syscore_ops = { + .resume = mc_bp_resume, +}; + +static int +mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct device *dev; + + dev = get_cpu_device(cpu); + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + microcode_update_cpu(cpu); + pr_debug("CPU%d added\n", cpu); + /* + * "break" is missing on purpose here because we want to fall + * through in order to create the sysfs group. + */ + + case CPU_DOWN_FAILED: + if (sysfs_create_group(&dev->kobj, &mc_attr_group)) + pr_err("Failed to create group for CPU%d\n", cpu); + break; + + case CPU_DOWN_PREPARE: + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&dev->kobj, &mc_attr_group); + pr_debug("CPU%d removed\n", cpu); + break; + + /* + * case CPU_DEAD: + * + * When a CPU goes offline, don't free up or invalidate the copy of + * the microcode in kernel memory, so that we can reuse it when the + * CPU comes back online without unnecessarily requesting the userspace + * for it again. + */ + } + + /* The CPU refused to come up during a system resume */ + if (action == CPU_UP_CANCELED_FROZEN) + microcode_fini_cpu(cpu); + + return NOTIFY_OK; +} + +static struct notifier_block __refdata mc_cpu_notifier = { + .notifier_call = mc_cpu_callback, +}; + +#ifdef MODULE +/* Autoload on Intel and AMD systems */ +static const struct x86_cpu_id __initconst microcode_id[] = { +#ifdef CONFIG_MICROCODE_INTEL + { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif +#ifdef CONFIG_MICROCODE_AMD + { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif + {} +}; +MODULE_DEVICE_TABLE(x86cpu, microcode_id); +#endif + +static struct attribute *cpu_root_microcode_attrs[] = { + &dev_attr_reload.attr, + NULL +}; + +static struct attribute_group cpu_root_microcode_group = { + .name = "microcode", + .attrs = cpu_root_microcode_attrs, +}; + +static int __init microcode_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + int error; + + if (c->x86_vendor == X86_VENDOR_INTEL) + microcode_ops = init_intel_microcode(); + else if (c->x86_vendor == X86_VENDOR_AMD) + microcode_ops = init_amd_microcode(); + else + pr_err("no support for this CPU vendor\n"); + + if (!microcode_ops) + return -ENODEV; + + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) + return PTR_ERR(microcode_pdev); + + get_online_cpus(); + mutex_lock(µcode_mutex); + + error = subsys_interface_register(&mc_cpu_interface); + if (!error) + perf_check_microcode(); + mutex_unlock(µcode_mutex); + put_online_cpus(); + + if (error) + goto out_pdev; + + error = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + if (error) { + pr_err("Error creating microcode group!\n"); + goto out_driver; + } + + error = microcode_dev_init(); + if (error) + goto out_ucode_group; + + register_syscore_ops(&mc_syscore_ops); + register_hotcpu_notifier(&mc_cpu_notifier); + + pr_info("Microcode Update Driver: v" MICROCODE_VERSION + " , Peter Oruba\n"); + + return 0; + + out_ucode_group: + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + out_driver: + get_online_cpus(); + mutex_lock(µcode_mutex); + + subsys_interface_unregister(&mc_cpu_interface); + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + out_pdev: + platform_device_unregister(microcode_pdev); + return error; + +} +module_init(microcode_init); + +static void __exit microcode_exit(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + microcode_dev_exit(); + + unregister_hotcpu_notifier(&mc_cpu_notifier); + unregister_syscore_ops(&mc_syscore_ops); + + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + get_online_cpus(); + mutex_lock(µcode_mutex); + + subsys_interface_unregister(&mc_cpu_interface); + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + platform_device_unregister(microcode_pdev); + + microcode_ops = NULL; + + if (c->x86_vendor == X86_VENDOR_AMD) + exit_amd_microcode(); + + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +} +module_exit(microcode_exit); diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c new file mode 100644 index 0000000..be7f851 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -0,0 +1,141 @@ +/* + * X86 CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu + * H Peter Anvin" + * + * This driver allows to early upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture + * Software Developer's Manual. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include + +#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) +#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') +#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') +#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') +#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') +#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') +#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') + +#define CPUID_IS(a, b, c, ebx, ecx, edx) \ + (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) + +/* + * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. + * x86_vendor() gets vendor id for BSP. + * + * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify + * coding, we still use x86_vendor() to get vendor id for AP. + * + * x86_vendor() gets vendor information directly through cpuid. + */ +static int x86_vendor(void) +{ + u32 eax = 0x00000000; + u32 ebx, ecx = 0, edx; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) + return X86_VENDOR_INTEL; + + if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) + return X86_VENDOR_AMD; + + return X86_VENDOR_UNKNOWN; +} + +static int x86_family(void) +{ + u32 eax = 0x00000001; + u32 ebx, ecx = 0, edx; + int x86; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + x86 = (eax >> 8) & 0xf; + if (x86 == 15) + x86 += (eax >> 20) & 0xff; + + return x86; +} + +void __init load_ucode_bsp(void) +{ + int vendor, x86; + + if (!have_cpuid_p()) + return; + + vendor = x86_vendor(); + x86 = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (x86 >= 6) + load_ucode_intel_bsp(); + break; + case X86_VENDOR_AMD: + if (x86 >= 0x10) + load_ucode_amd_bsp(); + break; + default: + break; + } +} + +void load_ucode_ap(void) +{ + int vendor, x86; + + if (!have_cpuid_p()) + return; + + vendor = x86_vendor(); + x86 = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (x86 >= 6) + load_ucode_intel_ap(); + break; + case X86_VENDOR_AMD: + if (x86 >= 0x10) + load_ucode_amd_ap(); + break; + default: + break; + } +} + +int __init save_microcode_in_initrd(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + if (c->x86 >= 6) + save_microcode_in_initrd_intel(); + break; + case X86_VENDOR_AMD: + if (c->x86 >= 0x10) + save_microcode_in_initrd_amd(); + break; + default: + break; + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c new file mode 100644 index 0000000..5fb2ceb --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -0,0 +1,333 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman for the fix. + * 1.08 11 Dec 2000, Richard Schaal and + * Tigran Aivazian + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick and + * Tigran Aivazian , + * Serialize updates as required on HT processors due to + * speculative nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble , + * Jun Nakajima + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian "); +MODULE_LICENSE("GPL"); + +static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu_num); + unsigned int val[2]; + + memset(csig, 0, sizeof(*csig)); + + csig->sig = cpuid_eax(0x00000001); + + if ((c->x86_model >= 5) || (c->x86 > 6)) { + /* get processor flags from MSR 0x17 */ + rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig->pf = 1 << ((val[1] >> 18) & 7); + } + + csig->rev = c->microcode; + pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); + + return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + */ +static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) +{ + struct cpu_signature cpu_sig; + unsigned int csig, cpf, crev; + + collect_cpu_info(cpu, &cpu_sig); + + csig = cpu_sig.sig; + cpf = cpu_sig.pf; + crev = cpu_sig.rev; + + return get_matching_microcode(csig, cpf, mc_intel, crev); +} + +int apply_microcode(int cpu) +{ + struct microcode_intel *mc_intel; + struct ucode_cpu_info *uci; + unsigned int val[2]; + int cpu_num = raw_smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu_num); + + uci = ucode_cpu_info + cpu; + mc_intel = uci->mc; + + /* We should bind the task to the CPU */ + BUG_ON(cpu_num != cpu); + + if (mc_intel == NULL) + return 0; + + /* + * Microcode on this CPU could be updated earlier. Only apply the + * microcode patch in mc_intel when it is newer than the one on this + * CPU. + */ + if (get_matching_mc(mc_intel, cpu) == 0) + return 0; + + /* write microcode via MSR 0x79 */ + wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + if (val[1] != mc_intel->hdr.rev) { + pr_err("CPU%d update to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); + return -1; + } + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", + cpu_num, val[1], + mc_intel->hdr.date & 0xffff, + mc_intel->hdr.date >> 24, + (mc_intel->hdr.date >> 16) & 0xff); + + uci->cpu_sig.rev = val[1]; + c->microcode = val[1]; + + return 0; +} + +static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; + int new_rev = uci->cpu_sig.rev; + unsigned int leftover = size; + enum ucode_state state = UCODE_OK; + unsigned int curr_mc_size = 0; + unsigned int csig, cpf; + + while (leftover) { + struct microcode_header_intel mc_header; + unsigned int mc_size; + + if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) + break; + + mc_size = get_totalsize(&mc_header); + if (!mc_size || mc_size > leftover) { + pr_err("error! Bad data in microcode data file\n"); + break; + } + + /* For performance reasons, reuse mc area when possible */ + if (!mc || mc_size > curr_mc_size) { + vfree(mc); + mc = vmalloc(mc_size); + if (!mc) + break; + curr_mc_size = mc_size; + } + + if (get_ucode_data(mc, ucode_ptr, mc_size) || + microcode_sanity_check(mc, 1) < 0) { + break; + } + + csig = uci->cpu_sig.sig; + cpf = uci->cpu_sig.pf; + if (get_matching_microcode(csig, cpf, mc, new_rev)) { + vfree(new_mc); + new_rev = mc_header.rev; + new_mc = mc; + mc = NULL; /* trigger new vmalloc */ + } + + ucode_ptr += mc_size; + leftover -= mc_size; + } + + vfree(mc); + + if (leftover) { + vfree(new_mc); + state = UCODE_ERROR; + goto out; + } + + if (!new_mc) { + state = UCODE_NFOUND; + goto out; + } + + vfree(uci->mc); + uci->mc = (struct microcode_intel *)new_mc; + + /* + * If early loading microcode is supported, save this mc into + * permanent memory. So it will be loaded early when a CPU is hot added + * or resumes. + */ + save_mc_for_early(new_mc); + + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); +out: + return state; +} + +static int get_ucode_fw(void *to, const void *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} + +static enum ucode_state request_microcode_fw(int cpu, struct device *device, + bool refresh_fw) +{ + char name[30]; + struct cpuinfo_x86 *c = &cpu_data(cpu); + const struct firmware *firmware; + enum ucode_state ret; + + sprintf(name, "intel-ucode/%02x-%02x-%02x", + c->x86, c->x86_model, c->x86_mask); + + if (request_firmware(&firmware, name, device)) { + pr_debug("data file %s load failed\n", name); + return UCODE_NFOUND; + } + + ret = generic_load_microcode(cpu, (void *)firmware->data, + firmware->size, &get_ucode_fw); + + release_firmware(firmware); + + return ret; +} + +static int get_ucode_user(void *to, const void *from, size_t n) +{ + return copy_from_user(to, from, n); +} + +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); +} + +static void microcode_fini_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + vfree(uci->mc); + uci->mc = NULL; +} + +static struct microcode_ops microcode_intel_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, + .collect_cpu_info = collect_cpu_info, + .apply_microcode = apply_microcode, + .microcode_fini_cpu = microcode_fini_cpu, +}; + +struct microcode_ops * __init init_intel_microcode(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + pr_err("Intel CPU family 0x%x not supported\n", c->x86); + return NULL; + } + + return µcode_intel_ops; +} + diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c new file mode 100644 index 0000000..18f7391 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -0,0 +1,787 @@ +/* + * Intel CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu + * H Peter Anvin" + * + * This allows to early upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture + * Software Developer's Manual. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +struct mc_saved_data { + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; +} mc_saved_data; + +static enum ucode_state +generic_load_microcode_early(struct microcode_intel **mc_saved_p, + unsigned int mc_saved_count, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *ucode_ptr, *new_mc = NULL; + int new_rev = uci->cpu_sig.rev; + enum ucode_state state = UCODE_OK; + unsigned int mc_size; + struct microcode_header_intel *mc_header; + unsigned int csig = uci->cpu_sig.sig; + unsigned int cpf = uci->cpu_sig.pf; + int i; + + for (i = 0; i < mc_saved_count; i++) { + ucode_ptr = mc_saved_p[i]; + + mc_header = (struct microcode_header_intel *)ucode_ptr; + mc_size = get_totalsize(mc_header); + if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) { + new_rev = mc_header->rev; + new_mc = ucode_ptr; + } + } + + if (!new_mc) { + state = UCODE_NFOUND; + goto out; + } + + uci->mc = (struct microcode_intel *)new_mc; +out: + return state; +} + +static void +microcode_pointer(struct microcode_intel **mc_saved, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start, int mc_saved_count) +{ + int i; + + for (i = 0; i < mc_saved_count; i++) + mc_saved[i] = (struct microcode_intel *) + (mc_saved_in_initrd[i] + initrd_start); +} + +#ifdef CONFIG_X86_32 +static void +microcode_phys(struct microcode_intel **mc_saved_tmp, + struct mc_saved_data *mc_saved_data) +{ + int i; + struct microcode_intel ***mc_saved; + + mc_saved = (struct microcode_intel ***) + __pa_nodebug(&mc_saved_data->mc_saved); + for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + struct microcode_intel *p; + + p = *(struct microcode_intel **) + __pa_nodebug(mc_saved_data->mc_saved + i); + mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); + } +} +#endif + +static enum ucode_state +load_microcode(struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int count = mc_saved_data->mc_saved_count; + + if (!mc_saved_data->mc_saved) { + microcode_pointer(mc_saved_tmp, mc_saved_in_initrd, + initrd_start, count); + + return generic_load_microcode_early(mc_saved_tmp, count, uci); + } else { +#ifdef CONFIG_X86_32 + microcode_phys(mc_saved_tmp, mc_saved_data); + return generic_load_microcode_early(mc_saved_tmp, count, uci); +#else + return generic_load_microcode_early(mc_saved_data->mc_saved, + count, uci); +#endif + } +} + +static u8 get_x86_family(unsigned long sig) +{ + u8 x86; + + x86 = (sig >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (sig >> 20) & 0xff; + + return x86; +} + +static u8 get_x86_model(unsigned long sig) +{ + u8 x86, x86_model; + + x86 = get_x86_family(sig); + x86_model = (sig >> 4) & 0xf; + + if (x86 == 0x6 || x86 == 0xf) + x86_model += ((sig >> 16) & 0xf) << 4; + + return x86_model; +} + +/* + * Given CPU signature and a microcode patch, this function finds if the + * microcode patch has matching family and model with the CPU. + */ +static enum ucode_state +matching_model_microcode(struct microcode_header_intel *mc_header, + unsigned long sig) +{ + u8 x86, x86_model; + u8 x86_ucode, x86_model_ucode; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + unsigned long data_size = get_datasize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + x86 = get_x86_family(sig); + x86_model = get_x86_model(sig); + + x86_ucode = get_x86_family(mc_header->sig); + x86_model_ucode = get_x86_model(mc_header->sig); + + if (x86 == x86_ucode && x86_model == x86_model_ucode) + return UCODE_OK; + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + return UCODE_NFOUND; + + ext_header = (struct extended_sigtable *) + mc_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (i = 0; i < ext_sigcount; i++) { + x86_ucode = get_x86_family(ext_sig->sig); + x86_model_ucode = get_x86_model(ext_sig->sig); + + if (x86 == x86_ucode && x86_model == x86_model_ucode) + return UCODE_OK; + + ext_sig++; + } + + return UCODE_NFOUND; +} + +static int +save_microcode(struct mc_saved_data *mc_saved_data, + struct microcode_intel **mc_saved_src, + unsigned int mc_saved_count) +{ + int i, j; + struct microcode_intel **mc_saved_p; + int ret; + + if (!mc_saved_count) + return -EINVAL; + + /* + * Copy new microcode data. + */ + mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *), + GFP_KERNEL); + if (!mc_saved_p) + return -ENOMEM; + + for (i = 0; i < mc_saved_count; i++) { + struct microcode_intel *mc = mc_saved_src[i]; + struct microcode_header_intel *mc_header = &mc->hdr; + unsigned long mc_size = get_totalsize(mc_header); + mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL); + if (!mc_saved_p[i]) { + ret = -ENOMEM; + goto err; + } + if (!mc_saved_src[i]) { + ret = -EINVAL; + goto err; + } + memcpy(mc_saved_p[i], mc, mc_size); + } + + /* + * Point to newly saved microcode. + */ + mc_saved_data->mc_saved = mc_saved_p; + mc_saved_data->mc_saved_count = mc_saved_count; + + return 0; + +err: + for (j = 0; j <= i; j++) + kfree(mc_saved_p[j]); + kfree(mc_saved_p); + + return ret; +} + +/* + * A microcode patch in ucode_ptr is saved into mc_saved + * - if it has matching signature and newer revision compared to an existing + * patch mc_saved. + * - or if it is a newly discovered microcode patch. + * + * The microcode patch should have matching model with CPU. + */ +static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, + unsigned int *mc_saved_count_p) +{ + int i; + int found = 0; + unsigned int mc_saved_count = *mc_saved_count_p; + struct microcode_header_intel *mc_header; + + mc_header = (struct microcode_header_intel *)ucode_ptr; + for (i = 0; i < mc_saved_count; i++) { + unsigned int sig, pf; + unsigned int new_rev; + struct microcode_header_intel *mc_saved_header = + (struct microcode_header_intel *)mc_saved[i]; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + new_rev = mc_header->rev; + + if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) { + found = 1; + if (update_match_revision(mc_header, new_rev)) { + /* + * Found an older ucode saved before. + * Replace the older one with this newer + * one. + */ + mc_saved[i] = + (struct microcode_intel *)ucode_ptr; + break; + } + } + } + if (i >= mc_saved_count && !found) + /* + * This ucode is first time discovered in ucode file. + * Save it to memory. + */ + mc_saved[mc_saved_count++] = + (struct microcode_intel *)ucode_ptr; + + *mc_saved_count_p = mc_saved_count; +} + +/* + * Get microcode matching with BSP's model. Only CPUs with the same model as + * BSP can stay in the platform. + */ +static enum ucode_state __init +get_matching_model_microcode(int cpu, unsigned long start, + void *data, size_t size, + struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + struct ucode_cpu_info *uci) +{ + u8 *ucode_ptr = data; + unsigned int leftover = size; + enum ucode_state state = UCODE_OK; + unsigned int mc_size; + struct microcode_header_intel *mc_header; + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count = mc_saved_data->mc_saved_count; + int i; + + while (leftover) { + mc_header = (struct microcode_header_intel *)ucode_ptr; + + mc_size = get_totalsize(mc_header); + if (!mc_size || mc_size > leftover || + microcode_sanity_check(ucode_ptr, 0) < 0) + break; + + leftover -= mc_size; + + /* + * Since APs with same family and model as the BSP may boot in + * the platform, we need to find and save microcode patches + * with the same family and model as the BSP. + */ + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != + UCODE_OK) { + ucode_ptr += mc_size; + continue; + } + + _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count); + + ucode_ptr += mc_size; + } + + if (leftover) { + state = UCODE_ERROR; + goto out; + } + + if (mc_saved_count == 0) { + state = UCODE_NFOUND; + goto out; + } + + for (i = 0; i < mc_saved_count; i++) + mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + + mc_saved_data->mc_saved_count = mc_saved_count; +out: + return state; +} + +static int collect_cpu_info_early(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + u8 x86, x86_model; + struct cpu_signature csig; + unsigned int eax, ebx, ecx, edx; + + csig.sig = 0; + csig.pf = 0; + csig.rev = 0; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + x86 = get_x86_family(csig.sig); + x86_model = get_x86_model(csig.sig); + + if ((x86_model >= 5) || (x86 > 6)) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + csig.rev = val[1]; + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} + +#ifdef DEBUG +static void __ref show_saved_mc(void) +{ + int i, j; + unsigned int sig, pf, rev, total_size, data_size, date; + struct ucode_cpu_info uci; + + if (mc_saved_data.mc_saved_count == 0) { + pr_debug("no micorcode data saved.\n"); + return; + } + pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + + collect_cpu_info_early(&uci); + + sig = uci.cpu_sig.sig; + pf = uci.cpu_sig.pf; + rev = uci.cpu_sig.rev; + pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n", + smp_processor_id(), sig, pf, rev); + + for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + struct microcode_header_intel *mc_saved_header; + struct extended_sigtable *ext_header; + int ext_sigcount; + struct extended_signature *ext_sig; + + mc_saved_header = (struct microcode_header_intel *) + mc_saved_data.mc_saved[i]; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + rev = mc_saved_header->rev; + total_size = get_totalsize(mc_saved_header); + data_size = get_datasize(mc_saved_header); + date = mc_saved_header->date; + + pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n", + i, sig, pf, rev, total_size, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + continue; + + ext_header = (struct extended_sigtable *) + mc_saved_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (j = 0; j < ext_sigcount; j++) { + sig = ext_sig->sig; + pf = ext_sig->pf; + + pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", + j, sig, pf); + + ext_sig++; + } + + } +} +#else +static inline void show_saved_mc(void) +{ +} +#endif + +#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) +static DEFINE_MUTEX(x86_cpu_microcode_mutex); +/* + * Save this mc into mc_saved_data. So it will be loaded early when a CPU is + * hot added or resumes. + * + * Please make sure this mc should be a valid microcode patch before calling + * this function. + */ +int save_mc_for_early(u8 *mc) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count_init; + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; + int ret = 0; + int i; + + /* + * Hold hotplug lock so mc_saved_data is not accessed by a CPU in + * hotplug. + */ + mutex_lock(&x86_cpu_microcode_mutex); + + mc_saved_count_init = mc_saved_data.mc_saved_count; + mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved = mc_saved_data.mc_saved; + + if (mc_saved && mc_saved_count) + memcpy(mc_saved_tmp, mc_saved, + mc_saved_count * sizeof(struct mirocode_intel *)); + /* + * Save the microcode patch mc in mc_save_tmp structure if it's a newer + * version. + */ + + _save_mc(mc_saved_tmp, mc, &mc_saved_count); + + /* + * Save the mc_save_tmp in global mc_saved_data. + */ + ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + if (ret) { + pr_err("Cannot save microcode patch.\n"); + goto out; + } + + show_saved_mc(); + + /* + * Free old saved microcod data. + */ + if (mc_saved) { + for (i = 0; i < mc_saved_count_init; i++) + kfree(mc_saved[i]); + kfree(mc_saved); + } + +out: + mutex_unlock(&x86_cpu_microcode_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(save_mc_for_early); +#endif + +static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; +static __init enum ucode_state +scan_microcode(unsigned long start, unsigned long end, + struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + struct ucode_cpu_info *uci) +{ + unsigned int size = end - start + 1; + struct cpio_data cd; + long offset = 0; +#ifdef CONFIG_X86_32 + char *p = (char *)__pa_nodebug(ucode_name); +#else + char *p = ucode_name; +#endif + + cd.data = NULL; + cd.size = 0; + + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) + return UCODE_ERROR; + + + return get_matching_model_microcode(0, start, cd.data, cd.size, + mc_saved_data, mc_saved_in_initrd, + uci); +} + +/* + * Print ucode update info. + */ +static void +print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) +{ + int cpu = smp_processor_id(); + + pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + cpu, + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); +} + +#ifdef CONFIG_X86_32 + +static int delay_ucode_info; +static int current_mc_date; + +/* + * Print early updated ucode info after printk works. This is delayed info dump. + */ +void show_ucode_info_early(void) +{ + struct ucode_cpu_info uci; + + if (delay_ucode_info) { + collect_cpu_info_early(&uci); + print_ucode_info(&uci, current_mc_date); + delay_ucode_info = 0; + } +} + +/* + * At this point, we can not call printk() yet. Keep microcode patch number in + * mc_saved_data.mc_saved and delay printing microcode info in + * show_ucode_info_early() until printk() works. + */ +static void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + int *delay_ucode_info_p; + int *current_mc_date_p; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); + current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); + + *delay_ucode_info_p = 1; + *current_mc_date_p = mc_intel->hdr.date; +} +#else + +/* + * Flush global tlb. We only do this in x86_64 where paging has been enabled + * already and PGE should be enabled as well. + */ +static inline void flush_tlb_early(void) +{ + __native_flush_tlb_global_irq_disabled(); +} + +static inline void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + print_ucode_info(uci, mc_intel->hdr.date); +} +#endif + +static int apply_microcode_early(struct mc_saved_data *mc_saved_data, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + unsigned int val[2]; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return 0; + + /* write microcode via MSR 0x79 */ + native_wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + if (val[1] != mc_intel->hdr.rev) + return -1; + +#ifdef CONFIG_X86_64 + /* Flush global tlb. This is precaution. */ + flush_tlb_early(); +#endif + uci->cpu_sig.rev = val[1]; + + print_ucode(uci); + + return 0; +} + +/* + * This function converts microcode patch offsets previously stored in + * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. + */ +int __init save_microcode_in_initrd_intel(void) +{ + unsigned int count = mc_saved_data.mc_saved_count; + struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; + int ret = 0; + + if (count == 0) + return ret; + + microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count); + ret = save_microcode(&mc_saved_data, mc_saved, count); + if (ret) + pr_err("Cannot save microcode patches from initrd.\n"); + + show_saved_mc(); + + return ret; +} + +static void __init +_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start_early, + unsigned long initrd_end_early, + struct ucode_cpu_info *uci) +{ + collect_cpu_info_early(uci); + scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data, + mc_saved_in_initrd, uci); + load_microcode(mc_saved_data, mc_saved_in_initrd, + initrd_start_early, uci); + apply_microcode_early(mc_saved_data, uci); +} + +void __init +load_ucode_intel_bsp(void) +{ + u64 ramdisk_image, ramdisk_size; + unsigned long initrd_start_early, initrd_end_early; + struct ucode_cpu_info uci; +#ifdef CONFIG_X86_32 + struct boot_params *boot_params_p; + + boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params); + ramdisk_image = boot_params_p->hdr.ramdisk_image; + ramdisk_size = boot_params_p->hdr.ramdisk_size; + initrd_start_early = ramdisk_image; + initrd_end_early = initrd_start_early + ramdisk_size; + + _load_ucode_intel_bsp( + (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + initrd_start_early, initrd_end_early, &uci); +#else + ramdisk_image = boot_params.hdr.ramdisk_image; + ramdisk_size = boot_params.hdr.ramdisk_size; + initrd_start_early = ramdisk_image + PAGE_OFFSET; + initrd_end_early = initrd_start_early + ramdisk_size; + + _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, + initrd_start_early, initrd_end_early, &uci); +#endif +} + +void load_ucode_intel_ap(void) +{ + struct mc_saved_data *mc_saved_data_p; + struct ucode_cpu_info uci; + unsigned long *mc_saved_in_initrd_p; + unsigned long initrd_start_addr; +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p; + + mc_saved_in_initrd_p = + (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); +#else + mc_saved_data_p = &mc_saved_data; + mc_saved_in_initrd_p = mc_saved_in_initrd; + initrd_start_addr = initrd_start; +#endif + + /* + * If there is no valid ucode previously saved in memory, no need to + * update ucode on this AP. + */ + if (mc_saved_data_p->mc_saved_count == 0) + return; + + collect_cpu_info_early(&uci); + load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, + initrd_start_addr, &uci); + apply_microcode_early(mc_saved_data_p, &uci); +} diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c new file mode 100644 index 0000000..ce69320 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -0,0 +1,174 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2012 Fenghua Yu + * H Peter Anvin" + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include +#include +#include +#include + +#include +#include +#include + +static inline int +update_match_cpu(unsigned int csig, unsigned int cpf, + unsigned int sig, unsigned int pf) +{ + return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; +} + +int +update_match_revision(struct microcode_header_intel *mc_header, int rev) +{ + return (mc_header->rev <= rev) ? 0 : 1; +} + +int microcode_sanity_check(void *mc, int print_err) +{ + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + int sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); + + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("error! Bad data size in microcode data file\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + if (print_err) + pr_err("error! Unknown microcode update format\n"); + return -EINVAL; + } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("error! Small exttable size in microcode data file\n"); + return -EINVAL; + } + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("error! Bad exttable size in microcode data file\n"); + return -EFAULT; + } + ext_sigcount = ext_header->count; + } + + /* check extended table checksum */ + if (ext_table_size) { + int ext_table_sum = 0; + int *ext_tablep = (int *)ext_header; + + i = ext_table_size / DWSIZE; + while (i--) + ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { + if (print_err) + pr_warn("aborting, bad extended signature table checksum\n"); + return -EINVAL; + } + } + + /* calculate the checksum */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / DWSIZE; + while (i--) + orig_sum += ((int *)mc)[i]; + if (orig_sum) { + if (print_err) + pr_err("aborting, bad checksum\n"); + return -EINVAL; + } + if (!ext_table_size) + return 0; + /* check extended signature checksum */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + sum = orig_sum + - (mc_header->sig + mc_header->pf + mc_header->cksum) + + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("aborting, bad checksum\n"); + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(microcode_sanity_check); + +/* + * return 0 - no update found + * return 1 - found update + */ +int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) +{ + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf)) + return 1; + + /* Look for ext. headers: */ + if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) + return 0; + + ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (i = 0; i < ext_sigcount; i++) { + if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; + } + return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + */ +int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev) +{ + struct microcode_header_intel *mc_header = mc; + + if (!update_match_revision(mc_header, rev)) + return 0; + + return get_matching_sig(csig, cpf, mc, rev); +} +EXPORT_SYMBOL_GPL(get_matching_microcode); diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c deleted file mode 100644 index 4a6ff74..0000000 --- a/arch/x86/kernel/microcode_amd.c +++ /dev/null @@ -1,492 +0,0 @@ -/* - * AMD CPU Microcode Update Driver for Linux - * Copyright (C) 2008-2011 Advanced Micro Devices Inc. - * - * Author: Peter Oruba - * - * Based on work by: - * Tigran Aivazian - * - * Maintainers: - * Andreas Herrmann - * Borislav Petkov - * - * This driver allows to upgrade microcode on F10h AMD - * CPUs and later. - * - * Licensed under the terms of the GNU General Public - * License version 2. See file COPYING for details. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -MODULE_DESCRIPTION("AMD Microcode Update Driver"); -MODULE_AUTHOR("Peter Oruba"); -MODULE_LICENSE("GPL v2"); - -static struct equiv_cpu_entry *equiv_cpu_table; - -struct ucode_patch { - struct list_head plist; - void *data; - u32 patch_id; - u16 equiv_cpu; -}; - -static LIST_HEAD(pcache); - -static u16 __find_equiv_id(unsigned int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - return find_equiv_id(equiv_cpu_table, uci->cpu_sig.sig); -} - -static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu) -{ - int i = 0; - - BUG_ON(!equiv_cpu_table); - - while (equiv_cpu_table[i].equiv_cpu != 0) { - if (equiv_cpu == equiv_cpu_table[i].equiv_cpu) - return equiv_cpu_table[i].installed_cpu; - i++; - } - return 0; -} - -/* - * a small, trivial cache of per-family ucode patches - */ -static struct ucode_patch *cache_find_patch(u16 equiv_cpu) -{ - struct ucode_patch *p; - - list_for_each_entry(p, &pcache, plist) - if (p->equiv_cpu == equiv_cpu) - return p; - return NULL; -} - -static void update_cache(struct ucode_patch *new_patch) -{ - struct ucode_patch *p; - - list_for_each_entry(p, &pcache, plist) { - if (p->equiv_cpu == new_patch->equiv_cpu) { - if (p->patch_id >= new_patch->patch_id) - /* we already have the latest patch */ - return; - - list_replace(&p->plist, &new_patch->plist); - kfree(p->data); - kfree(p); - return; - } - } - /* no patch found, add it */ - list_add_tail(&new_patch->plist, &pcache); -} - -static void free_cache(void) -{ - struct ucode_patch *p, *tmp; - - list_for_each_entry_safe(p, tmp, &pcache, plist) { - __list_del(p->plist.prev, p->plist.next); - kfree(p->data); - kfree(p); - } -} - -static struct ucode_patch *find_patch(unsigned int cpu) -{ - u16 equiv_id; - - equiv_id = __find_equiv_id(cpu); - if (!equiv_id) - return NULL; - - return cache_find_patch(equiv_id); -} - -static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct ucode_patch *p; - - csig->sig = cpuid_eax(0x00000001); - csig->rev = c->microcode; - - /* - * a patch could have been loaded early, set uci->mc so that - * mc_bp_resume() can call apply_microcode() - */ - p = find_patch(cpu); - if (p && (p->patch_id == csig->rev)) - uci->mc = p->data; - - pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); - - return 0; -} - -static unsigned int verify_patch_size(u8 family, u32 patch_size, - unsigned int size) -{ - u32 max_size; - -#define F1XH_MPB_MAX_SIZE 2048 -#define F14H_MPB_MAX_SIZE 1824 -#define F15H_MPB_MAX_SIZE 4096 -#define F16H_MPB_MAX_SIZE 3458 - - switch (family) { - case 0x14: - max_size = F14H_MPB_MAX_SIZE; - break; - case 0x15: - max_size = F15H_MPB_MAX_SIZE; - break; - case 0x16: - max_size = F16H_MPB_MAX_SIZE; - break; - default: - max_size = F1XH_MPB_MAX_SIZE; - break; - } - - if (patch_size > min_t(u32, size, max_size)) { - pr_err("patch size mismatch\n"); - return 0; - } - - return patch_size; -} - -int __apply_microcode_amd(struct microcode_amd *mc_amd) -{ - u32 rev, dummy; - - native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); - - /* verify patch application was successful */ - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev != mc_amd->hdr.patch_id) - return -1; - - return 0; -} - -int apply_microcode_amd(int cpu) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - struct microcode_amd *mc_amd; - struct ucode_cpu_info *uci; - struct ucode_patch *p; - u32 rev, dummy; - - BUG_ON(raw_smp_processor_id() != cpu); - - uci = ucode_cpu_info + cpu; - - p = find_patch(cpu); - if (!p) - return 0; - - mc_amd = p->data; - uci->mc = p->data; - - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - /* need to apply patch? */ - if (rev >= mc_amd->hdr.patch_id) { - c->microcode = rev; - uci->cpu_sig.rev = rev; - return 0; - } - - if (__apply_microcode_amd(mc_amd)) { - pr_err("CPU%d: update failed for patch_level=0x%08x\n", - cpu, mc_amd->hdr.patch_id); - return -1; - } - pr_info("CPU%d: new patch_level=0x%08x\n", cpu, - mc_amd->hdr.patch_id); - - uci->cpu_sig.rev = mc_amd->hdr.patch_id; - c->microcode = mc_amd->hdr.patch_id; - - return 0; -} - -static int install_equiv_cpu_table(const u8 *buf) -{ - unsigned int *ibuf = (unsigned int *)buf; - unsigned int type = ibuf[1]; - unsigned int size = ibuf[2]; - - if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - pr_err("empty section/" - "invalid type field in container file section header\n"); - return -EINVAL; - } - - equiv_cpu_table = vmalloc(size); - if (!equiv_cpu_table) { - pr_err("failed to allocate equivalent CPU table\n"); - return -ENOMEM; - } - - memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size); - - /* add header length */ - return size + CONTAINER_HDR_SZ; -} - -static void free_equiv_cpu_table(void) -{ - vfree(equiv_cpu_table); - equiv_cpu_table = NULL; -} - -static void cleanup(void) -{ - free_equiv_cpu_table(); - free_cache(); -} - -/* - * We return the current size even if some of the checks failed so that - * we can skip over the next patch. If we return a negative value, we - * signal a grave error like a memory allocation has failed and the - * driver cannot continue functioning normally. In such cases, we tear - * down everything we've used up so far and exit. - */ -static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) -{ - struct microcode_header_amd *mc_hdr; - struct ucode_patch *patch; - unsigned int patch_size, crnt_size, ret; - u32 proc_fam; - u16 proc_id; - - patch_size = *(u32 *)(fw + 4); - crnt_size = patch_size + SECTION_HDR_SIZE; - mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); - proc_id = mc_hdr->processor_rev_id; - - proc_fam = find_cpu_family_by_equiv_cpu(proc_id); - if (!proc_fam) { - pr_err("No patch family for equiv ID: 0x%04x\n", proc_id); - return crnt_size; - } - - /* check if patch is for the current family */ - proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff); - if (proc_fam != family) - return crnt_size; - - if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { - pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", - mc_hdr->patch_id); - return crnt_size; - } - - ret = verify_patch_size(family, patch_size, leftover); - if (!ret) { - pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id); - return crnt_size; - } - - patch = kzalloc(sizeof(*patch), GFP_KERNEL); - if (!patch) { - pr_err("Patch allocation failure.\n"); - return -EINVAL; - } - - patch->data = kzalloc(patch_size, GFP_KERNEL); - if (!patch->data) { - pr_err("Patch data allocation failure.\n"); - kfree(patch); - return -EINVAL; - } - - /* All looks ok, copy patch... */ - memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); - INIT_LIST_HEAD(&patch->plist); - patch->patch_id = mc_hdr->patch_id; - patch->equiv_cpu = proc_id; - - pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", - __func__, patch->patch_id, proc_id); - - /* ... and add to cache. */ - update_cache(patch); - - return crnt_size; -} - -static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, - size_t size) -{ - enum ucode_state ret = UCODE_ERROR; - unsigned int leftover; - u8 *fw = (u8 *)data; - int crnt_size = 0; - int offset; - - offset = install_equiv_cpu_table(data); - if (offset < 0) { - pr_err("failed to create equivalent cpu table\n"); - return ret; - } - fw += offset; - leftover = size - offset; - - if (*(u32 *)fw != UCODE_UCODE_TYPE) { - pr_err("invalid type field in container file section header\n"); - free_equiv_cpu_table(); - return ret; - } - - while (leftover) { - crnt_size = verify_and_add_patch(family, fw, leftover); - if (crnt_size < 0) - return ret; - - fw += crnt_size; - leftover -= crnt_size; - } - - return UCODE_OK; -} - -enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) -{ - enum ucode_state ret; - - /* free old equiv table */ - free_equiv_cpu_table(); - - ret = __load_microcode_amd(family, data, size); - - if (ret != UCODE_OK) - cleanup(); - -#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) - /* save BSP's matching patch for early load */ - if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { - struct ucode_patch *p = find_patch(smp_processor_id()); - if (p) { - memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); - memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), - PATCH_MAX_SIZE)); - } - } -#endif - return ret; -} - -/* - * AMD microcode firmware naming convention, up to family 15h they are in - * the legacy file: - * - * amd-ucode/microcode_amd.bin - * - * This legacy file is always smaller than 2K in size. - * - * Beginning with family 15h, they are in family-specific firmware files: - * - * amd-ucode/microcode_amd_fam15h.bin - * amd-ucode/microcode_amd_fam16h.bin - * ... - * - * These might be larger than 2K. - */ -static enum ucode_state request_microcode_amd(int cpu, struct device *device, - bool refresh_fw) -{ - char fw_name[36] = "amd-ucode/microcode_amd.bin"; - struct cpuinfo_x86 *c = &cpu_data(cpu); - enum ucode_state ret = UCODE_NFOUND; - const struct firmware *fw; - - /* reload ucode container only on the boot cpu */ - if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index) - return UCODE_OK; - - if (c->x86 >= 0x15) - snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); - - if (request_firmware(&fw, (const char *)fw_name, device)) { - pr_debug("failed to load file %s\n", fw_name); - goto out; - } - - ret = UCODE_ERROR; - if (*(u32 *)fw->data != UCODE_MAGIC) { - pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data); - goto fw_release; - } - - ret = load_microcode_amd(c->x86, fw->data, fw->size); - - fw_release: - release_firmware(fw); - - out: - return ret; -} - -static enum ucode_state -request_microcode_user(int cpu, const void __user *buf, size_t size) -{ - return UCODE_ERROR; -} - -static void microcode_fini_cpu_amd(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - uci->mc = NULL; -} - -static struct microcode_ops microcode_amd_ops = { - .request_microcode_user = request_microcode_user, - .request_microcode_fw = request_microcode_amd, - .collect_cpu_info = collect_cpu_info_amd, - .apply_microcode = apply_microcode_amd, - .microcode_fini_cpu = microcode_fini_cpu_amd, -}; - -struct microcode_ops * __init init_amd_microcode(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - pr_warning("AMD CPU family 0x%x not supported\n", c->x86); - return NULL; - } - - return µcode_amd_ops; -} - -void __exit exit_amd_microcode(void) -{ - cleanup(); -} diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/microcode_amd_early.c deleted file mode 100644 index 8384c0f..0000000 --- a/arch/x86/kernel/microcode_amd_early.c +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Copyright (C) 2013 Advanced Micro Devices, Inc. - * - * Author: Jacob Shin - * Fixes: Borislav Petkov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include - -#include -#include -#include - -/* - * This points to the current valid container of microcode patches which we will - * save from the initrd before jettisoning its contents. - */ -static u8 *container; -static size_t container_size; - -static u32 ucode_new_rev; -u8 amd_ucode_patch[PATCH_MAX_SIZE]; -static u16 this_equiv_id; - -struct cpio_data ucode_cpio; - -/* - * Microcode patch container file is prepended to the initrd in cpio format. - * See Documentation/x86/early-microcode.txt - */ -static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; - -static struct cpio_data __init find_ucode_in_initrd(void) -{ - long offset = 0; - char *path; - void *start; - size_t size; - -#ifdef CONFIG_X86_32 - struct boot_params *p; - - /* - * On 32-bit, early load occurs before paging is turned on so we need - * to use physical addresses. - */ - p = (struct boot_params *)__pa_nodebug(&boot_params); - path = (char *)__pa_nodebug(ucode_path); - start = (void *)p->hdr.ramdisk_image; - size = p->hdr.ramdisk_size; -#else - path = ucode_path; - start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); - size = boot_params.hdr.ramdisk_size; -#endif - - return find_cpio_data(path, start, size, &offset); -} - -static size_t compute_container_size(u8 *data, u32 total_size) -{ - size_t size = 0; - u32 *header = (u32 *)data; - - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return size; - - size = header[2] + CONTAINER_HDR_SZ; - total_size -= size; - data += size; - - while (total_size) { - u16 patch_size; - - header = (u32 *)data; - - if (header[0] != UCODE_UCODE_TYPE) - break; - - /* - * Sanity-check patch size. - */ - patch_size = header[1]; - if (patch_size > PATCH_MAX_SIZE) - break; - - size += patch_size + SECTION_HDR_SIZE; - data += patch_size + SECTION_HDR_SIZE; - total_size -= patch_size + SECTION_HDR_SIZE; - } - - return size; -} - -/* - * Early load occurs before we can vmalloc(). So we look for the microcode - * patch container file in initrd, traverse equivalent cpu table, look for a - * matching microcode patch, and update, all in initrd memory in place. - * When vmalloc() is available for use later -- on 64-bit during first AP load, - * and on 32-bit during save_microcode_in_initrd_amd() -- we can call - * load_microcode_amd() to save equivalent cpu table and microcode patches in - * kernel heap memory. - */ -static void apply_ucode_in_initrd(void *ucode, size_t size) -{ - struct equiv_cpu_entry *eq; - size_t *cont_sz; - u32 *header; - u8 *data, **cont; - u16 eq_id = 0; - int offset, left; - u32 rev, eax, ebx, ecx, edx; - u32 *new_rev; - -#ifdef CONFIG_X86_32 - new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - cont_sz = (size_t *)__pa_nodebug(&container_size); - cont = (u8 **)__pa_nodebug(&container); -#else - new_rev = &ucode_new_rev; - cont_sz = &container_size; - cont = &container; -#endif - - data = ucode; - left = size; - header = (u32 *)data; - - /* find equiv cpu table */ - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return; - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - - while (left > 0) { - eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); - - *cont = data; - - /* Advance past the container header */ - offset = header[2] + CONTAINER_HDR_SZ; - data += offset; - left -= offset; - - eq_id = find_equiv_id(eq, eax); - if (eq_id) { - this_equiv_id = eq_id; - *cont_sz = compute_container_size(*cont, left + offset); - - /* - * truncate how much we need to iterate over in the - * ucode update loop below - */ - left = *cont_sz - offset; - break; - } - - /* - * support multiple container files appended together. if this - * one does not have a matching equivalent cpu entry, we fast - * forward to the next container file. - */ - while (left > 0) { - header = (u32 *)data; - if (header[0] == UCODE_MAGIC && - header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) - break; - - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; - } - - /* mark where the next microcode container file starts */ - offset = data - (u8 *)ucode; - ucode = data; - } - - if (!eq_id) { - *cont = NULL; - *cont_sz = 0; - return; - } - - /* find ucode and update if needed */ - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); - - while (left > 0) { - struct microcode_amd *mc; - - header = (u32 *)data; - if (header[0] != UCODE_UCODE_TYPE || /* type */ - header[1] == 0) /* size */ - break; - - mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); - - if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { - - if (!__apply_microcode_amd(mc)) { - rev = mc->hdr.patch_id; - *new_rev = rev; - - /* save ucode patch */ - memcpy(amd_ucode_patch, mc, - min_t(u32, header[1], PATCH_MAX_SIZE)); - } - } - - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; - } -} - -void __init load_ucode_amd_bsp(void) -{ - struct cpio_data cp; - void **data; - size_t *size; - -#ifdef CONFIG_X86_32 - data = (void **)__pa_nodebug(&ucode_cpio.data); - size = (size_t *)__pa_nodebug(&ucode_cpio.size); -#else - data = &ucode_cpio.data; - size = &ucode_cpio.size; -#endif - - cp = find_ucode_in_initrd(); - if (!cp.data) - return; - - *data = cp.data; - *size = cp.size; - - apply_ucode_in_initrd(cp.data, cp.size); -} - -#ifdef CONFIG_X86_32 -/* - * On 32-bit, since AP's early load occurs before paging is turned on, we - * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during - * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During - * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, - * which is used upon resume from suspend. - */ -void load_ucode_amd_ap(void) -{ - struct microcode_amd *mc; - size_t *usize; - void **ucode; - - mc = (struct microcode_amd *)__pa(amd_ucode_patch); - if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { - __apply_microcode_amd(mc); - return; - } - - ucode = (void *)__pa_nodebug(&container); - usize = (size_t *)__pa_nodebug(&container_size); - - if (!*ucode || !*usize) - return; - - apply_ucode_in_initrd(*ucode, *usize); -} - -static void __init collect_cpu_sig_on_bsp(void *arg) -{ - unsigned int cpu = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - uci->cpu_sig.sig = cpuid_eax(0x00000001); -} -#else -void load_ucode_amd_ap(void) -{ - unsigned int cpu = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct equiv_cpu_entry *eq; - struct microcode_amd *mc; - u32 rev, eax; - u16 eq_id; - - /* Exit if called on the BSP. */ - if (!cpu) - return; - - if (!container) - return; - - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); - - uci->cpu_sig.rev = rev; - uci->cpu_sig.sig = eax; - - eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); - - eq_id = find_equiv_id(eq, eax); - if (!eq_id) - return; - - if (eq_id == this_equiv_id) { - mc = (struct microcode_amd *)amd_ucode_patch; - - if (mc && rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) - ucode_new_rev = mc->hdr.patch_id; - } - - } else { - if (!ucode_cpio.data) - return; - - /* - * AP has a different equivalence ID than BSP, looks like - * mixed-steppings silicon so go through the ucode blob anew. - */ - apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size); - } -} -#endif - -int __init save_microcode_in_initrd_amd(void) -{ - enum ucode_state ret; - u32 eax; - -#ifdef CONFIG_X86_32 - unsigned int bsp = boot_cpu_data.cpu_index; - struct ucode_cpu_info *uci = ucode_cpu_info + bsp; - - if (!uci->cpu_sig.sig) - smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); - - /* - * Take into account the fact that the ramdisk might get relocated - * and therefore we need to recompute the container's position in - * virtual memory space. - */ - container = (u8 *)(__va((u32)relocated_ramdisk) + - ((u32)container - boot_params.hdr.ramdisk_image)); -#endif - if (ucode_new_rev) - pr_info("microcode: updated early to new patch_level=0x%08x\n", - ucode_new_rev); - - if (!container) - return -EINVAL; - - eax = cpuid_eax(0x00000001); - eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); - - ret = load_microcode_amd(eax, container, container_size); - if (ret != UCODE_OK) - return -EINVAL; - - /* - * This will be freed any msec now, stash patches for the current - * family and switch to patch cache for cpu hotplug, etc later. - */ - container = NULL; - container_size = 0; - - return 0; -} diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c deleted file mode 100644 index 15c9876..0000000 --- a/arch/x86/kernel/microcode_core.c +++ /dev/null @@ -1,645 +0,0 @@ -/* - * Intel CPU Microcode Update Driver for Linux - * - * Copyright (C) 2000-2006 Tigran Aivazian - * 2006 Shaohua Li - * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/Assets/PDF/manual/253668.pdf - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 1.0 16 Feb 2000, Tigran Aivazian - * Initial release. - * 1.01 18 Feb 2000, Tigran Aivazian - * Added read() support + cleanups. - * 1.02 21 Feb 2000, Tigran Aivazian - * Added 'device trimming' support. open(O_WRONLY) zeroes - * and frees the saved copy of applied microcode. - * 1.03 29 Feb 2000, Tigran Aivazian - * Made to use devfs (/dev/cpu/microcode) + cleanups. - * 1.04 06 Jun 2000, Simon Trimmer - * Added misc device support (now uses both devfs and misc). - * Added MICROCODE_IOCFREE ioctl to clear memory. - * 1.05 09 Jun 2000, Simon Trimmer - * Messages for error cases (non Intel & no suitable microcode). - * 1.06 03 Aug 2000, Tigran Aivazian - * Removed ->release(). Removed exclusive open and status bitmap. - * Added microcode_rwsem to serialize read()/write()/ioctl(). - * Removed global kernel lock usage. - * 1.07 07 Sep 2000, Tigran Aivazian - * Write 0 to 0x8B msr and then cpuid before reading revision, - * so that it works even if there were no update done by the - * BIOS. Otherwise, reading from 0x8B gives junk (which happened - * to be 0 on my machine which is why it worked even when I - * disabled update by the BIOS) - * Thanks to Eric W. Biederman for the fix. - * 1.08 11 Dec 2000, Richard Schaal and - * Tigran Aivazian - * Intel Pentium 4 processor support and bugfixes. - * 1.09 30 Oct 2001, Tigran Aivazian - * Bugfix for HT (Hyper-Threading) enabled processors - * whereby processor resources are shared by all logical processors - * in a single CPU package. - * 1.10 28 Feb 2002 Asit K Mallick and - * Tigran Aivazian , - * Serialize updates as required on HT processors due to - * speculative nature of implementation. - * 1.11 22 Mar 2002 Tigran Aivazian - * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble , - * Jun Nakajima - * Support for the microcode updates in the new format. - * 1.13 10 Oct 2003 Tigran Aivazian - * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode - * in kernel memory. - * 1.14 25 Jun 2004 Tigran Aivazian - * Fix sigmatch() macro to handle old CPUs with pf == 0. - * Thanks to Stuart Swales for pointing out this bug. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -MODULE_DESCRIPTION("Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian "); -MODULE_LICENSE("GPL"); - -#define MICROCODE_VERSION "2.00" - -static struct microcode_ops *microcode_ops; - -/* - * Synchronization. - * - * All non cpu-hotplug-callback call sites use: - * - * - microcode_mutex to synchronize with each other; - * - get/put_online_cpus() to synchronize with - * the cpu-hotplug-callback call sites. - * - * We guarantee that only a single cpu is being - * updated at any particular moment of time. - */ -static DEFINE_MUTEX(microcode_mutex); - -struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -EXPORT_SYMBOL_GPL(ucode_cpu_info); - -/* - * Operations that are run on a target cpu: - */ - -struct cpu_info_ctx { - struct cpu_signature *cpu_sig; - int err; -}; - -static void collect_cpu_info_local(void *arg) -{ - struct cpu_info_ctx *ctx = arg; - - ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), - ctx->cpu_sig); -} - -static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) -{ - struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; - int ret; - - ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); - if (!ret) - ret = ctx.err; - - return ret; -} - -static int collect_cpu_info(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - int ret; - - memset(uci, 0, sizeof(*uci)); - - ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); - if (!ret) - uci->valid = 1; - - return ret; -} - -struct apply_microcode_ctx { - int err; -}; - -static void apply_microcode_local(void *arg) -{ - struct apply_microcode_ctx *ctx = arg; - - ctx->err = microcode_ops->apply_microcode(smp_processor_id()); -} - -static int apply_microcode_on_target(int cpu) -{ - struct apply_microcode_ctx ctx = { .err = 0 }; - int ret; - - ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); - if (!ret) - ret = ctx.err; - - return ret; -} - -#ifdef CONFIG_MICROCODE_OLD_INTERFACE -static int do_microcode_update(const void __user *buf, size_t size) -{ - int error = 0; - int cpu; - - for_each_online_cpu(cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - enum ucode_state ustate; - - if (!uci->valid) - continue; - - ustate = microcode_ops->request_microcode_user(cpu, buf, size); - if (ustate == UCODE_ERROR) { - error = -1; - break; - } else if (ustate == UCODE_OK) - apply_microcode_on_target(cpu); - } - - return error; -} - -static int microcode_open(struct inode *inode, struct file *file) -{ - return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM; -} - -static ssize_t microcode_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) -{ - ssize_t ret = -EINVAL; - - if ((len >> PAGE_SHIFT) > totalram_pages) { - pr_err("too much data (max %ld pages)\n", totalram_pages); - return ret; - } - - get_online_cpus(); - mutex_lock(µcode_mutex); - - if (do_microcode_update(buf, len) == 0) - ret = (ssize_t)len; - - if (ret > 0) - perf_check_microcode(); - - mutex_unlock(µcode_mutex); - put_online_cpus(); - - return ret; -} - -static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, - .llseek = no_llseek, -}; - -static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .nodename = "cpu/microcode", - .fops = µcode_fops, -}; - -static int __init microcode_dev_init(void) -{ - int error; - - error = misc_register(µcode_dev); - if (error) { - pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); - return error; - } - - return 0; -} - -static void __exit microcode_dev_exit(void) -{ - misc_deregister(µcode_dev); -} - -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); -MODULE_ALIAS("devname:cpu/microcode"); -#else -#define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while (0) -#endif - -/* fake device for request_firmware */ -static struct platform_device *microcode_pdev; - -static int reload_for_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - enum ucode_state ustate; - int err = 0; - - if (!uci->valid) - return err; - - ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, true); - if (ustate == UCODE_OK) - apply_microcode_on_target(cpu); - else - if (ustate == UCODE_ERROR) - err = -EINVAL; - return err; -} - -static ssize_t reload_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t size) -{ - unsigned long val; - int cpu; - ssize_t ret = 0, tmp_ret; - - ret = kstrtoul(buf, 0, &val); - if (ret) - return ret; - - if (val != 1) - return size; - - get_online_cpus(); - mutex_lock(µcode_mutex); - for_each_online_cpu(cpu) { - tmp_ret = reload_for_cpu(cpu); - if (tmp_ret != 0) - pr_warn("Error reloading microcode on CPU %d\n", cpu); - - /* save retval of the first encountered reload error */ - if (!ret) - ret = tmp_ret; - } - if (!ret) - perf_check_microcode(); - mutex_unlock(µcode_mutex); - put_online_cpus(); - - if (!ret) - ret = size; - - return ret; -} - -static ssize_t version_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); -} - -static ssize_t pf_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); -} - -static DEVICE_ATTR(reload, 0200, NULL, reload_store); -static DEVICE_ATTR(version, 0400, version_show, NULL); -static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); - -static struct attribute *mc_default_attrs[] = { - &dev_attr_version.attr, - &dev_attr_processor_flags.attr, - NULL -}; - -static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", -}; - -static void microcode_fini_cpu(int cpu) -{ - microcode_ops->microcode_fini_cpu(cpu); -} - -static enum ucode_state microcode_resume_cpu(int cpu) -{ - pr_debug("CPU%d updated upon resume\n", cpu); - - if (apply_microcode_on_target(cpu)) - return UCODE_ERROR; - - return UCODE_OK; -} - -static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) -{ - enum ucode_state ustate; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (uci && uci->valid) - return UCODE_OK; - - if (collect_cpu_info(cpu)) - return UCODE_ERROR; - - /* --dimm. Trigger a delayed update? */ - if (system_state != SYSTEM_RUNNING) - return UCODE_NFOUND; - - ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, - refresh_fw); - - if (ustate == UCODE_OK) { - pr_debug("CPU%d updated upon init\n", cpu); - apply_microcode_on_target(cpu); - } - - return ustate; -} - -static enum ucode_state microcode_update_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (uci->valid) - return microcode_resume_cpu(cpu); - - return microcode_init_cpu(cpu, false); -} - -static int mc_device_add(struct device *dev, struct subsys_interface *sif) -{ - int err, cpu = dev->id; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("CPU%d added\n", cpu); - - err = sysfs_create_group(&dev->kobj, &mc_attr_group); - if (err) - return err; - - if (microcode_init_cpu(cpu, true) == UCODE_ERROR) - return -EINVAL; - - return err; -} - -static int mc_device_remove(struct device *dev, struct subsys_interface *sif) -{ - int cpu = dev->id; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("CPU%d removed\n", cpu); - microcode_fini_cpu(cpu); - sysfs_remove_group(&dev->kobj, &mc_attr_group); - return 0; -} - -static struct subsys_interface mc_cpu_interface = { - .name = "microcode", - .subsys = &cpu_subsys, - .add_dev = mc_device_add, - .remove_dev = mc_device_remove, -}; - -/** - * mc_bp_resume - Update boot CPU microcode during resume. - */ -static void mc_bp_resume(void) -{ - int cpu = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (uci->valid && uci->mc) - microcode_ops->apply_microcode(cpu); -} - -static struct syscore_ops mc_syscore_ops = { - .resume = mc_bp_resume, -}; - -static int -mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct device *dev; - - dev = get_cpu_device(cpu); - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - microcode_update_cpu(cpu); - pr_debug("CPU%d added\n", cpu); - /* - * "break" is missing on purpose here because we want to fall - * through in order to create the sysfs group. - */ - - case CPU_DOWN_FAILED: - if (sysfs_create_group(&dev->kobj, &mc_attr_group)) - pr_err("Failed to create group for CPU%d\n", cpu); - break; - - case CPU_DOWN_PREPARE: - /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&dev->kobj, &mc_attr_group); - pr_debug("CPU%d removed\n", cpu); - break; - - /* - * case CPU_DEAD: - * - * When a CPU goes offline, don't free up or invalidate the copy of - * the microcode in kernel memory, so that we can reuse it when the - * CPU comes back online without unnecessarily requesting the userspace - * for it again. - */ - } - - /* The CPU refused to come up during a system resume */ - if (action == CPU_UP_CANCELED_FROZEN) - microcode_fini_cpu(cpu); - - return NOTIFY_OK; -} - -static struct notifier_block __refdata mc_cpu_notifier = { - .notifier_call = mc_cpu_callback, -}; - -#ifdef MODULE -/* Autoload on Intel and AMD systems */ -static const struct x86_cpu_id __initconst microcode_id[] = { -#ifdef CONFIG_MICROCODE_INTEL - { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, -#endif -#ifdef CONFIG_MICROCODE_AMD - { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, -#endif - {} -}; -MODULE_DEVICE_TABLE(x86cpu, microcode_id); -#endif - -static struct attribute *cpu_root_microcode_attrs[] = { - &dev_attr_reload.attr, - NULL -}; - -static struct attribute_group cpu_root_microcode_group = { - .name = "microcode", - .attrs = cpu_root_microcode_attrs, -}; - -static int __init microcode_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int error; - - if (c->x86_vendor == X86_VENDOR_INTEL) - microcode_ops = init_intel_microcode(); - else if (c->x86_vendor == X86_VENDOR_AMD) - microcode_ops = init_amd_microcode(); - else - pr_err("no support for this CPU vendor\n"); - - if (!microcode_ops) - return -ENODEV; - - microcode_pdev = platform_device_register_simple("microcode", -1, - NULL, 0); - if (IS_ERR(microcode_pdev)) - return PTR_ERR(microcode_pdev); - - get_online_cpus(); - mutex_lock(µcode_mutex); - - error = subsys_interface_register(&mc_cpu_interface); - if (!error) - perf_check_microcode(); - mutex_unlock(µcode_mutex); - put_online_cpus(); - - if (error) - goto out_pdev; - - error = sysfs_create_group(&cpu_subsys.dev_root->kobj, - &cpu_root_microcode_group); - - if (error) { - pr_err("Error creating microcode group!\n"); - goto out_driver; - } - - error = microcode_dev_init(); - if (error) - goto out_ucode_group; - - register_syscore_ops(&mc_syscore_ops); - register_hotcpu_notifier(&mc_cpu_notifier); - - pr_info("Microcode Update Driver: v" MICROCODE_VERSION - " , Peter Oruba\n"); - - return 0; - - out_ucode_group: - sysfs_remove_group(&cpu_subsys.dev_root->kobj, - &cpu_root_microcode_group); - - out_driver: - get_online_cpus(); - mutex_lock(µcode_mutex); - - subsys_interface_unregister(&mc_cpu_interface); - - mutex_unlock(µcode_mutex); - put_online_cpus(); - - out_pdev: - platform_device_unregister(microcode_pdev); - return error; - -} -module_init(microcode_init); - -static void __exit microcode_exit(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - microcode_dev_exit(); - - unregister_hotcpu_notifier(&mc_cpu_notifier); - unregister_syscore_ops(&mc_syscore_ops); - - sysfs_remove_group(&cpu_subsys.dev_root->kobj, - &cpu_root_microcode_group); - - get_online_cpus(); - mutex_lock(µcode_mutex); - - subsys_interface_unregister(&mc_cpu_interface); - - mutex_unlock(µcode_mutex); - put_online_cpus(); - - platform_device_unregister(microcode_pdev); - - microcode_ops = NULL; - - if (c->x86_vendor == X86_VENDOR_AMD) - exit_amd_microcode(); - - pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); -} -module_exit(microcode_exit); diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/microcode_core_early.c deleted file mode 100644 index be7f851..0000000 --- a/arch/x86/kernel/microcode_core_early.c +++ /dev/null @@ -1,141 +0,0 @@ -/* - * X86 CPU microcode early update for Linux - * - * Copyright (C) 2012 Fenghua Yu - * H Peter Anvin" - * - * This driver allows to early upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture - * Software Developer's Manual. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include - -#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) -#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') -#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') -#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') -#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') -#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') -#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') - -#define CPUID_IS(a, b, c, ebx, ecx, edx) \ - (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) - -/* - * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. - * x86_vendor() gets vendor id for BSP. - * - * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify - * coding, we still use x86_vendor() to get vendor id for AP. - * - * x86_vendor() gets vendor information directly through cpuid. - */ -static int x86_vendor(void) -{ - u32 eax = 0x00000000; - u32 ebx, ecx = 0, edx; - - native_cpuid(&eax, &ebx, &ecx, &edx); - - if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) - return X86_VENDOR_INTEL; - - if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) - return X86_VENDOR_AMD; - - return X86_VENDOR_UNKNOWN; -} - -static int x86_family(void) -{ - u32 eax = 0x00000001; - u32 ebx, ecx = 0, edx; - int x86; - - native_cpuid(&eax, &ebx, &ecx, &edx); - - x86 = (eax >> 8) & 0xf; - if (x86 == 15) - x86 += (eax >> 20) & 0xff; - - return x86; -} - -void __init load_ucode_bsp(void) -{ - int vendor, x86; - - if (!have_cpuid_p()) - return; - - vendor = x86_vendor(); - x86 = x86_family(); - - switch (vendor) { - case X86_VENDOR_INTEL: - if (x86 >= 6) - load_ucode_intel_bsp(); - break; - case X86_VENDOR_AMD: - if (x86 >= 0x10) - load_ucode_amd_bsp(); - break; - default: - break; - } -} - -void load_ucode_ap(void) -{ - int vendor, x86; - - if (!have_cpuid_p()) - return; - - vendor = x86_vendor(); - x86 = x86_family(); - - switch (vendor) { - case X86_VENDOR_INTEL: - if (x86 >= 6) - load_ucode_intel_ap(); - break; - case X86_VENDOR_AMD: - if (x86 >= 0x10) - load_ucode_amd_ap(); - break; - default: - break; - } -} - -int __init save_microcode_in_initrd(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - if (c->x86 >= 6) - save_microcode_in_initrd_intel(); - break; - case X86_VENDOR_AMD: - if (c->x86 >= 0x10) - save_microcode_in_initrd_amd(); - break; - default: - break; - } - - return 0; -} diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c deleted file mode 100644 index 5fb2ceb..0000000 --- a/arch/x86/kernel/microcode_intel.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Intel CPU Microcode Update Driver for Linux - * - * Copyright (C) 2000-2006 Tigran Aivazian - * 2006 Shaohua Li - * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/Assets/PDF/manual/253668.pdf - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 1.0 16 Feb 2000, Tigran Aivazian - * Initial release. - * 1.01 18 Feb 2000, Tigran Aivazian - * Added read() support + cleanups. - * 1.02 21 Feb 2000, Tigran Aivazian - * Added 'device trimming' support. open(O_WRONLY) zeroes - * and frees the saved copy of applied microcode. - * 1.03 29 Feb 2000, Tigran Aivazian - * Made to use devfs (/dev/cpu/microcode) + cleanups. - * 1.04 06 Jun 2000, Simon Trimmer - * Added misc device support (now uses both devfs and misc). - * Added MICROCODE_IOCFREE ioctl to clear memory. - * 1.05 09 Jun 2000, Simon Trimmer - * Messages for error cases (non Intel & no suitable microcode). - * 1.06 03 Aug 2000, Tigran Aivazian - * Removed ->release(). Removed exclusive open and status bitmap. - * Added microcode_rwsem to serialize read()/write()/ioctl(). - * Removed global kernel lock usage. - * 1.07 07 Sep 2000, Tigran Aivazian - * Write 0 to 0x8B msr and then cpuid before reading revision, - * so that it works even if there were no update done by the - * BIOS. Otherwise, reading from 0x8B gives junk (which happened - * to be 0 on my machine which is why it worked even when I - * disabled update by the BIOS) - * Thanks to Eric W. Biederman for the fix. - * 1.08 11 Dec 2000, Richard Schaal and - * Tigran Aivazian - * Intel Pentium 4 processor support and bugfixes. - * 1.09 30 Oct 2001, Tigran Aivazian - * Bugfix for HT (Hyper-Threading) enabled processors - * whereby processor resources are shared by all logical processors - * in a single CPU package. - * 1.10 28 Feb 2002 Asit K Mallick and - * Tigran Aivazian , - * Serialize updates as required on HT processors due to - * speculative nature of implementation. - * 1.11 22 Mar 2002 Tigran Aivazian - * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble , - * Jun Nakajima - * Support for the microcode updates in the new format. - * 1.13 10 Oct 2003 Tigran Aivazian - * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode - * in kernel memory. - * 1.14 25 Jun 2004 Tigran Aivazian - * Fix sigmatch() macro to handle old CPUs with pf == 0. - * Thanks to Stuart Swales for pointing out this bug. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include - -#include -#include -#include - -MODULE_DESCRIPTION("Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian "); -MODULE_LICENSE("GPL"); - -static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu_num); - unsigned int val[2]; - - memset(csig, 0, sizeof(*csig)); - - csig->sig = cpuid_eax(0x00000001); - - if ((c->x86_model >= 5) || (c->x86 > 6)) { - /* get processor flags from MSR 0x17 */ - rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig->pf = 1 << ((val[1] >> 18) & 7); - } - - csig->rev = c->microcode; - pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", - cpu_num, csig->sig, csig->pf, csig->rev); - - return 0; -} - -/* - * return 0 - no update found - * return 1 - found update - */ -static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) -{ - struct cpu_signature cpu_sig; - unsigned int csig, cpf, crev; - - collect_cpu_info(cpu, &cpu_sig); - - csig = cpu_sig.sig; - cpf = cpu_sig.pf; - crev = cpu_sig.rev; - - return get_matching_microcode(csig, cpf, mc_intel, crev); -} - -int apply_microcode(int cpu) -{ - struct microcode_intel *mc_intel; - struct ucode_cpu_info *uci; - unsigned int val[2]; - int cpu_num = raw_smp_processor_id(); - struct cpuinfo_x86 *c = &cpu_data(cpu_num); - - uci = ucode_cpu_info + cpu; - mc_intel = uci->mc; - - /* We should bind the task to the CPU */ - BUG_ON(cpu_num != cpu); - - if (mc_intel == NULL) - return 0; - - /* - * Microcode on this CPU could be updated earlier. Only apply the - * microcode patch in mc_intel when it is newer than the one on this - * CPU. - */ - if (get_matching_mc(mc_intel, cpu) == 0) - return 0; - - /* write microcode via MSR 0x79 */ - wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - - if (val[1] != mc_intel->hdr.rev) { - pr_err("CPU%d update to revision 0x%x failed\n", - cpu_num, mc_intel->hdr.rev); - return -1; - } - pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", - cpu_num, val[1], - mc_intel->hdr.date & 0xffff, - mc_intel->hdr.date >> 24, - (mc_intel->hdr.date >> 16) & 0xff); - - uci->cpu_sig.rev = val[1]; - c->microcode = val[1]; - - return 0; -} - -static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; - int new_rev = uci->cpu_sig.rev; - unsigned int leftover = size; - enum ucode_state state = UCODE_OK; - unsigned int curr_mc_size = 0; - unsigned int csig, cpf; - - while (leftover) { - struct microcode_header_intel mc_header; - unsigned int mc_size; - - if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) - break; - - mc_size = get_totalsize(&mc_header); - if (!mc_size || mc_size > leftover) { - pr_err("error! Bad data in microcode data file\n"); - break; - } - - /* For performance reasons, reuse mc area when possible */ - if (!mc || mc_size > curr_mc_size) { - vfree(mc); - mc = vmalloc(mc_size); - if (!mc) - break; - curr_mc_size = mc_size; - } - - if (get_ucode_data(mc, ucode_ptr, mc_size) || - microcode_sanity_check(mc, 1) < 0) { - break; - } - - csig = uci->cpu_sig.sig; - cpf = uci->cpu_sig.pf; - if (get_matching_microcode(csig, cpf, mc, new_rev)) { - vfree(new_mc); - new_rev = mc_header.rev; - new_mc = mc; - mc = NULL; /* trigger new vmalloc */ - } - - ucode_ptr += mc_size; - leftover -= mc_size; - } - - vfree(mc); - - if (leftover) { - vfree(new_mc); - state = UCODE_ERROR; - goto out; - } - - if (!new_mc) { - state = UCODE_NFOUND; - goto out; - } - - vfree(uci->mc); - uci->mc = (struct microcode_intel *)new_mc; - - /* - * If early loading microcode is supported, save this mc into - * permanent memory. So it will be loaded early when a CPU is hot added - * or resumes. - */ - save_mc_for_early(new_mc); - - pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); -out: - return state; -} - -static int get_ucode_fw(void *to, const void *from, size_t n) -{ - memcpy(to, from, n); - return 0; -} - -static enum ucode_state request_microcode_fw(int cpu, struct device *device, - bool refresh_fw) -{ - char name[30]; - struct cpuinfo_x86 *c = &cpu_data(cpu); - const struct firmware *firmware; - enum ucode_state ret; - - sprintf(name, "intel-ucode/%02x-%02x-%02x", - c->x86, c->x86_model, c->x86_mask); - - if (request_firmware(&firmware, name, device)) { - pr_debug("data file %s load failed\n", name); - return UCODE_NFOUND; - } - - ret = generic_load_microcode(cpu, (void *)firmware->data, - firmware->size, &get_ucode_fw); - - release_firmware(firmware); - - return ret; -} - -static int get_ucode_user(void *to, const void *from, size_t n) -{ - return copy_from_user(to, from, n); -} - -static enum ucode_state -request_microcode_user(int cpu, const void __user *buf, size_t size) -{ - return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); -} - -static void microcode_fini_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - vfree(uci->mc); - uci->mc = NULL; -} - -static struct microcode_ops microcode_intel_ops = { - .request_microcode_user = request_microcode_user, - .request_microcode_fw = request_microcode_fw, - .collect_cpu_info = collect_cpu_info, - .apply_microcode = apply_microcode, - .microcode_fini_cpu = microcode_fini_cpu, -}; - -struct microcode_ops * __init init_intel_microcode(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64)) { - pr_err("Intel CPU family 0x%x not supported\n", c->x86); - return NULL; - } - - return µcode_intel_ops; -} - diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c deleted file mode 100644 index 18f7391..0000000 --- a/arch/x86/kernel/microcode_intel_early.c +++ /dev/null @@ -1,787 +0,0 @@ -/* - * Intel CPU microcode early update for Linux - * - * Copyright (C) 2012 Fenghua Yu - * H Peter Anvin" - * - * This allows to early upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture - * Software Developer's Manual. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; -struct mc_saved_data { - unsigned int mc_saved_count; - struct microcode_intel **mc_saved; -} mc_saved_data; - -static enum ucode_state -generic_load_microcode_early(struct microcode_intel **mc_saved_p, - unsigned int mc_saved_count, - struct ucode_cpu_info *uci) -{ - struct microcode_intel *ucode_ptr, *new_mc = NULL; - int new_rev = uci->cpu_sig.rev; - enum ucode_state state = UCODE_OK; - unsigned int mc_size; - struct microcode_header_intel *mc_header; - unsigned int csig = uci->cpu_sig.sig; - unsigned int cpf = uci->cpu_sig.pf; - int i; - - for (i = 0; i < mc_saved_count; i++) { - ucode_ptr = mc_saved_p[i]; - - mc_header = (struct microcode_header_intel *)ucode_ptr; - mc_size = get_totalsize(mc_header); - if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) { - new_rev = mc_header->rev; - new_mc = ucode_ptr; - } - } - - if (!new_mc) { - state = UCODE_NFOUND; - goto out; - } - - uci->mc = (struct microcode_intel *)new_mc; -out: - return state; -} - -static void -microcode_pointer(struct microcode_intel **mc_saved, - unsigned long *mc_saved_in_initrd, - unsigned long initrd_start, int mc_saved_count) -{ - int i; - - for (i = 0; i < mc_saved_count; i++) - mc_saved[i] = (struct microcode_intel *) - (mc_saved_in_initrd[i] + initrd_start); -} - -#ifdef CONFIG_X86_32 -static void -microcode_phys(struct microcode_intel **mc_saved_tmp, - struct mc_saved_data *mc_saved_data) -{ - int i; - struct microcode_intel ***mc_saved; - - mc_saved = (struct microcode_intel ***) - __pa_nodebug(&mc_saved_data->mc_saved); - for (i = 0; i < mc_saved_data->mc_saved_count; i++) { - struct microcode_intel *p; - - p = *(struct microcode_intel **) - __pa_nodebug(mc_saved_data->mc_saved + i); - mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); - } -} -#endif - -static enum ucode_state -load_microcode(struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, - unsigned long initrd_start, - struct ucode_cpu_info *uci) -{ - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data->mc_saved_count; - - if (!mc_saved_data->mc_saved) { - microcode_pointer(mc_saved_tmp, mc_saved_in_initrd, - initrd_start, count); - - return generic_load_microcode_early(mc_saved_tmp, count, uci); - } else { -#ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mc_saved_data); - return generic_load_microcode_early(mc_saved_tmp, count, uci); -#else - return generic_load_microcode_early(mc_saved_data->mc_saved, - count, uci); -#endif - } -} - -static u8 get_x86_family(unsigned long sig) -{ - u8 x86; - - x86 = (sig >> 8) & 0xf; - - if (x86 == 0xf) - x86 += (sig >> 20) & 0xff; - - return x86; -} - -static u8 get_x86_model(unsigned long sig) -{ - u8 x86, x86_model; - - x86 = get_x86_family(sig); - x86_model = (sig >> 4) & 0xf; - - if (x86 == 0x6 || x86 == 0xf) - x86_model += ((sig >> 16) & 0xf) << 4; - - return x86_model; -} - -/* - * Given CPU signature and a microcode patch, this function finds if the - * microcode patch has matching family and model with the CPU. - */ -static enum ucode_state -matching_model_microcode(struct microcode_header_intel *mc_header, - unsigned long sig) -{ - u8 x86, x86_model; - u8 x86_ucode, x86_model_ucode; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - unsigned long data_size = get_datasize(mc_header); - int ext_sigcount, i; - struct extended_signature *ext_sig; - - x86 = get_x86_family(sig); - x86_model = get_x86_model(sig); - - x86_ucode = get_x86_family(mc_header->sig); - x86_model_ucode = get_x86_model(mc_header->sig); - - if (x86 == x86_ucode && x86_model == x86_model_ucode) - return UCODE_OK; - - /* Look for ext. headers: */ - if (total_size <= data_size + MC_HEADER_SIZE) - return UCODE_NFOUND; - - ext_header = (struct extended_sigtable *) - mc_header + data_size + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - - for (i = 0; i < ext_sigcount; i++) { - x86_ucode = get_x86_family(ext_sig->sig); - x86_model_ucode = get_x86_model(ext_sig->sig); - - if (x86 == x86_ucode && x86_model == x86_model_ucode) - return UCODE_OK; - - ext_sig++; - } - - return UCODE_NFOUND; -} - -static int -save_microcode(struct mc_saved_data *mc_saved_data, - struct microcode_intel **mc_saved_src, - unsigned int mc_saved_count) -{ - int i, j; - struct microcode_intel **mc_saved_p; - int ret; - - if (!mc_saved_count) - return -EINVAL; - - /* - * Copy new microcode data. - */ - mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *), - GFP_KERNEL); - if (!mc_saved_p) - return -ENOMEM; - - for (i = 0; i < mc_saved_count; i++) { - struct microcode_intel *mc = mc_saved_src[i]; - struct microcode_header_intel *mc_header = &mc->hdr; - unsigned long mc_size = get_totalsize(mc_header); - mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL); - if (!mc_saved_p[i]) { - ret = -ENOMEM; - goto err; - } - if (!mc_saved_src[i]) { - ret = -EINVAL; - goto err; - } - memcpy(mc_saved_p[i], mc, mc_size); - } - - /* - * Point to newly saved microcode. - */ - mc_saved_data->mc_saved = mc_saved_p; - mc_saved_data->mc_saved_count = mc_saved_count; - - return 0; - -err: - for (j = 0; j <= i; j++) - kfree(mc_saved_p[j]); - kfree(mc_saved_p); - - return ret; -} - -/* - * A microcode patch in ucode_ptr is saved into mc_saved - * - if it has matching signature and newer revision compared to an existing - * patch mc_saved. - * - or if it is a newly discovered microcode patch. - * - * The microcode patch should have matching model with CPU. - */ -static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, - unsigned int *mc_saved_count_p) -{ - int i; - int found = 0; - unsigned int mc_saved_count = *mc_saved_count_p; - struct microcode_header_intel *mc_header; - - mc_header = (struct microcode_header_intel *)ucode_ptr; - for (i = 0; i < mc_saved_count; i++) { - unsigned int sig, pf; - unsigned int new_rev; - struct microcode_header_intel *mc_saved_header = - (struct microcode_header_intel *)mc_saved[i]; - sig = mc_saved_header->sig; - pf = mc_saved_header->pf; - new_rev = mc_header->rev; - - if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) { - found = 1; - if (update_match_revision(mc_header, new_rev)) { - /* - * Found an older ucode saved before. - * Replace the older one with this newer - * one. - */ - mc_saved[i] = - (struct microcode_intel *)ucode_ptr; - break; - } - } - } - if (i >= mc_saved_count && !found) - /* - * This ucode is first time discovered in ucode file. - * Save it to memory. - */ - mc_saved[mc_saved_count++] = - (struct microcode_intel *)ucode_ptr; - - *mc_saved_count_p = mc_saved_count; -} - -/* - * Get microcode matching with BSP's model. Only CPUs with the same model as - * BSP can stay in the platform. - */ -static enum ucode_state __init -get_matching_model_microcode(int cpu, unsigned long start, - void *data, size_t size, - struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, - struct ucode_cpu_info *uci) -{ - u8 *ucode_ptr = data; - unsigned int leftover = size; - enum ucode_state state = UCODE_OK; - unsigned int mc_size; - struct microcode_header_intel *mc_header; - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count = mc_saved_data->mc_saved_count; - int i; - - while (leftover) { - mc_header = (struct microcode_header_intel *)ucode_ptr; - - mc_size = get_totalsize(mc_header); - if (!mc_size || mc_size > leftover || - microcode_sanity_check(ucode_ptr, 0) < 0) - break; - - leftover -= mc_size; - - /* - * Since APs with same family and model as the BSP may boot in - * the platform, we need to find and save microcode patches - * with the same family and model as the BSP. - */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != - UCODE_OK) { - ucode_ptr += mc_size; - continue; - } - - _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count); - - ucode_ptr += mc_size; - } - - if (leftover) { - state = UCODE_ERROR; - goto out; - } - - if (mc_saved_count == 0) { - state = UCODE_NFOUND; - goto out; - } - - for (i = 0; i < mc_saved_count; i++) - mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; - - mc_saved_data->mc_saved_count = mc_saved_count; -out: - return state; -} - -static int collect_cpu_info_early(struct ucode_cpu_info *uci) -{ - unsigned int val[2]; - u8 x86, x86_model; - struct cpu_signature csig; - unsigned int eax, ebx, ecx, edx; - - csig.sig = 0; - csig.pf = 0; - csig.rev = 0; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; - - x86 = get_x86_family(csig.sig); - x86_model = get_x86_model(csig.sig); - - if ((x86_model >= 5) || (x86 > 6)) { - /* get processor flags from MSR 0x17 */ - native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); - } - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - - csig.rev = val[1]; - - uci->cpu_sig = csig; - uci->valid = 1; - - return 0; -} - -#ifdef DEBUG -static void __ref show_saved_mc(void) -{ - int i, j; - unsigned int sig, pf, rev, total_size, data_size, date; - struct ucode_cpu_info uci; - - if (mc_saved_data.mc_saved_count == 0) { - pr_debug("no micorcode data saved.\n"); - return; - } - pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); - - collect_cpu_info_early(&uci); - - sig = uci.cpu_sig.sig; - pf = uci.cpu_sig.pf; - rev = uci.cpu_sig.rev; - pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n", - smp_processor_id(), sig, pf, rev); - - for (i = 0; i < mc_saved_data.mc_saved_count; i++) { - struct microcode_header_intel *mc_saved_header; - struct extended_sigtable *ext_header; - int ext_sigcount; - struct extended_signature *ext_sig; - - mc_saved_header = (struct microcode_header_intel *) - mc_saved_data.mc_saved[i]; - sig = mc_saved_header->sig; - pf = mc_saved_header->pf; - rev = mc_saved_header->rev; - total_size = get_totalsize(mc_saved_header); - data_size = get_datasize(mc_saved_header); - date = mc_saved_header->date; - - pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n", - i, sig, pf, rev, total_size, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); - - /* Look for ext. headers: */ - if (total_size <= data_size + MC_HEADER_SIZE) - continue; - - ext_header = (struct extended_sigtable *) - mc_saved_header + data_size + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - - for (j = 0; j < ext_sigcount; j++) { - sig = ext_sig->sig; - pf = ext_sig->pf; - - pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", - j, sig, pf); - - ext_sig++; - } - - } -} -#else -static inline void show_saved_mc(void) -{ -} -#endif - -#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) -static DEFINE_MUTEX(x86_cpu_microcode_mutex); -/* - * Save this mc into mc_saved_data. So it will be loaded early when a CPU is - * hot added or resumes. - * - * Please make sure this mc should be a valid microcode patch before calling - * this function. - */ -int save_mc_for_early(u8 *mc) -{ - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count_init; - unsigned int mc_saved_count; - struct microcode_intel **mc_saved; - int ret = 0; - int i; - - /* - * Hold hotplug lock so mc_saved_data is not accessed by a CPU in - * hotplug. - */ - mutex_lock(&x86_cpu_microcode_mutex); - - mc_saved_count_init = mc_saved_data.mc_saved_count; - mc_saved_count = mc_saved_data.mc_saved_count; - mc_saved = mc_saved_data.mc_saved; - - if (mc_saved && mc_saved_count) - memcpy(mc_saved_tmp, mc_saved, - mc_saved_count * sizeof(struct mirocode_intel *)); - /* - * Save the microcode patch mc in mc_save_tmp structure if it's a newer - * version. - */ - - _save_mc(mc_saved_tmp, mc, &mc_saved_count); - - /* - * Save the mc_save_tmp in global mc_saved_data. - */ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); - if (ret) { - pr_err("Cannot save microcode patch.\n"); - goto out; - } - - show_saved_mc(); - - /* - * Free old saved microcod data. - */ - if (mc_saved) { - for (i = 0; i < mc_saved_count_init; i++) - kfree(mc_saved[i]); - kfree(mc_saved); - } - -out: - mutex_unlock(&x86_cpu_microcode_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(save_mc_for_early); -#endif - -static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; -static __init enum ucode_state -scan_microcode(unsigned long start, unsigned long end, - struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, - struct ucode_cpu_info *uci) -{ - unsigned int size = end - start + 1; - struct cpio_data cd; - long offset = 0; -#ifdef CONFIG_X86_32 - char *p = (char *)__pa_nodebug(ucode_name); -#else - char *p = ucode_name; -#endif - - cd.data = NULL; - cd.size = 0; - - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) - return UCODE_ERROR; - - - return get_matching_model_microcode(0, start, cd.data, cd.size, - mc_saved_data, mc_saved_in_initrd, - uci); -} - -/* - * Print ucode update info. - */ -static void -print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) -{ - int cpu = smp_processor_id(); - - pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", - cpu, - uci->cpu_sig.rev, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); -} - -#ifdef CONFIG_X86_32 - -static int delay_ucode_info; -static int current_mc_date; - -/* - * Print early updated ucode info after printk works. This is delayed info dump. - */ -void show_ucode_info_early(void) -{ - struct ucode_cpu_info uci; - - if (delay_ucode_info) { - collect_cpu_info_early(&uci); - print_ucode_info(&uci, current_mc_date); - delay_ucode_info = 0; - } -} - -/* - * At this point, we can not call printk() yet. Keep microcode patch number in - * mc_saved_data.mc_saved and delay printing microcode info in - * show_ucode_info_early() until printk() works. - */ -static void print_ucode(struct ucode_cpu_info *uci) -{ - struct microcode_intel *mc_intel; - int *delay_ucode_info_p; - int *current_mc_date_p; - - mc_intel = uci->mc; - if (mc_intel == NULL) - return; - - delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); - current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); - - *delay_ucode_info_p = 1; - *current_mc_date_p = mc_intel->hdr.date; -} -#else - -/* - * Flush global tlb. We only do this in x86_64 where paging has been enabled - * already and PGE should be enabled as well. - */ -static inline void flush_tlb_early(void) -{ - __native_flush_tlb_global_irq_disabled(); -} - -static inline void print_ucode(struct ucode_cpu_info *uci) -{ - struct microcode_intel *mc_intel; - - mc_intel = uci->mc; - if (mc_intel == NULL) - return; - - print_ucode_info(uci, mc_intel->hdr.date); -} -#endif - -static int apply_microcode_early(struct mc_saved_data *mc_saved_data, - struct ucode_cpu_info *uci) -{ - struct microcode_intel *mc_intel; - unsigned int val[2]; - - mc_intel = uci->mc; - if (mc_intel == NULL) - return 0; - - /* write microcode via MSR 0x79 */ - native_wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) - return -1; - -#ifdef CONFIG_X86_64 - /* Flush global tlb. This is precaution. */ - flush_tlb_early(); -#endif - uci->cpu_sig.rev = val[1]; - - print_ucode(uci); - - return 0; -} - -/* - * This function converts microcode patch offsets previously stored in - * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. - */ -int __init save_microcode_in_initrd_intel(void) -{ - unsigned int count = mc_saved_data.mc_saved_count; - struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; - int ret = 0; - - if (count == 0) - return ret; - - microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count); - ret = save_microcode(&mc_saved_data, mc_saved, count); - if (ret) - pr_err("Cannot save microcode patches from initrd.\n"); - - show_saved_mc(); - - return ret; -} - -static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, - unsigned long initrd_start_early, - unsigned long initrd_end_early, - struct ucode_cpu_info *uci) -{ - collect_cpu_info_early(uci); - scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data, - mc_saved_in_initrd, uci); - load_microcode(mc_saved_data, mc_saved_in_initrd, - initrd_start_early, uci); - apply_microcode_early(mc_saved_data, uci); -} - -void __init -load_ucode_intel_bsp(void) -{ - u64 ramdisk_image, ramdisk_size; - unsigned long initrd_start_early, initrd_end_early; - struct ucode_cpu_info uci; -#ifdef CONFIG_X86_32 - struct boot_params *boot_params_p; - - boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params); - ramdisk_image = boot_params_p->hdr.ramdisk_image; - ramdisk_size = boot_params_p->hdr.ramdisk_size; - initrd_start_early = ramdisk_image; - initrd_end_early = initrd_start_early + ramdisk_size; - - _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), - initrd_start_early, initrd_end_early, &uci); -#else - ramdisk_image = boot_params.hdr.ramdisk_image; - ramdisk_size = boot_params.hdr.ramdisk_size; - initrd_start_early = ramdisk_image + PAGE_OFFSET; - initrd_end_early = initrd_start_early + ramdisk_size; - - _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, - initrd_start_early, initrd_end_early, &uci); -#endif -} - -void load_ucode_intel_ap(void) -{ - struct mc_saved_data *mc_saved_data_p; - struct ucode_cpu_info uci; - unsigned long *mc_saved_in_initrd_p; - unsigned long initrd_start_addr; -#ifdef CONFIG_X86_32 - unsigned long *initrd_start_p; - - mc_saved_in_initrd_p = - (unsigned long *)__pa_nodebug(mc_saved_in_initrd); - mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); - initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); -#else - mc_saved_data_p = &mc_saved_data; - mc_saved_in_initrd_p = mc_saved_in_initrd; - initrd_start_addr = initrd_start; -#endif - - /* - * If there is no valid ucode previously saved in memory, no need to - * update ucode on this AP. - */ - if (mc_saved_data_p->mc_saved_count == 0) - return; - - collect_cpu_info_early(&uci); - load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, - initrd_start_addr, &uci); - apply_microcode_early(mc_saved_data_p, &uci); -} diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/microcode_intel_lib.c deleted file mode 100644 index ce69320..0000000 --- a/arch/x86/kernel/microcode_intel_lib.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Intel CPU Microcode Update Driver for Linux - * - * Copyright (C) 2012 Fenghua Yu - * H Peter Anvin" - * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/Assets/PDF/manual/253668.pdf - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ -#include -#include -#include -#include - -#include -#include -#include - -static inline int -update_match_cpu(unsigned int csig, unsigned int cpf, - unsigned int sig, unsigned int pf) -{ - return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; -} - -int -update_match_revision(struct microcode_header_intel *mc_header, int rev) -{ - return (mc_header->rev <= rev) ? 0 : 1; -} - -int microcode_sanity_check(void *mc, int print_err) -{ - unsigned long total_size, data_size, ext_table_size; - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - int sum, orig_sum, ext_sigcount = 0, i; - struct extended_signature *ext_sig; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - - if (data_size + MC_HEADER_SIZE > total_size) { - if (print_err) - pr_err("error! Bad data size in microcode data file\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - if (print_err) - pr_err("error! Unknown microcode update format\n"); - return -EINVAL; - } - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - if (print_err) - pr_err("error! Small exttable size in microcode data file\n"); - return -EINVAL; - } - ext_header = mc + MC_HEADER_SIZE + data_size; - if (ext_table_size != exttable_size(ext_header)) { - if (print_err) - pr_err("error! Bad exttable size in microcode data file\n"); - return -EFAULT; - } - ext_sigcount = ext_header->count; - } - - /* check extended table checksum */ - if (ext_table_size) { - int ext_table_sum = 0; - int *ext_tablep = (int *)ext_header; - - i = ext_table_size / DWSIZE; - while (i--) - ext_table_sum += ext_tablep[i]; - if (ext_table_sum) { - if (print_err) - pr_warn("aborting, bad extended signature table checksum\n"); - return -EINVAL; - } - } - - /* calculate the checksum */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / DWSIZE; - while (i--) - orig_sum += ((int *)mc)[i]; - if (orig_sum) { - if (print_err) - pr_err("aborting, bad checksum\n"); - return -EINVAL; - } - if (!ext_table_size) - return 0; - /* check extended signature checksum */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - sum = orig_sum - - (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - if (print_err) - pr_err("aborting, bad checksum\n"); - return -EINVAL; - } - } - return 0; -} -EXPORT_SYMBOL_GPL(microcode_sanity_check); - -/* - * return 0 - no update found - * return 1 - found update - */ -int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) -{ - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - int ext_sigcount, i; - struct extended_signature *ext_sig; - - if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf)) - return 1; - - /* Look for ext. headers: */ - if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) - return 0; - - ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - - for (i = 0; i < ext_sigcount; i++) { - if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf)) - return 1; - ext_sig++; - } - return 0; -} - -/* - * return 0 - no update found - * return 1 - found update - */ -int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev) -{ - struct microcode_header_intel *mc_header = mc; - - if (!update_match_revision(mc_header, rev)) - return 0; - - return get_matching_sig(csig, cpf, mc, rev); -} -EXPORT_SYMBOL_GPL(get_matching_microcode); -- cgit v1.1 From 60283df7ac26a4fe2d56631ca2946e04725e7eaf Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 14 Jan 2014 08:44:47 +0100 Subject: x86/apic: Read Error Status Register correctly Currently we do a read, a dummy write and a final read to fetch the error code. The value from the final read is taken. This is not the recommended way and leads to corrupted/lost ESR values. Intel(c) 64 and IA-32 Architectures Software Developer's Manual, Combined Volumes 1, 2ABC, 3ABC, Section 10.5.3 states: Before attempt to read from the ESR, software should first write to it. (The value written does not affect the values read subsequently; only zero may be written in x2APIC mode.) This write clears any previously logged errors and updates the ESR with any errors detected since the last write to the ESR. This write also rearms the APIC error interrupt triggering mechanism. This patch removes the first read such that we are conform with the manual. On my (very old) Pentium MMX SMP system this patch fixes the issue that APIC errors: a) are not always reported and b) are reported with false error numbers. Signed-off-by: Richard Weinberger Cc: seiji.aguchi@hds.com Cc: rientjes@google.com Cc: konrad.wilk@oracle.com Cc: bp@alien8.de Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1389685487-20872-1-git-send-email-richard@nod.at Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d278736..4ec1dd6 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1968,7 +1968,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs) */ static inline void __smp_error_interrupt(struct pt_regs *regs) { - u32 v0, v1; + u32 v; u32 i = 0; static const char * const error_interrupt_reason[] = { "Send CS error", /* APIC Error Bit 0 */ @@ -1982,21 +1982,20 @@ static inline void __smp_error_interrupt(struct pt_regs *regs) }; /* First tickle the hardware, only then report what went on. -- REW */ - v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); + v = apic_read(APIC_ESR); ack_APIC_irq(); atomic_inc(&irq_err_count); - apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)", - smp_processor_id(), v0 , v1); + apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x", + smp_processor_id(), v); - v1 = v1 & 0xff; - while (v1) { - if (v1 & 0x1) + v &= 0xff; + while (v) { + if (v & 0x1) apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); i++; - v1 >>= 1; + v >>= 1; } apic_printk(APIC_DEBUG, KERN_CONT "\n"); -- cgit v1.1 From 19259943f0954dcd1817f94776376bf51c6a46d5 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 7 Dec 2013 21:02:36 +0800 Subject: x86, kaslr: Remove unused including Remove including that don't need it. Signed-off-by: Wei Yongjun Link: http://lkml.kernel.org/r/CAPgLHd-Fjx1RybjWFAu1vHRfTvhWwMLL3x46BouC5uNxHPjy1A@mail.gmail.com Acked-by: Kees Cook Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/aslr.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 84be175..90a21f4 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -10,7 +10,6 @@ #include #include #include -#include /* Simplified build-specific string for starting entropy. */ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" -- cgit v1.1 From da2b6fb990cf782b18952f534ec7323453bc4fc9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Dec 2013 12:27:45 -0800 Subject: x86, kaslr: Clarify RANDOMIZE_BASE_MAX_OFFSET The help text for RANDOMIZE_BASE_MAX_OFFSET was confusing. This has been clarified, and updated to be an export-only tunable. Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/20131210202745.GA2961@www.outflux.net Acked-by: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 596cd9e..5c9e19d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1747,26 +1747,33 @@ config RANDOMIZE_BASE possible. At best, due to page table layouts, 64-bit can use 9 bits of entropy and 32-bit uses 8 bits. + If unsure, say N. + config RANDOMIZE_BASE_MAX_OFFSET - hex "Maximum ASLR offset allowed" + hex "Maximum kASLR offset allowed" if EXPERT depends on RANDOMIZE_BASE range 0x0 0x20000000 if X86_32 default "0x20000000" if X86_32 range 0x0 0x40000000 if X86_64 default "0x40000000" if X86_64 ---help--- - Determines the maximal offset in bytes that will be applied to the - kernel when Address Space Layout Randomization (ASLR) is active. - Must be less than or equal to the actual physical memory on the - system. This must be a multiple of CONFIG_PHYSICAL_ALIGN. + The lesser of RANDOMIZE_BASE_MAX_OFFSET and available physical + memory is used to determine the maximal offset in bytes that will + be applied to the kernel when kernel Address Space Layout + Randomization (kASLR) is active. This must be a multiple of + PHYSICAL_ALIGN. + + On 32-bit this is limited to 512MiB by page table layouts. The + default is 512MiB. - On 32-bit this is limited to 512MiB. + On 64-bit this is limited by how the kernel fixmap page table is + positioned, so this cannot be larger than 1GiB currently. Without + RANDOMIZE_BASE, there is a 512MiB to 1.5GiB split between kernel + and modules. When RANDOMIZE_BASE_MAX_OFFSET is above 512MiB, the + modules area will shrink to compensate, up to the current maximum + 1GiB to 1GiB split. The default is 1GiB. - On 64-bit this is limited by how the kernel fixmap page table is - positioned, so this cannot be larger that 1GiB currently. Normally - there is a 512MiB to 1.5GiB split between kernel and modules. When - this is raised above the 512MiB default, the modules area will - shrink to compensate, up to the current maximum 1GiB to 1GiB split. + If unsure, leave at the default value. # Relocation on x86 needs some additional build support config X86_NEED_RELOCS -- cgit v1.1 From d139336700a5f3a560da235e4dfcd286773025d4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 15 Jan 2014 12:52:15 +0100 Subject: x86, cpu, amd: Fix a shadowed variable situation Having u32 and struct cpuinfo_x86 * by the same name is not very smart, although it was ok in this case due to the limited scope of u32 c and it being used only once in there. Fix this. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1389786735-16751-1-git-send-email-bp@alien8.de Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 39bc78d..e5647ab 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -789,14 +789,10 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) } /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ - if (!((eax >> 16) & mask)) { - u32 a, b, c, d; - - cpuid(0x80000005, &a, &b, &c, &d); - tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff; - } else { + if (!((eax >> 16) & mask)) + tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + else tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; - } /* a 4M entry uses two 2M entries */ tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; -- cgit v1.1 From 0dce7cd67fd9055c4a2ff278f8af1431e646d346 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 15 Jan 2014 13:39:59 +0100 Subject: kvm: x86: fix apic_base enable check Commit e66d2ae7c67bd moved the assignment vcpu->arch.apic_base = value above a condition with (vcpu->arch.apic_base ^ value), causing that check to always fail. Use old_value, vcpu->arch.apic_base's old value, in the condition instead. Signed-off-by: Andrew Jones Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1673940..775702f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1355,7 +1355,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) vcpu->arch.apic_base = value; /* update jump label if enable bit changes */ - if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) { + if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) static_key_slow_dec_deferred(&apic_hw_disabled); else -- cgit v1.1 From 151e0c7de616310f95393d9306903900fcd8b277 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 15 Jan 2014 15:44:58 +0900 Subject: x86, apic, kexec: Add disable_cpu_apicid kernel parameter Add disable_cpu_apicid kernel parameter. To use this kernel parameter, specify an initial APIC ID of the corresponding CPU you want to disable. This is mostly used for the kdump 2nd kernel to disable BSP to wake up multiple CPUs without causing system reset or hang due to sending INIT from AP to BSP. Kdump users first figure out initial APIC ID of the BSP, CPU0 in the 1st kernel, for example from /proc/cpuinfo and then set up this kernel parameter for the 2nd kernel using the obtained APIC ID. However, doing this procedure at each boot time manually is awkward, which should be automatically done by user-land service scripts, for example, kexec-tools on fedora/RHEL distributions. This design is more flexible than disabling BSP in kernel boot time automatically in that in kernel boot time we have no choice but referring to ACPI/MP table to obtain initial APIC ID for BSP, meaning that the method is not applicable to the systems without such BIOS tables. One assumption behind this design is that users get initial APIC ID of the BSP in still healthy state and so BSP is uniquely kept in CPU0. Thus, through the kernel parameter, only one initial APIC ID can be specified. In a comparison with disabled_cpu_apicid, we use read_apic_id(), not boot_cpu_physical_apicid, because on some platforms, the variable is modified to the apicid reported as BSP through MP table and this function is executed with the temporarily modified boot_cpu_physical_apicid. As a result, disabled_cpu_apicid kernel parameter doesn't work well for apicids of APs. Fixing the wrong handling of boot_cpu_physical_apicid requires some reviews and tests beyond some platforms and it could take some time. The fix here is a kind of workaround to focus on the main topic of this patch. Signed-off-by: HATAYAMA Daisuke Link: http://lkml.kernel.org/r/20140115064458.1545.38775.stgit@localhost6.localdomain6 Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4ec1dd6..e78ab8c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -75,6 +75,13 @@ unsigned int max_physical_apicid; physid_mask_t phys_cpu_present_map; /* + * Processor to be disabled specified by kernel parameter + * disable_cpu_apicid=, mostly used for the kdump 2nd kernel to + * avoid undefined behaviour caused by sending INIT from AP to BSP. + */ +unsigned int disabled_cpu_apicid = BAD_APICID; + +/* * Map cpu index to physical APIC ID */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); @@ -2114,6 +2121,39 @@ int generic_processor_info(int apicid, int version) phys_cpu_present_map); /* + * boot_cpu_physical_apicid is designed to have the apicid + * returned by read_apic_id(), i.e, the apicid of the + * currently booting-up processor. However, on some platforms, + * it is temporarilly modified by the apicid reported as BSP + * through MP table. Concretely: + * + * - arch/x86/kernel/mpparse.c: MP_processor_info() + * - arch/x86/mm/amdtopology.c: amd_numa_init() + * - arch/x86/platform/visws/visws_quirks.c: MP_processor_info() + * + * This function is executed with the modified + * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel + * parameter doesn't work to disable APs on kdump 2nd kernel. + * + * Since fixing handling of boot_cpu_physical_apicid requires + * another discussion and tests on each platform, we leave it + * for now and here we use read_apic_id() directly in this + * function, generic_processor_info(). + */ + if (disabled_cpu_apicid != BAD_APICID && + disabled_cpu_apicid != read_apic_id() && + disabled_cpu_apicid == apicid) { + int thiscpu = num_processors + disabled_cpus; + + pr_warning("ACPI: Disabling requested cpu." + " Processor %d/0x%x ignored.\n", + thiscpu, apicid); + + disabled_cpus++; + return -ENODEV; + } + + /* * If boot cpu has not been detected yet, then only allow upto * nr_cpu_ids - 1 processors and keep one slot free for boot cpu */ @@ -2591,3 +2631,12 @@ static int __init lapic_insert_resource(void) * that is using request_resource */ late_initcall(lapic_insert_resource); + +static int __init apic_set_disabled_cpu_apicid(char *arg) +{ + if (!arg || !get_option(&arg, &disabled_cpu_apicid)) + return -EINVAL; + + return 0; +} +early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); -- cgit v1.1 From 5b4d1dbc24bb6fd7179ada0f47be34e27e64decb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 15 Jan 2014 13:02:08 -0800 Subject: x86, apic: Make disabled_cpu_apicid static read_mostly, fix typos Make disabled_cpu_apicid static and read_mostly, and fix a couple of typos. Reported-by: Ingo Molnar Link: http://lkml.kernel.org/r/20140115182511.GA22737@gmail.com Signed-off-by: H. Peter Anvin Cc: HATAYAMA Daisuke --- arch/x86/kernel/apic/apic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e78ab8c..7f26c9a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -79,7 +79,7 @@ physid_mask_t phys_cpu_present_map; * disable_cpu_apicid=, mostly used for the kdump 2nd kernel to * avoid undefined behaviour caused by sending INIT from AP to BSP. */ -unsigned int disabled_cpu_apicid = BAD_APICID; +static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID; /* * Map cpu index to physical APIC ID @@ -2124,7 +2124,7 @@ int generic_processor_info(int apicid, int version) * boot_cpu_physical_apicid is designed to have the apicid * returned by read_apic_id(), i.e, the apicid of the * currently booting-up processor. However, on some platforms, - * it is temporarilly modified by the apicid reported as BSP + * it is temporarily modified by the apicid reported as BSP * through MP table. Concretely: * * - arch/x86/kernel/mpparse.c: MP_processor_info() @@ -2145,7 +2145,7 @@ int generic_processor_info(int apicid, int version) disabled_cpu_apicid == apicid) { int thiscpu = num_processors + disabled_cpus; - pr_warning("ACPI: Disabling requested cpu." + pr_warning("APIC: Disabling requested cpu." " Processor %d/0x%x ignored.\n", thiscpu, apicid); -- cgit v1.1 From ecd6910db979bc40ac19f0e71e027132fc906068 Mon Sep 17 00:00:00 2001 From: David Cohen Date: Mon, 16 Dec 2013 12:07:36 -0800 Subject: x86, intel-mid: Move Medfield code out of intel-mid.c core file In order make the driver more portable and support other Intel MID (Mobile Internet Device) platforms we need to move Medfield code from intel-mid.c core to its own mfld.c file. This patch contains no functional changes. Signed-off-by: David Cohen Link: http://lkml.kernel.org/r/1387224459-25746-2-git-send-email-david.a.cohen@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: H. Peter Anvin --- arch/x86/platform/intel-mid/Makefile | 4 +- arch/x86/platform/intel-mid/intel-mid.c | 37 +--------------- arch/x86/platform/intel-mid/intel_mid_weak_decls.h | 15 +++++++ arch/x86/platform/intel-mid/mfld.c | 51 ++++++++++++++++++++++ 4 files changed, 70 insertions(+), 37 deletions(-) create mode 100644 arch/x86/platform/intel-mid/intel_mid_weak_decls.h create mode 100644 arch/x86/platform/intel-mid/mfld.c (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile index 01cc29e..78a14ba 100644 --- a/arch/x86/platform/intel-mid/Makefile +++ b/arch/x86/platform/intel-mid/Makefile @@ -1,6 +1,6 @@ -obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o -obj-$(CONFIG_X86_INTEL_MID) += intel_mid_vrtc.o +obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o + # SFI specific code ifdef CONFIG_X86_INTEL_MID obj-$(CONFIG_SFI) += sfi.o device_libs/ diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index f90e290..527d6d5 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -35,6 +35,8 @@ #include #include +#include "intel_mid_weak_decls.h" + /* * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, * cmdline option x86_intel_mid_timer can be used to override the configuration @@ -61,46 +63,11 @@ enum intel_mid_timer_options intel_mid_timer_options; enum intel_mid_cpu_type __intel_mid_cpu_chip; EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip); -static void intel_mid_power_off(void) -{ -} - static void intel_mid_reboot(void) { intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0); } -static unsigned long __init intel_mid_calibrate_tsc(void) -{ - unsigned long fast_calibrate; - u32 lo, hi, ratio, fsb; - - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); - ratio = (hi >> 8) & 0x1f; - pr_debug("ratio is %d\n", ratio); - if (!ratio) { - pr_err("read a zero ratio, should be incorrect!\n"); - pr_err("force tsc ratio to 16 ...\n"); - ratio = 16; - } - rdmsr(MSR_FSB_FREQ, lo, hi); - if ((lo & 0x7) == 0x7) - fsb = PENWELL_FSB_FREQ_83SKU; - else - fsb = PENWELL_FSB_FREQ_100SKU; - fast_calibrate = ratio * fsb; - pr_debug("read penwell tsc %lu khz\n", fast_calibrate); - lapic_timer_frequency = fsb * 1000 / HZ; - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - - if (fast_calibrate) - return fast_calibrate; - - return 0; -} - static void __init intel_mid_time_init(void) { sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h new file mode 100644 index 0000000..519beb7 --- /dev/null +++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h @@ -0,0 +1,15 @@ +/* + * intel_mid_weak_decls.h: Weak declarations of intel-mid.c + * + * (C) Copyright 2013 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + + +/* __attribute__((weak)) makes these declarations overridable */ +extern void intel_mid_power_off(void) __attribute__((weak)); +extern unsigned long __init intel_mid_calibrate_tsc(void) __attribute__((weak)); diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c new file mode 100644 index 0000000..c7ff83c --- /dev/null +++ b/arch/x86/platform/intel-mid/mfld.c @@ -0,0 +1,51 @@ +/* + * mfld.c: Intel Medfield platform setup code + * + * (C) Copyright 2013 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include + +#include +#include +#include + +void intel_mid_power_off(void) +{ +} + +unsigned long __init intel_mid_calibrate_tsc(void) +{ + unsigned long fast_calibrate; + u32 lo, hi, ratio, fsb; + + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); + ratio = (hi >> 8) & 0x1f; + pr_debug("ratio is %d\n", ratio); + if (!ratio) { + pr_err("read a zero ratio, should be incorrect!\n"); + pr_err("force tsc ratio to 16 ...\n"); + ratio = 16; + } + rdmsr(MSR_FSB_FREQ, lo, hi); + if ((lo & 0x7) == 0x7) + fsb = PENWELL_FSB_FREQ_83SKU; + else + fsb = PENWELL_FSB_FREQ_100SKU; + fast_calibrate = ratio * fsb; + pr_debug("read penwell tsc %lu khz\n", fast_calibrate); + lapic_timer_frequency = fsb * 1000 / HZ; + /* mark tsc clocksource as reliable */ + set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + + if (fast_calibrate) + return fast_calibrate; + + return 0; +} -- cgit v1.1 From 85611e3febe78955a519f5f9eb47b941525c8c76 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Mon, 16 Dec 2013 12:07:37 -0800 Subject: x86, intel-mid: Add Clovertrail platform support This patch adds Clovertrail support on intel-mid and makes it more flexible to support other SoCs. Signed-off-by: David Cohen Link: http://lkml.kernel.org/r/1387224459-25746-3-git-send-email-david.a.cohen@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Fei Yang Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/intel-mid.h | 46 +++++++++++++++++++++- arch/x86/platform/intel-mid/intel-mid.c | 39 ++++++++++++++++-- arch/x86/platform/intel-mid/intel_mid_weak_decls.h | 7 +++- arch/x86/platform/intel-mid/mfld.c | 32 +++++++++++++-- 4 files changed, 113 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index 459769d..f8a8314 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -51,10 +51,39 @@ struct devs_id { enum intel_mid_cpu_type { /* 1 was Moorestown */ INTEL_MID_CPU_CHIP_PENWELL = 2, + INTEL_MID_CPU_CHIP_CLOVERVIEW, }; extern enum intel_mid_cpu_type __intel_mid_cpu_chip; +/** + * struct intel_mid_ops - Interface between intel-mid & sub archs + * @arch_setup: arch_setup function to re-initialize platform + * structures (x86_init, x86_platform_init) + * + * This structure can be extended if any new interface is required + * between intel-mid & its sub arch files. + */ +struct intel_mid_ops { + void (*arch_setup)(void); +}; + +/* Helper API's for INTEL_MID_OPS_INIT */ +#define DECLARE_INTEL_MID_OPS_INIT(cpuname, cpuid) \ + [cpuid] = get_##cpuname##_ops + +/* Maximum number of CPU ops */ +#define MAX_CPU_OPS(a) (sizeof(a)/sizeof(void *)) + +/* + * For every new cpu addition, a weak get__ops() function needs be + * declared in arch/x86/platform/intel_mid/intel_mid_weak_decls.h. + */ +#define INTEL_MID_OPS_INIT {\ + DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \ + DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \ +}; + #ifdef CONFIG_X86_INTEL_MID static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void) @@ -86,8 +115,21 @@ extern enum intel_mid_timer_options intel_mid_timer_options; * Penwell uses spread spectrum clock, so the freq number is not exactly * the same as reported by MSR based on SDM. */ -#define PENWELL_FSB_FREQ_83SKU 83200 -#define PENWELL_FSB_FREQ_100SKU 99840 +#define FSB_FREQ_83SKU 83200 +#define FSB_FREQ_100SKU 99840 +#define FSB_FREQ_133SKU 133000 + +#define FSB_FREQ_167SKU 167000 +#define FSB_FREQ_200SKU 200000 +#define FSB_FREQ_267SKU 267000 +#define FSB_FREQ_333SKU 333000 +#define FSB_FREQ_400SKU 400000 + +/* Bus Select SoC Fuse value */ +#define BSEL_SOC_FUSE_MASK 0x7 +#define BSEL_SOC_FUSE_001 0x1 /* FSB 133MHz */ +#define BSEL_SOC_FUSE_101 0x5 /* FSB 100MHz */ +#define BSEL_SOC_FUSE_111 0x7 /* FSB 83MHz */ #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 527d6d5..4095584 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -60,14 +60,27 @@ enum intel_mid_timer_options intel_mid_timer_options; +/* intel_mid_ops to store sub arch ops */ +struct intel_mid_ops *intel_mid_ops; +/* getter function for sub arch ops*/ +static void *(*get_intel_mid_ops[])(void) = INTEL_MID_OPS_INIT; enum intel_mid_cpu_type __intel_mid_cpu_chip; EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip); +static void intel_mid_power_off(void) +{ +}; + static void intel_mid_reboot(void) { intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0); } +static unsigned long __init intel_mid_calibrate_tsc(void) +{ + return 0; +} + static void __init intel_mid_time_init(void) { sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); @@ -92,13 +105,33 @@ static void __init intel_mid_time_init(void) static void intel_mid_arch_setup(void) { - if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) - __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL; - else { + if (boot_cpu_data.x86 != 6) { pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n", boot_cpu_data.x86, boot_cpu_data.x86_model); __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL; + goto out; + } + + switch (boot_cpu_data.x86_model) { + case 0x35: + __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_CLOVERVIEW; + break; + case 0x27: + default: + __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL; + break; } + + if (__intel_mid_cpu_chip < MAX_CPU_OPS(get_intel_mid_ops)) + intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip](); + else { + intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL](); + pr_info("ARCH: Uknown SoC, assuming PENWELL!\n"); + } + +out: + if (intel_mid_ops->arch_setup) + intel_mid_ops->arch_setup(); } /* MID systems don't have i8042 controller */ diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h index 519beb7..9ebce044 100644 --- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h +++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h @@ -11,5 +11,8 @@ /* __attribute__((weak)) makes these declarations overridable */ -extern void intel_mid_power_off(void) __attribute__((weak)); -extern unsigned long __init intel_mid_calibrate_tsc(void) __attribute__((weak)); +/* For every CPU addition a new get__ops interface needs + * to be added. + */ +extern void * __cpuinit get_penwell_ops(void) __attribute__((weak)); +extern void * __cpuinit get_cloverview_ops(void) __attribute__((weak)); diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index c7ff83c..4f7884e 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -15,11 +15,19 @@ #include #include -void intel_mid_power_off(void) +#include "intel_mid_weak_decls.h" + +static void penwell_arch_setup(void); +/* penwell arch ops */ +static struct intel_mid_ops penwell_ops = { + .arch_setup = penwell_arch_setup, +}; + +static void mfld_power_off(void) { } -unsigned long __init intel_mid_calibrate_tsc(void) +static unsigned long __init mfld_calibrate_tsc(void) { unsigned long fast_calibrate; u32 lo, hi, ratio, fsb; @@ -35,9 +43,9 @@ unsigned long __init intel_mid_calibrate_tsc(void) } rdmsr(MSR_FSB_FREQ, lo, hi); if ((lo & 0x7) == 0x7) - fsb = PENWELL_FSB_FREQ_83SKU; + fsb = FSB_FREQ_83SKU; else - fsb = PENWELL_FSB_FREQ_100SKU; + fsb = FSB_FREQ_100SKU; fast_calibrate = ratio * fsb; pr_debug("read penwell tsc %lu khz\n", fast_calibrate); lapic_timer_frequency = fsb * 1000 / HZ; @@ -49,3 +57,19 @@ unsigned long __init intel_mid_calibrate_tsc(void) return 0; } + +static void __init penwell_arch_setup() +{ + x86_platform.calibrate_tsc = mfld_calibrate_tsc; + pm_power_off = mfld_power_off; +} + +void * __cpuinit get_penwell_ops() +{ + return &penwell_ops; +} + +void * __cpuinit get_cloverview_ops() +{ + return &penwell_ops; +} -- cgit v1.1 From bc20aa48bbb3068224a1c91f8332971fdb689fad Mon Sep 17 00:00:00 2001 From: David Cohen Date: Mon, 16 Dec 2013 12:07:38 -0800 Subject: x86, intel-mid: Add Merrifield platform support This code was partially based on Mark Brown's previous work. Signed-off-by: David Cohen Link: http://lkml.kernel.org/r/1387224459-25746-4-git-send-email-david.a.cohen@linux.intel.com Signed-off-by: Fei Yang Cc: Mark F. Brown Cc: Kuppuswamy Sathyanarayanan Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/intel-mid.h | 2 + arch/x86/pci/intel_mid_pci.c | 6 +- arch/x86/platform/intel-mid/Makefile | 2 +- arch/x86/platform/intel-mid/intel-mid.c | 4 + arch/x86/platform/intel-mid/intel_mid_weak_decls.h | 1 + arch/x86/platform/intel-mid/mrfl.c | 103 +++++++++++++++++++++ arch/x86/platform/intel-mid/sfi.c | 34 +++++-- 7 files changed, 144 insertions(+), 8 deletions(-) create mode 100644 arch/x86/platform/intel-mid/mrfl.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index f8a8314..e34e097 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -52,6 +52,7 @@ enum intel_mid_cpu_type { /* 1 was Moorestown */ INTEL_MID_CPU_CHIP_PENWELL = 2, INTEL_MID_CPU_CHIP_CLOVERVIEW, + INTEL_MID_CPU_CHIP_TANGIER, }; extern enum intel_mid_cpu_type __intel_mid_cpu_chip; @@ -82,6 +83,7 @@ struct intel_mid_ops { #define INTEL_MID_OPS_INIT {\ DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \ DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \ + DECLARE_INTEL_MID_OPS_INIT(tangier, INTEL_MID_CPU_CHIP_TANGIER) \ }; #ifdef CONFIG_X86_INTEL_MID diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 51384ca..84b9d67 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -31,6 +31,7 @@ #include #include #include +#include #define PCIE_CAP_OFFSET 0x100 @@ -219,7 +220,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) irq_attr.ioapic = mp_find_ioapic(dev->irq); irq_attr.ioapic_pin = dev->irq; irq_attr.trigger = 1; /* level */ - irq_attr.polarity = 1; /* active low */ + if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER) + irq_attr.polarity = 0; /* active high */ + else + irq_attr.polarity = 1; /* active low */ io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr); return 0; diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile index 78a14ba..0a8ee70 100644 --- a/arch/x86/platform/intel-mid/Makefile +++ b/arch/x86/platform/intel-mid/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o +obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfl.o obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o # SFI specific code diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 4095584..1bbedc4 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -116,6 +116,10 @@ static void intel_mid_arch_setup(void) case 0x35: __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_CLOVERVIEW; break; + case 0x3C: + case 0x4A: + __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_TANGIER; + break; case 0x27: default: __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL; diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h index 9ebce044..a537ffc 100644 --- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h +++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h @@ -16,3 +16,4 @@ */ extern void * __cpuinit get_penwell_ops(void) __attribute__((weak)); extern void * __cpuinit get_cloverview_ops(void) __attribute__((weak)); +extern void * __init get_tangier_ops(void) __attribute__((weak)); diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfl.c new file mode 100644 index 0000000..09d1015 --- /dev/null +++ b/arch/x86/platform/intel-mid/mrfl.c @@ -0,0 +1,103 @@ +/* + * mrfl.c: Intel Merrifield platform specific setup code + * + * (C) Copyright 2013 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include + +#include +#include + +#include "intel_mid_weak_decls.h" + +static unsigned long __init tangier_calibrate_tsc(void) +{ + unsigned long fast_calibrate; + u32 lo, hi, ratio, fsb, bus_freq; + + /* *********************** */ + /* Compute TSC:Ratio * FSB */ + /* *********************** */ + + /* Compute Ratio */ + rdmsr(MSR_PLATFORM_INFO, lo, hi); + pr_debug("IA32 PLATFORM_INFO is 0x%x : %x\n", hi, lo); + + ratio = (lo >> 8) & 0xFF; + pr_debug("ratio is %d\n", ratio); + if (!ratio) { + pr_err("Read a zero ratio, force tsc ratio to 4 ...\n"); + ratio = 4; + } + + /* Compute FSB */ + rdmsr(MSR_FSB_FREQ, lo, hi); + pr_debug("Actual FSB frequency detected by SOC 0x%x : %x\n", + hi, lo); + + bus_freq = lo & 0x7; + pr_debug("bus_freq = 0x%x\n", bus_freq); + + if (bus_freq == 0) + fsb = FSB_FREQ_100SKU; + else if (bus_freq == 1) + fsb = FSB_FREQ_100SKU; + else if (bus_freq == 2) + fsb = FSB_FREQ_133SKU; + else if (bus_freq == 3) + fsb = FSB_FREQ_167SKU; + else if (bus_freq == 4) + fsb = FSB_FREQ_83SKU; + else if (bus_freq == 5) + fsb = FSB_FREQ_400SKU; + else if (bus_freq == 6) + fsb = FSB_FREQ_267SKU; + else if (bus_freq == 7) + fsb = FSB_FREQ_333SKU; + else { + BUG(); + pr_err("Invalid bus_freq! Setting to minimal value!\n"); + fsb = FSB_FREQ_100SKU; + } + + /* TSC = FSB Freq * Resolved HFM Ratio */ + fast_calibrate = ratio * fsb; + pr_debug("calculate tangier tsc %lu KHz\n", fast_calibrate); + + /* ************************************ */ + /* Calculate Local APIC Timer Frequency */ + /* ************************************ */ + lapic_timer_frequency = (fsb * 1000) / HZ; + + pr_debug("Setting lapic_timer_frequency = %d\n", + lapic_timer_frequency); + + /* mark tsc clocksource as reliable */ + set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + + if (fast_calibrate) + return fast_calibrate; + + return 0; +} + +static void __init tangier_arch_setup(void) +{ + x86_platform.calibrate_tsc = tangier_calibrate_tsc; +} + +/* tangier arch ops */ +static struct intel_mid_ops tangier_ops = { + .arch_setup = tangier_arch_setup, +}; + +void * __cpuinit get_tangier_ops() +{ + return &tangier_ops; +} diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index c84c1ca..80a5228 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -443,13 +443,35 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) * so we have to enable them one by one here */ ioapic = mp_find_ioapic(irq); - irq_attr.ioapic = ioapic; - irq_attr.ioapic_pin = irq; - irq_attr.trigger = 1; - irq_attr.polarity = 1; - io_apic_set_pci_routing(NULL, irq, &irq_attr); - } else + if (ioapic >= 0) { + irq_attr.ioapic = ioapic; + irq_attr.ioapic_pin = irq; + irq_attr.trigger = 1; + if (intel_mid_identify_cpu() == + INTEL_MID_CPU_CHIP_TANGIER) { + if (!strncmp(pentry->name, + "r69001-ts-i2c", 13)) + /* active low */ + irq_attr.polarity = 1; + else if (!strncmp(pentry->name, + "synaptics_3202", 14)) + /* active low */ + irq_attr.polarity = 1; + else if (irq == 41) + /* fast_int_1 */ + irq_attr.polarity = 1; + else + /* active high */ + irq_attr.polarity = 0; + } else { + /* PNW and CLV go with active low */ + irq_attr.polarity = 1; + } + io_apic_set_pci_routing(NULL, irq, &irq_attr); + } + } else { irq = 0; /* No irq */ + } dev = get_device_id(pentry->type, pentry->name); -- cgit v1.1 From 4cb9b00f42e07830310319a07e6c91413ee8153e Mon Sep 17 00:00:00 2001 From: David Cohen Date: Mon, 16 Dec 2013 17:37:26 -0800 Subject: x86, intel-mid: Remove deprecated X86_MDFLD and X86_WANT_INTEL_MID configs We want to support all Intel MID (Mobile Internet Device) platforms with a single config selection. This patch removes deprecated CONFIG_X86_MDFLD and X86_WANT_INTEL_MID options in favor of having CONFIG_X86_INTEL_MID only. Signed-off-by: David Cohen Link: http://lkml.kernel.org/r/1387244246-20714-1-git-send-email-david.a.cohen@linux.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0952ecd..edcf80e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -438,42 +438,26 @@ config X86_INTEL_CE This option compiles in support for the CE4100 SOC for settop boxes and media devices. -config X86_WANT_INTEL_MID +config X86_INTEL_MID bool "Intel MID platform support" depends on X86_32 depends on X86_EXTENDED_PLATFORM - ---help--- - Select to build a kernel capable of supporting Intel MID platform - systems which do not have the PCI legacy interfaces (Moorestown, - Medfield). If you are building for a PC class system say N here. - -if X86_WANT_INTEL_MID - -config X86_INTEL_MID - bool - -config X86_MDFLD - bool "Medfield MID platform" depends on PCI depends on PCI_GOANY depends on X86_IO_APIC - select X86_INTEL_MID select SFI + select I2C select DW_APB_TIMER select APB_TIMER - select I2C - select SPI select INTEL_SCU_IPC - select X86_PLATFORM_DEVICES select MFD_INTEL_MSIC ---help--- - Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin - Internet Device(MID) platform. - Unlike standard x86 PCs, Medfield does not have many legacy devices - nor standard legacy replacement devices/features. e.g. Medfield does - not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. + Select to build a kernel capable of supporting Intel MID (Mobile + Internet Device) platform systems which do not have the PCI legacy + interfaces. If you are building for a PC class system say N here. -endif + Intel MID platforms are based on an Intel processor and chipset which + consume less power than most of the x86 derivatives. config X86_INTEL_LPSS bool "Intel Low Power Subsystem Support" -- cgit v1.1 From aee636c4809fa54848ff07a899b326eb1f9987a2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Jan 2014 06:50:07 -0800 Subject: bpf: do not use reciprocal divide At first Jakub Zawadzki noticed that some divisions by reciprocal_divide were not correct. (off by one in some cases) http://www.wireshark.org/~darkjames/reciprocal-buggy.c He could also show this with BPF: http://www.wireshark.org/~darkjames/set-and-dump-filter-k-bug.c The reciprocal divide in linux kernel is not generic enough, lets remove its use in BPF, as it is not worth the pain with current cpus. Signed-off-by: Eric Dumazet Reported-by: Jakub Zawadzki Cc: Mircea Gherzan Cc: Daniel Borkmann Cc: Hannes Frederic Sowa Cc: Matt Evans Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David S. Miller Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 26328e8..4ed75dd 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -359,15 +359,21 @@ void bpf_jit_compile(struct sk_filter *fp) EMIT2(0x89, 0xd0); /* mov %edx,%eax */ break; case BPF_S_ALU_MOD_K: /* A %= K; */ + if (K == 1) { + CLEAR_A(); + break; + } EMIT2(0x31, 0xd2); /* xor %edx,%edx */ EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */ EMIT2(0xf7, 0xf1); /* div %ecx */ EMIT2(0x89, 0xd0); /* mov %edx,%eax */ break; - case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */ - EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */ - EMIT(K, 4); - EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */ + case BPF_S_ALU_DIV_K: /* A /= K */ + if (K == 1) + break; + EMIT2(0x31, 0xd2); /* xor %edx,%edx */ + EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */ + EMIT2(0xf7, 0xf1); /* div %ecx */ break; case BPF_S_ALU_AND_X: seen |= SEEN_XREG; -- cgit v1.1 From 7da7c1561366ba8adb7275464ab44e84e1faa7e0 Mon Sep 17 00:00:00 2001 From: Bin Gao Date: Mon, 21 Oct 2013 09:16:33 -0700 Subject: x86, tsc: Add static (MSR) TSC calibration on Intel Atom SoCs On SoCs that have the calibration MSRs available, either there is no PIT, HPET or PMTIMER to calibrate against, or the PIT/HPET/PMTIMER is driven from the same clock as the TSC, so calibration is redundant and just slows down the boot. TSC rate is caculated by this formula: * The ratio and the resolved frequency ID can be obtained from MSR. See Intel 64 and IA-32 System Programming Guid section 16.12 and 30.11.5 for details. Signed-off-by: Bin Gao Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-rgm7xmg7k6qnjlw3ynkcjsmh@git.kernel.org --- arch/x86/include/asm/tsc.h | 3 ++ arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/tsc.c | 10 ++++ arch/x86/kernel/tsc_msr.c | 125 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/tsc_msr.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 235be70..57ae63c 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -65,4 +65,7 @@ extern int notsc_setup(char *); extern void tsc_save_sched_clock_state(void); extern void tsc_restore_sched_clock_state(void); +/* MSR based TSC calibration for Intel Atom SoC platforms */ +int try_msr_calibrate_tsc(unsigned long *fast_calibrate); + #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index dbe9bd6..6dbbb1e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -32,7 +32,7 @@ obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o -obj-y += tsc.o io_delay.o rtc.o +obj-y += tsc.o tsc_msr.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 930e5d4..e574716 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -419,6 +419,16 @@ unsigned long native_calibrate_tsc(void) unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; + /* Calibrate TSC using MSR for Intel Atom SoCs */ + local_irq_save(flags); + i = try_msr_calibrate_tsc(&fast_calibrate); + local_irq_restore(flags); + if (i >= 0) { + if (i == 0) + pr_warn("Fast TSC calibration using MSR failed\n"); + return fast_calibrate; + } + local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c new file mode 100644 index 0000000..c502772 --- /dev/null +++ b/arch/x86/kernel/tsc_msr.c @@ -0,0 +1,125 @@ +/* + * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms. + * + * TSC in Intel Atom SoC runs at a constant rate which can be figured + * by this formula: + * * + * See Intel 64 and IA-32 System Programming Guid section 16.12 and 30.11.5 + * for details. + * Especially some Intel Atom SoCs don't have PIT(i8254) or HPET, so MSR + * based calibration is the only option. + * + * + * Copyright (C) 2013 Intel Corporation + * Author: Bin Gao + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include + +/* CPU reference clock frequency: in KHz */ +#define FREQ_83 83200 +#define FREQ_100 99840 +#define FREQ_133 133200 +#define FREQ_166 166400 + +#define MAX_NUM_FREQS 8 + +/* + * According to Intel 64 and IA-32 System Programming Guide, + * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be + * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40]. + * Unfortunately some Intel Atom SoCs aren't quite compliant to this, + * so we need manually differentiate SoC families. This is what the + * field msr_plat does. + */ +struct freq_desc { + u8 x86_family; /* CPU family */ + u8 x86_model; /* model */ + u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */ + u32 freqs[MAX_NUM_FREQS]; +}; + +static struct freq_desc freq_desc_tables[] = { + /* PNW */ + { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, + /* CLV+ */ + { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, + /* TNG */ + { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } }, + /* VLV2 */ + { 6, 0x37, 1, { 0, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } }, + /* ANN */ + { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } }, +}; + +static int match_cpu(u8 family, u8 model) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) { + if ((family == freq_desc_tables[i].x86_family) && + (model == freq_desc_tables[i].x86_model)) + return i; + } + + return -1; +} + +/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ +#define id_to_freq(cpu_index, freq_id) \ + (freq_desc_tables[cpu_index].freqs[freq_id]) + +/* + * Do MSR calibration only for known/supported CPUs. + * Return values: + * -1: CPU is unknown/unsupported for MSR based calibration + * 0: CPU is known/supported, but calibration failed + * 1: CPU is known/supported, and calibration succeeded + */ +int try_msr_calibrate_tsc(unsigned long *fast_calibrate) +{ + int cpu_index; + u32 lo, hi, ratio, freq_id, freq; + + cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); + if (cpu_index < 0) + return -1; + + *fast_calibrate = 0; + + if (freq_desc_tables[cpu_index].msr_plat) { + rdmsr(MSR_PLATFORM_INFO, lo, hi); + ratio = (lo >> 8) & 0x1f; + } else { + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + ratio = (hi >> 8) & 0x1f; + } + pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio); + + if (!ratio) + return 0; + + /* Get FSB FREQ ID */ + rdmsr(MSR_FSB_FREQ, lo, hi); + freq_id = lo & 0x7; + freq = id_to_freq(cpu_index, freq_id); + pr_info("Resolved frequency ID: %u, frequency: %u KHz\n", + freq_id, freq); + if (!freq) + return 0; + + /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ + *fast_calibrate = freq * ratio; + pr_info("TSC runs at %lu KHz\n", *fast_calibrate); + + lapic_timer_frequency = (freq * 1000) / HZ; + pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency); + + return 1; +} -- cgit v1.1 From c026b3591e4f2a4993df773183704bb31634e0bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jan 2014 21:06:03 +0100 Subject: x86, mm, perf: Allow recursive faults from interrupts Waiman managed to trigger a PMI while in a emulate_vsyscall() fault, the PMI in turn managed to trigger a fault while obtaining a stack trace. This triggered the sig_on_uaccess_error recursive fault logic and killed the process dead. Fix this by explicitly excluding interrupts from the recursive fault logic. Reported-and-Tested-by: Waiman Long Fixes: e00b12e64be9 ("perf/x86: Further optimize copy_from_user_nmi()") Cc: Aswin Chandramouleeswaran Cc: Scott J Norton Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Andrew Morton Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140110200603.GJ7572@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9ff85bb..9d591c8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -641,6 +641,20 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) { + /* + * Any interrupt that takes a fault gets the fixup. This makes + * the below recursive fault logic only apply to a faults from + * task context. + */ + if (in_interrupt()) + return; + + /* + * Per the above we're !in_interrupt(), aka. task context. + * + * In this case we need to make sure we're not recursively + * faulting through the emulate_vsyscall() logic. + */ if (current_thread_info()->sig_on_uaccess_error && signal) { tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | PF_USER; @@ -649,6 +663,10 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* XXX: hwpoison faults will set the wrong code. */ force_sig_info_fault(signal, si_code, address, tsk, 0); } + + /* + * Barring that, we can do the fixup and be happy. + */ return; } -- cgit v1.1 From bee09ed91cacdbffdbcd3b05de8409c77ec9fcd6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 15 Jan 2014 15:57:29 +0100 Subject: perf/x86/amd/ibs: Fix waking up from S3 for AMD family 10h On AMD family 10h we see following error messages while waking up from S3 for all non-boot CPUs leading to a failed IBS initialization: Enabling non-boot CPUs ... smpboot: Booting Node 0 Processor 1 APIC 0x1 [Firmware Bug]: cpu 1, try to use APIC500 (LVT offset 0) for vector 0x400, but the register is already in use for vector 0xf9 on another cpu perf: IBS APIC setup failed on cpu #1 process: Switch to broadcast mode on CPU1 CPU1 is up ... ACPI: Waking up from system sleep state S3 Reason for this is that during suspend the LVT offset for the IBS vector gets lost and needs to be reinialized while resuming. The offset is read from the IBSCTL msr. On family 10h the offset needs to be 1 as offset 0 is used for the MCE threshold interrupt, but firmware assings it for IBS to 0 too. The kernel needs to reprogram the vector. The msr is a readonly node msr, but a new value can be written via pci config space access. The reinitialization is implemented for family 10h in setup_ibs_ctl() which is forced during IBS setup. This patch fixes IBS setup after waking up from S3 by adding resume/supend hooks for the boot cpu which does the offset reinitialization. Marking it as stable to let distros pick up this fix. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Cc: v3.2.. Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1389797849-5565-1-git-send-email-rric.net@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 53 +++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index e09f0bf..4b8e4d3 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -816,6 +817,18 @@ out: return ret; } +static void ibs_eilvt_setup(void) +{ + /* + * Force LVT offset assignment for family 10h: The offsets are + * not assigned by the BIOS for this family, so the OS is + * responsible for doing it. If the OS assignment fails, fall + * back to BIOS settings and try to setup this. + */ + if (boot_cpu_data.x86 == 0x10) + force_ibs_eilvt_setup(); +} + static inline int get_ibs_lvt_offset(void) { u64 val; @@ -851,6 +864,36 @@ static void clear_APIC_ibs(void *dummy) setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); } +#ifdef CONFIG_PM + +static int perf_ibs_suspend(void) +{ + clear_APIC_ibs(NULL); + return 0; +} + +static void perf_ibs_resume(void) +{ + ibs_eilvt_setup(); + setup_APIC_ibs(NULL); +} + +static struct syscore_ops perf_ibs_syscore_ops = { + .resume = perf_ibs_resume, + .suspend = perf_ibs_suspend, +}; + +static void perf_ibs_pm_init(void) +{ + register_syscore_ops(&perf_ibs_syscore_ops); +} + +#else + +static inline void perf_ibs_pm_init(void) { } + +#endif + static int perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -877,18 +920,12 @@ static __init int amd_ibs_init(void) if (!caps) return -ENODEV; /* ibs not supported by the cpu */ - /* - * Force LVT offset assignment for family 10h: The offsets are - * not assigned by the BIOS for this family, so the OS is - * responsible for doing it. If the OS assignment fails, fall - * back to BIOS settings and try to setup this. - */ - if (boot_cpu_data.x86 == 0x10) - force_ibs_eilvt_setup(); + ibs_eilvt_setup(); if (!ibs_eilvt_valid()) goto out; + perf_ibs_pm_init(); get_online_cpus(); ibs_caps = caps; /* make ibs_caps visible to other cpus: */ -- cgit v1.1 From ca1e631c3acf80bc5f5934ce9054a9b4880c96e1 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 16 Jan 2014 13:00:21 -0800 Subject: x86, tsc, apic: Unbreak static (MSR) calibration when CONFIG_X86_LOCAL_APIC=n If we aren't going to use the local APIC anyway, we obviously don't care about its timer frequency. Link: http://lkml.kernel.org/r/tip-rgm7xmg7k6qnjlw3ynkcjsmh@git.kernel.org Reported-by: Fengguang Wu Cc: Bin Gao Signed-off-by: H. Peter Anvin --- arch/x86/kernel/tsc_msr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index c502772..8b5434f 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -118,8 +118,10 @@ int try_msr_calibrate_tsc(unsigned long *fast_calibrate) *fast_calibrate = freq * ratio; pr_info("TSC runs at %lu KHz\n", *fast_calibrate); +#ifdef CONFIG_X86_LOCAL_APIC lapic_timer_frequency = (freq * 1000) / HZ; pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency); +#endif return 1; } -- cgit v1.1 From acb20d7395f75b0fdb5b300f9fb3b9c6054fb04f Mon Sep 17 00:00:00 2001 From: David Cohen Date: Thu, 16 Jan 2014 15:07:02 -0800 Subject: x86, intel-mid: sfi_handle_*_dev() should check for pdata error code When Intel MID finds a match between SFI table from FW and registered SFI devices, it will always register a device regardless the platform code was successful or not. This patch adds an extra option for platform code to return error code and abort device registration on SFI table parsing. This patch does not contain any functional changes for current intel mid platform code. Signed-off-by: David Cohen Link: http://lkml.kernel.org/r/1389913624-9149-2-git-send-email-david.a.cohen@linux.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/platform/intel-mid/sfi.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 80a5228..1d5d20b 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -337,6 +337,8 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry, pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n", pentry->name, pentry->irq); pdata = intel_mid_sfi_get_pdata(dev, pentry); + if (IS_ERR(pdata)) + return; pdev = platform_device_alloc(pentry->name, 0); if (pdev == NULL) { @@ -370,6 +372,8 @@ static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry, spi_info.chip_select); pdata = intel_mid_sfi_get_pdata(dev, &spi_info); + if (IS_ERR(pdata)) + return; spi_info.platform_data = pdata; if (dev->delay) @@ -395,6 +399,8 @@ static void __init sfi_handle_i2c_dev(struct sfi_device_table_entry *pentry, i2c_info.addr); pdata = intel_mid_sfi_get_pdata(dev, &i2c_info); i2c_info.platform_data = pdata; + if (IS_ERR(pdata)) + return; if (dev->delay) intel_scu_i2c_device_register(pentry->host_num, &i2c_info); -- cgit v1.1 From a957a14bb4ca976cbaaff3594ef5b8f8f7f65804 Mon Sep 17 00:00:00 2001 From: David Cohen Date: Thu, 16 Jan 2014 15:07:03 -0800 Subject: x86, intel-mid: Check get_gpio_by_name() error code on platform code This patch does cleanup on all intel mid platform code that uses gpio_get_by_name() function. From now on they should check for any error code instead of only hardcoded -1. There are no functional changes from this change. Signed-off-by: David Cohen Link: http://lkml.kernel.org/r/1389913624-9149-3-git-send-email-david.a.cohen@linux.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/platform/intel-mid/device_libs/platform_emc1403.c | 4 +++- arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_lis331.c | 4 +++- arch/x86/platform/intel-mid/device_libs/platform_max7315.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_tca6416.c | 4 ++-- 7 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c index 0d942c1..69a7836 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c @@ -22,7 +22,9 @@ static void __init *emc1403_platform_data(void *info) int intr = get_gpio_by_name("thermal_int"); int intr2nd = get_gpio_by_name("thermal_alert"); - if (intr == -1 || intr2nd == -1) + if (intr < 0) + return NULL; + if (intr2nd < 0) return NULL; i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c index a013a48..dccae6b 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c @@ -66,7 +66,7 @@ static int __init pb_keys_init(void) gb[i].gpio = get_gpio_by_name(gb[i].desc); pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio); - if (gb[i].gpio == -1) + if (gb[i].gpio < 0) continue; if (i != good) diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c index 15278c1..54226de 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c @@ -21,7 +21,9 @@ static void __init *lis331dl_platform_data(void *info) int intr = get_gpio_by_name("accel_int"); int intr2nd = get_gpio_by_name("accel_2"); - if (intr == -1 || intr2nd == -1) + if (intr < 0) + return NULL; + if (intr2nd < 0) return NULL; i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c index 94ade10..2c8acbc 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c @@ -48,7 +48,7 @@ static void __init *max7315_platform_data(void *info) gpio_base = get_gpio_by_name(base_pin_name); intr = get_gpio_by_name(intr_pin_name); - if (gpio_base == -1) + if (gpio_base < 0) return NULL; max7315->gpio_base = gpio_base; if (intr != -1) { diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c index dd28d63..cfe9a47 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c @@ -19,7 +19,7 @@ static void *mpu3050_platform_data(void *info) struct i2c_board_info *i2c_info = info; int intr = get_gpio_by_name("mpu3050_int"); - if (intr == -1) + if (intr < 0) return NULL; i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c index d87182a..65c2a9a 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c @@ -26,7 +26,7 @@ static void __init *pmic_gpio_platform_data(void *info) static struct intel_pmic_gpio_platform_data pmic_gpio_pdata; int gpio_base = get_gpio_by_name("pmic_gpio_base"); - if (gpio_base == -1) + if (gpio_base < 0) gpio_base = 64; pmic_gpio_pdata.gpio_base = gpio_base; pmic_gpio_pdata.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c index 22881c9..33be0b3 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c @@ -34,10 +34,10 @@ static void *tca6416_platform_data(void *info) gpio_base = get_gpio_by_name(base_pin_name); intr = get_gpio_by_name(intr_pin_name); - if (gpio_base == -1) + if (gpio_base < 0) return NULL; tca6416.gpio_base = gpio_base; - if (intr != -1) { + if (intr >= 0) { i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET; tca6416.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET; } else { -- cgit v1.1 From 28c6a39b3367f29462cd586785dc445cd6b5ac23 Mon Sep 17 00:00:00 2001 From: David Cohen Date: Thu, 16 Jan 2014 15:07:04 -0800 Subject: x86, intel-mid: Return proper error code from get_gpio_by_name() This patch cleans up get_gpio_by_name() to return an error code instead of hardcoded -1. Signed-off-by: David Cohen Link: http://lkml.kernel.org/r/1389913624-9149-4-git-send-email-david.a.cohen@linux.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/platform/intel-mid/sfi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 1d5d20b..438306e 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -224,7 +224,7 @@ int get_gpio_by_name(const char *name) if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN)) return pentry->pin_no; } - return -1; + return -EINVAL; } void __init intel_scu_device_register(struct platform_device *pdev) -- cgit v1.1 From ee87c751d88f9b03fee7349556817fe80c113b32 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Thu, 16 Jan 2014 16:13:08 -0800 Subject: x86, intel_mid: Replace memcpy with struct assignment This is a cleanup proposed by coccinelle. It replaces memcpy with struct assignment on intel-mid's sfi layer. Generated by: coccinelle/misc/memcpy-assign.cocci Signed-off-by: Fengguang Wu Link: http://lkml.kernel.org/r/1389917588-9785-1-git-send-email-david.a.cohen@linux.intel.com Signed-off-by: David Cohen Signed-off-by: H. Peter Anvin --- arch/x86/platform/intel-mid/sfi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 438306e..994c40b 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -250,7 +250,7 @@ static void __init intel_scu_spi_device_register(struct spi_board_info *sdev) sdev->modalias); return; } - memcpy(new_dev, sdev, sizeof(*sdev)); + *new_dev = *sdev; spi_devs[spi_next_dev++] = new_dev; } @@ -271,7 +271,7 @@ static void __init intel_scu_i2c_device_register(int bus, idev->type); return; } - memcpy(new_dev, idev, sizeof(*idev)); + *new_dev = *idev; i2c_bus[i2c_next_dev] = bus; i2c_devs[i2c_next_dev++] = new_dev; -- cgit v1.1 From fb09b78151361f5001ad462e4b242b10845830e2 Mon Sep 17 00:00:00 2001 From: Qiaowei Ren Date: Sun, 12 Jan 2014 17:20:02 +0800 Subject: x86, mpx: Add MPX related opcodes to the x86 opcode map This patch adds all the MPX instructions to x86 opcode map, so the x86 instruction decoder can decode MPX instructions. Signed-off-by: Qiaowei Ren Link: http://lkml.kernel.org/r/1389518403-7715-4-git-send-email-qiaowei.ren@intel.com Cc: Masami Hiramatsu Signed-off-by: H. Peter Anvin --- arch/x86/lib/x86-opcode-map.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 533a85e..1a2be7c 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -346,8 +346,8 @@ AVXcode: 1 17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1) 18: Grp16 (1A) 19: -1a: -1b: +1a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv +1b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,GV,Gv 1c: 1d: 1e: -- cgit v1.1 From 741e3902cd89a7fbc04ae53f29a7ca0da452aa8e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Jan 2014 19:51:05 +0100 Subject: x86/intel/mpx: Remove unused LWP structure We don't support LWP yet, don't give the impression that we do: represent the LWP state as opaque 128 bytes, the way Linux sees it currently. Cc: Qiaowei Ren Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ecarmjtfKpanpAapfck6dj6g@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b7845a1..865c399 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -370,15 +370,9 @@ struct ymmh_struct { u32 ymmh_space[64]; }; +/* We don't support LWP yet: */ struct lwp_struct { - u64 lwpcb_addr; - u32 flags; - u32 buf_head_offset; - u64 buf_base; - u32 buf_size; - u32 filters; - u64 saved_event_record[4]; - u32 event_counter[16]; + u8 reserved[128]; }; struct bndregs_struct { -- cgit v1.1