From d751c169e9a6f0f853346f1184881422bd10b3c2 Mon Sep 17 00:00:00 2001
From: Michael Davidson <md@google.com>
Date: Thu, 10 Oct 2013 18:39:54 -0700
Subject: x86, relocs: Add more per-cpu gold special cases

The "gold" linker doesn't seem to put some additional per-cpu cases in
the right place. Add these to the per-cpu check. Without this, the kASLR
patch series fails to correctly apply relocations, and fails to boot.

Signed-off-by: Michael Davidson <md@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: http://lkml.kernel.org/r/20131011013954.GA28902@www.outflux.net
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/tools/relocs.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index f7bab68..71a2533 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -722,15 +722,23 @@ static void percpu_init(void)
 
 /*
  * Check to see if a symbol lies in the .data..percpu section.
- * For some as yet not understood reason the "__init_begin"
- * symbol which immediately preceeds the .data..percpu section
- * also shows up as it it were part of it so we do an explict
- * check for that symbol name and ignore it.
+ *
+ * The linker incorrectly associates some symbols with the
+ * .data..percpu section so we also need to check the symbol
+ * name to make sure that we classify the symbol correctly.
+ *
+ * The GNU linker incorrectly associates:
+ *	__init_begin
+ *
+ * The "gold" linker incorrectly associates:
+ *	init_per_cpu__irq_stack_union
+ *	init_per_cpu__gdt_page
  */
 static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
 {
 	return (sym->st_shndx == per_cpu_shndx) &&
-		strcmp(symname, "__init_begin");
+		strcmp(symname, "__init_begin") &&
+		strncmp(symname, "init_per_cpu_", 13);
 }
 
 
-- 
cgit v1.1


From dd78b97367bd575918204cc89107c1479d3fc1a7 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 10 Oct 2013 17:18:13 -0700
Subject: x86, boot: Move CPU flags out of cpucheck

Refactor the CPU flags handling out of the cpucheck routines so that
they can be reused by the future ASLR routines (in order to detect CPU
features like RDRAND and RDTSC).

This reworks has_eflag() and has_fpu() to be used on both 32-bit and
64-bit, and refactors the calls to cpuid to make them PIC-safe on 32-bit.

Signed-off-by: Kees Cook <keescook@chromium.org>
Link: http://lkml.kernel.org/r/1381450698-28710-2-git-send-email-keescook@chromium.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/Makefile              |   2 +-
 arch/x86/boot/boot.h                |  10 +---
 arch/x86/boot/compressed/Makefile   |   2 +-
 arch/x86/boot/compressed/cpuflags.c |  12 +++++
 arch/x86/boot/cpucheck.c            |  86 -----------------------------
 arch/x86/boot/cpuflags.c            | 104 ++++++++++++++++++++++++++++++++++++
 arch/x86/boot/cpuflags.h            |  19 +++++++
 7 files changed, 138 insertions(+), 97 deletions(-)
 create mode 100644 arch/x86/boot/compressed/cpuflags.c
 create mode 100644 arch/x86/boot/cpuflags.c
 create mode 100644 arch/x86/boot/cpuflags.h

(limited to 'arch/x86')

diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 379814b..0da2e37 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -20,7 +20,7 @@ targets		:= vmlinux.bin setup.bin setup.elf bzImage
 targets		+= fdimage fdimage144 fdimage288 image.iso mtools.conf
 subdir-		:= compressed
 
-setup-y		+= a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o
+setup-y		+= a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o
 setup-y		+= early_serial_console.o edd.o header.o main.o mca.o memory.o
 setup-y		+= pm.o pmjump.o printf.o regs.o string.o tty.o video.o
 setup-y		+= video-mode.o version.o
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index ef72bae..50f8c5e 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -26,9 +26,8 @@
 #include <asm/boot.h>
 #include <asm/setup.h>
 #include "bitops.h"
-#include <asm/cpufeature.h>
-#include <asm/processor-flags.h>
 #include "ctype.h"
+#include "cpuflags.h"
 
 /* Useful macros */
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -307,14 +306,7 @@ static inline int cmdline_find_option_bool(const char *option)
 	return __cmdline_find_option_bool(cmd_line_ptr, option);
 }
 
-
 /* cpu.c, cpucheck.c */
-struct cpu_features {
-	int level;		/* Family, or 64 for x86-64 */
-	int model;
-	u32 flags[NCAPINTS];
-};
-extern struct cpu_features cpu;
 int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
 int validate_cpu(void);
 
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index dcd90df..3312f1be 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -27,7 +27,7 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include
 
 VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
 	$(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
-	$(obj)/piggy.o
+	$(obj)/piggy.o $(obj)/cpuflags.o
 
 $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
 
diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c
new file mode 100644
index 0000000..931cba6
--- /dev/null
+++ b/arch/x86/boot/compressed/cpuflags.c
@@ -0,0 +1,12 @@
+#ifdef CONFIG_RANDOMIZE_BASE
+
+#include "../cpuflags.c"
+
+bool has_cpuflag(int flag)
+{
+	get_flags();
+
+	return test_bit(flag, cpu.flags);
+}
+
+#endif
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 4d3ff03..e1f3c16 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -28,8 +28,6 @@
 #include <asm/required-features.h>
 #include <asm/msr-index.h>
 
-struct cpu_features cpu;
-static u32 cpu_vendor[3];
 static u32 err_flags[NCAPINTS];
 
 static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY;
@@ -69,90 +67,6 @@ static int is_transmeta(void)
 	       cpu_vendor[2] == A32('M', 'x', '8', '6');
 }
 
-static int has_fpu(void)
-{
-	u16 fcw = -1, fsw = -1;
-	u32 cr0;
-
-	asm("movl %%cr0,%0" : "=r" (cr0));
-	if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
-		cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
-		asm volatile("movl %0,%%cr0" : : "r" (cr0));
-	}
-
-	asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
-		     : "+m" (fsw), "+m" (fcw));
-
-	return fsw == 0 && (fcw & 0x103f) == 0x003f;
-}
-
-static int has_eflag(u32 mask)
-{
-	u32 f0, f1;
-
-	asm("pushfl ; "
-	    "pushfl ; "
-	    "popl %0 ; "
-	    "movl %0,%1 ; "
-	    "xorl %2,%1 ; "
-	    "pushl %1 ; "
-	    "popfl ; "
-	    "pushfl ; "
-	    "popl %1 ; "
-	    "popfl"
-	    : "=&r" (f0), "=&r" (f1)
-	    : "ri" (mask));
-
-	return !!((f0^f1) & mask);
-}
-
-static void get_flags(void)
-{
-	u32 max_intel_level, max_amd_level;
-	u32 tfms;
-
-	if (has_fpu())
-		set_bit(X86_FEATURE_FPU, cpu.flags);
-
-	if (has_eflag(X86_EFLAGS_ID)) {
-		asm("cpuid"
-		    : "=a" (max_intel_level),
-		      "=b" (cpu_vendor[0]),
-		      "=d" (cpu_vendor[1]),
-		      "=c" (cpu_vendor[2])
-		    : "a" (0));
-
-		if (max_intel_level >= 0x00000001 &&
-		    max_intel_level <= 0x0000ffff) {
-			asm("cpuid"
-			    : "=a" (tfms),
-			      "=c" (cpu.flags[4]),
-			      "=d" (cpu.flags[0])
-			    : "a" (0x00000001)
-			    : "ebx");
-			cpu.level = (tfms >> 8) & 15;
-			cpu.model = (tfms >> 4) & 15;
-			if (cpu.level >= 6)
-				cpu.model += ((tfms >> 16) & 0xf) << 4;
-		}
-
-		asm("cpuid"
-		    : "=a" (max_amd_level)
-		    : "a" (0x80000000)
-		    : "ebx", "ecx", "edx");
-
-		if (max_amd_level >= 0x80000001 &&
-		    max_amd_level <= 0x8000ffff) {
-			u32 eax = 0x80000001;
-			asm("cpuid"
-			    : "+a" (eax),
-			      "=c" (cpu.flags[6]),
-			      "=d" (cpu.flags[1])
-			    : : "ebx");
-		}
-	}
-}
-
 /* Returns a bitmask of which words we have error bits in */
 static int check_flags(void)
 {
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
new file mode 100644
index 0000000..b02544a
--- /dev/null
+++ b/arch/x86/boot/cpuflags.c
@@ -0,0 +1,104 @@
+#include <linux/types.h>
+#include "bitops.h"
+
+#include <asm/processor-flags.h>
+#include <asm/required-features.h>
+#include <asm/msr-index.h>
+#include "cpuflags.h"
+
+struct cpu_features cpu;
+u32 cpu_vendor[3];
+
+static bool loaded_flags;
+
+static int has_fpu(void)
+{
+	u16 fcw = -1, fsw = -1;
+	unsigned long cr0;
+
+	asm volatile("mov %%cr0,%0" : "=r" (cr0));
+	if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
+		cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
+		asm volatile("mov %0,%%cr0" : : "r" (cr0));
+	}
+
+	asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
+		     : "+m" (fsw), "+m" (fcw));
+
+	return fsw == 0 && (fcw & 0x103f) == 0x003f;
+}
+
+int has_eflag(unsigned long mask)
+{
+	unsigned long f0, f1;
+
+	asm volatile("pushf	\n\t"
+		     "pushf	\n\t"
+		     "pop %0	\n\t"
+		     "mov %0,%1	\n\t"
+		     "xor %2,%1	\n\t"
+		     "push %1	\n\t"
+		     "popf	\n\t"
+		     "pushf	\n\t"
+		     "pop %1	\n\t"
+		     "popf"
+		     : "=&r" (f0), "=&r" (f1)
+		     : "ri" (mask));
+
+	return !!((f0^f1) & mask);
+}
+
+/* Handle x86_32 PIC using ebx. */
+#if defined(__i386__) && defined(__PIC__)
+# define EBX_REG "=r"
+#else
+# define EBX_REG "=b"
+#endif
+
+static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d)
+{
+	asm volatile(".ifnc %%ebx,%3 ; movl  %%ebx,%3 ; .endif	\n\t"
+		     "cpuid					\n\t"
+		     ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif	\n\t"
+		    : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
+		    : "a" (id)
+	);
+}
+
+void get_flags(void)
+{
+	u32 max_intel_level, max_amd_level;
+	u32 tfms;
+	u32 ignored;
+
+	if (loaded_flags)
+		return;
+	loaded_flags = true;
+
+	if (has_fpu())
+		set_bit(X86_FEATURE_FPU, cpu.flags);
+
+	if (has_eflag(X86_EFLAGS_ID)) {
+		cpuid(0x0, &max_intel_level, &cpu_vendor[0], &cpu_vendor[2],
+		      &cpu_vendor[1]);
+
+		if (max_intel_level >= 0x00000001 &&
+		    max_intel_level <= 0x0000ffff) {
+			cpuid(0x1, &tfms, &ignored, &cpu.flags[4],
+			      &cpu.flags[0]);
+			cpu.level = (tfms >> 8) & 15;
+			cpu.model = (tfms >> 4) & 15;
+			if (cpu.level >= 6)
+				cpu.model += ((tfms >> 16) & 0xf) << 4;
+		}
+
+		cpuid(0x80000000, &max_amd_level, &ignored, &ignored,
+		      &ignored);
+
+		if (max_amd_level >= 0x80000001 &&
+		    max_amd_level <= 0x8000ffff) {
+			cpuid(0x80000001, &ignored, &ignored, &cpu.flags[6],
+			      &cpu.flags[1]);
+		}
+	}
+}
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
new file mode 100644
index 0000000..9bb4e25
--- /dev/null
+++ b/arch/x86/boot/cpuflags.h
@@ -0,0 +1,19 @@
+#ifndef BOOT_CPUFLAGS_H
+#define BOOT_CPUFLAGS_H
+
+#include <asm/cpufeature.h>
+#include <asm/processor-flags.h>
+
+struct cpu_features {
+	int level;		/* Family, or 64 for x86-64 */
+	int model;
+	u32 flags[NCAPINTS];
+};
+
+extern struct cpu_features cpu;
+extern u32 cpu_vendor[3];
+
+int has_eflag(unsigned long mask);
+void get_flags(void);
+
+#endif
-- 
cgit v1.1


From 8ab3820fd5b2896d66da7bb2a906bc382e63e7bc Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 10 Oct 2013 17:18:14 -0700
Subject: x86, kaslr: Return location from decompress_kernel

This allows decompress_kernel to return a new location for the kernel to
be relocated to. Additionally, enforces CONFIG_PHYSICAL_START as the
minimum relocation position when building with CONFIG_RELOCATABLE.

With CONFIG_RANDOMIZE_BASE set, the choose_kernel_location routine
will select a new location to decompress the kernel, though here it is
presently a no-op. The kernel command line option "nokaslr" is introduced
to bypass these routines.

Signed-off-by: Kees Cook <keescook@chromium.org>
Link: http://lkml.kernel.org/r/1381450698-28710-3-git-send-email-keescook@chromium.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/Kconfig                   | 38 ++++++++++++++++++++++++++++++++++----
 arch/x86/boot/compressed/Makefile  |  2 +-
 arch/x86/boot/compressed/aslr.c    | 23 +++++++++++++++++++++++
 arch/x86/boot/compressed/cmdline.c |  2 +-
 arch/x86/boot/compressed/head_32.S | 10 ++++++----
 arch/x86/boot/compressed/head_64.S | 16 ++++++++++------
 arch/x86/boot/compressed/misc.c    |  8 ++++++--
 arch/x86/boot/compressed/misc.h    | 27 +++++++++++++++++++++------
 8 files changed, 102 insertions(+), 24 deletions(-)
 create mode 100644 arch/x86/boot/compressed/aslr.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ee2fb9d..992701d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1722,16 +1722,46 @@ config RELOCATABLE
 
 	  Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address
 	  it has been loaded at and the compile time physical address
-	  (CONFIG_PHYSICAL_START) is ignored.
+	  (CONFIG_PHYSICAL_START) is used as the minimum location.
 
-# Relocation on x86-32 needs some additional build support
+config RANDOMIZE_BASE
+	bool "Randomize the address of the kernel image"
+	depends on RELOCATABLE
+	depends on !HIBERNATION
+	default n
+	---help---
+	   Randomizes the physical and virtual address at which the
+	   kernel image is decompressed, as a security feature that
+	   deters exploit attempts relying on knowledge of the location
+	   of kernel internals.
+
+	   Entropy is generated using the RDRAND instruction if it
+	   is supported.  If not, then RDTSC is used, if supported. If
+	   neither RDRAND nor RDTSC are supported, then no randomness
+	   is introduced.
+
+	   The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET,
+	   and aligned according to PHYSICAL_ALIGN.
+
+config RANDOMIZE_BASE_MAX_OFFSET
+	hex "Maximum ASLR offset allowed"
+	depends on RANDOMIZE_BASE
+	default "0x10000000"
+	range 0x0 0x10000000
+	---help---
+	 Determines the maximal offset in bytes that will be applied to the
+	 kernel when Address Space Layout Randomization (ASLR) is active.
+	 Must be less than or equal to the actual physical memory on the
+	 system. This must be a power of two.
+
+# Relocation on x86 needs some additional build support
 config X86_NEED_RELOCS
 	def_bool y
-	depends on X86_32 && RELOCATABLE
+	depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE)
 
 config PHYSICAL_ALIGN
 	hex "Alignment value to which kernel should be aligned"
-	default "0x1000000"
+	default "0x200000"
 	range 0x2000 0x1000000 if X86_32
 	range 0x200000 0x1000000 if X86_64
 	---help---
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 3312f1be..ae8b5db 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -27,7 +27,7 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include
 
 VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
 	$(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
-	$(obj)/piggy.o $(obj)/cpuflags.o
+	$(obj)/piggy.o $(obj)/cpuflags.o $(obj)/aslr.o
 
 $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
 
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
new file mode 100644
index 0000000..b73cc66
--- /dev/null
+++ b/arch/x86/boot/compressed/aslr.c
@@ -0,0 +1,23 @@
+#include "misc.h"
+
+#ifdef CONFIG_RANDOMIZE_BASE
+
+unsigned char *choose_kernel_location(unsigned char *input,
+				      unsigned long input_size,
+				      unsigned char *output,
+				      unsigned long output_size)
+{
+	unsigned long choice = (unsigned long)output;
+
+	if (cmdline_find_option_bool("nokaslr")) {
+		debug_putstr("KASLR disabled...\n");
+		goto out;
+	}
+
+	/* XXX: choose random location. */
+
+out:
+	return (unsigned char *)choice;
+}
+
+#endif /* CONFIG_RANDOMIZE_BASE */
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index bffd73b..b68e303 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,6 +1,6 @@
 #include "misc.h"
 
-#ifdef CONFIG_EARLY_PRINTK
+#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE
 
 static unsigned long fs;
 static inline void set_fs(unsigned long seg)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 5d6f6891..9116aac 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -117,9 +117,11 @@ preferred_addr:
 	addl    %eax, %ebx
 	notl	%eax
 	andl    %eax, %ebx
-#else
-	movl	$LOAD_PHYSICAL_ADDR, %ebx
+	cmpl	$LOAD_PHYSICAL_ADDR, %ebx
+	jge	1f
 #endif
+	movl	$LOAD_PHYSICAL_ADDR, %ebx
+1:
 
 	/* Target address to relocate to for decompression */
 	addl	$z_extract_offset, %ebx
@@ -191,14 +193,14 @@ relocated:
 	leal	boot_heap(%ebx), %eax
 	pushl	%eax		/* heap area */
 	pushl	%esi		/* real mode pointer */
-	call	decompress_kernel
+	call	decompress_kernel /* returns kernel location in %eax */
 	addl	$24, %esp
 
 /*
  * Jump to the decompressed kernel.
  */
 	xorl	%ebx, %ebx
-	jmp	*%ebp
+	jmp	*%eax
 
 /*
  * Stack and heap for uncompression
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index c337422..c5c1ae0 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -94,9 +94,11 @@ ENTRY(startup_32)
 	addl	%eax, %ebx
 	notl	%eax
 	andl	%eax, %ebx
-#else
-	movl	$LOAD_PHYSICAL_ADDR, %ebx
+	cmpl	$LOAD_PHYSICAL_ADDR, %ebx
+	jge	1f
 #endif
+	movl	$LOAD_PHYSICAL_ADDR, %ebx
+1:
 
 	/* Target address to relocate to for decompression */
 	addl	$z_extract_offset, %ebx
@@ -269,9 +271,11 @@ preferred_addr:
 	addq	%rax, %rbp
 	notq	%rax
 	andq	%rax, %rbp
-#else
-	movq	$LOAD_PHYSICAL_ADDR, %rbp
+	cmpq	$LOAD_PHYSICAL_ADDR, %rbp
+	jge	1f
 #endif
+	movq	$LOAD_PHYSICAL_ADDR, %rbp
+1:
 
 	/* Target address to relocate to for decompression */
 	leaq	z_extract_offset(%rbp), %rbx
@@ -339,13 +343,13 @@ relocated:
 	movl	$z_input_len, %ecx	/* input_len */
 	movq	%rbp, %r8		/* output target address */
 	movq	$z_output_len, %r9	/* decompressed length */
-	call	decompress_kernel
+	call	decompress_kernel	/* returns kernel location in %rax */
 	popq	%rsi
 
 /*
  * Jump to the decompressed kernel.
  */
-	jmp	*%rbp
+	jmp	*%rax
 
 	.code32
 no_longmode:
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 434f077..7138768 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -395,7 +395,7 @@ static void parse_elf(void *output)
 	free(phdrs);
 }
 
-asmlinkage void decompress_kernel(void *rmode, memptr heap,
+asmlinkage void *decompress_kernel(void *rmode, memptr heap,
 				  unsigned char *input_data,
 				  unsigned long input_len,
 				  unsigned char *output,
@@ -422,6 +422,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 	free_mem_ptr     = heap;	/* Heap */
 	free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
 
+	output = choose_kernel_location(input_data, input_len,
+					output, output_len);
+
+	/* Validate memory location choices. */
 	if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
 		error("Destination address inappropriately aligned");
 #ifdef CONFIG_X86_64
@@ -441,5 +445,5 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 	parse_elf(output);
 	handle_relocations(output, output_len);
 	debug_putstr("done.\nBooting the kernel.\n");
-	return;
+	return output;
 }
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 674019d..9077af7 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -39,23 +39,38 @@ static inline void debug_putstr(const char *s)
 
 #endif
 
-#ifdef CONFIG_EARLY_PRINTK
-
+#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE
 /* cmdline.c */
 int cmdline_find_option(const char *option, char *buffer, int bufsize);
 int cmdline_find_option_bool(const char *option);
+#endif
 
-/* early_serial_console.c */
-extern int early_serial_base;
-void console_init(void);
 
+#if CONFIG_RANDOMIZE_BASE
+/* aslr.c */
+unsigned char *choose_kernel_location(unsigned char *input,
+				      unsigned long input_size,
+				      unsigned char *output,
+				      unsigned long output_size);
 #else
+static inline
+unsigned char *choose_kernel_location(unsigned char *input,
+				      unsigned long input_size,
+				      unsigned char *output,
+				      unsigned long output_size)
+{
+	return output;
+}
+#endif
 
+#ifdef CONFIG_EARLY_PRINTK
 /* early_serial_console.c */
+extern int early_serial_base;
+void console_init(void);
+#else
 static const int early_serial_base;
 static inline void console_init(void)
 { }
-
 #endif
 
 #endif
-- 
cgit v1.1


From 5bfce5ef55cbe78ee2ee6e97f2e26a8a582008f3 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 10 Oct 2013 17:18:15 -0700
Subject: x86, kaslr: Provide randomness functions

Adds potential sources of randomness: RDRAND, RDTSC, or the i8254.

This moves the pre-alternatives inline rdrand function into the header so
both pieces of code can use it. Availability of RDRAND is then controlled
by CONFIG_ARCH_RANDOM, if someone wants to disable it even for kASLR.

Signed-off-by: Kees Cook <keescook@chromium.org>
Link: http://lkml.kernel.org/r/1381450698-28710-4-git-send-email-keescook@chromium.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/compressed/aslr.c   | 53 +++++++++++++++++++++++++++++++++++++++
 arch/x86/boot/compressed/misc.h   |  2 ++
 arch/x86/include/asm/archrandom.h | 21 ++++++++++++++++
 arch/x86/kernel/cpu/rdrand.c      | 14 -----------
 4 files changed, 76 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index b73cc66..14b24e0 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -1,6 +1,59 @@
 #include "misc.h"
 
 #ifdef CONFIG_RANDOMIZE_BASE
+#include <asm/msr.h>
+#include <asm/archrandom.h>
+
+#define I8254_PORT_CONTROL	0x43
+#define I8254_PORT_COUNTER0	0x40
+#define I8254_CMD_READBACK	0xC0
+#define I8254_SELECT_COUNTER0	0x02
+#define I8254_STATUS_NOTREADY	0x40
+static inline u16 i8254(void)
+{
+	u16 status, timer;
+
+	do {
+		outb(I8254_PORT_CONTROL,
+		     I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
+		status = inb(I8254_PORT_COUNTER0);
+		timer  = inb(I8254_PORT_COUNTER0);
+		timer |= inb(I8254_PORT_COUNTER0) << 8;
+	} while (status & I8254_STATUS_NOTREADY);
+
+	return timer;
+}
+
+static unsigned long get_random_long(void)
+{
+	unsigned long random;
+
+	if (has_cpuflag(X86_FEATURE_RDRAND)) {
+		debug_putstr("KASLR using RDRAND...\n");
+		if (rdrand_long(&random))
+			return random;
+	}
+
+	if (has_cpuflag(X86_FEATURE_TSC)) {
+		uint32_t raw;
+
+		debug_putstr("KASLR using RDTSC...\n");
+		rdtscl(raw);
+
+		/* Only use the low bits of rdtsc. */
+		random = raw & 0xffff;
+	} else {
+		debug_putstr("KASLR using i8254...\n");
+		random = i8254();
+	}
+
+	/* Extend timer bits poorly... */
+	random |= (random << 16);
+#ifdef CONFIG_X86_64
+	random |= (random << 32);
+#endif
+	return random;
+}
 
 unsigned char *choose_kernel_location(unsigned char *input,
 				      unsigned long input_size,
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 9077af7..0782eb0 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -52,6 +52,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
 				      unsigned long input_size,
 				      unsigned char *output,
 				      unsigned long output_size);
+/* cpuflags.c */
+bool has_cpuflag(int flag);
 #else
 static inline
 unsigned char *choose_kernel_location(unsigned char *input,
diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h
index 0d9ec77..e6a9245 100644
--- a/arch/x86/include/asm/archrandom.h
+++ b/arch/x86/include/asm/archrandom.h
@@ -39,6 +39,20 @@
 
 #ifdef CONFIG_ARCH_RANDOM
 
+/* Instead of arch_get_random_long() when alternatives haven't run. */
+static inline int rdrand_long(unsigned long *v)
+{
+	int ok;
+	asm volatile("1: " RDRAND_LONG "\n\t"
+		     "jc 2f\n\t"
+		     "decl %0\n\t"
+		     "jnz 1b\n\t"
+		     "2:"
+		     : "=r" (ok), "=a" (*v)
+		     : "0" (RDRAND_RETRY_LOOPS));
+	return ok;
+}
+
 #define GET_RANDOM(name, type, rdrand, nop)			\
 static inline int name(type *v)					\
 {								\
@@ -68,6 +82,13 @@ GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3);
 
 #endif /* CONFIG_X86_64 */
 
+#else
+
+static inline int rdrand_long(unsigned long *v)
+{
+	return 0;
+}
+
 #endif  /* CONFIG_ARCH_RANDOM */
 
 extern void x86_init_rdrand(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 88db010..384df51 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -31,20 +31,6 @@ static int __init x86_rdrand_setup(char *s)
 }
 __setup("nordrand", x86_rdrand_setup);
 
-/* We can't use arch_get_random_long() here since alternatives haven't run */
-static inline int rdrand_long(unsigned long *v)
-{
-	int ok;
-	asm volatile("1: " RDRAND_LONG "\n\t"
-		     "jc 2f\n\t"
-		     "decl %0\n\t"
-		     "jnz 1b\n\t"
-		     "2:"
-		     : "=r" (ok), "=a" (*v)
-		     : "0" (RDRAND_RETRY_LOOPS));
-	return ok;
-}
-
 /*
  * Force a reseed cycle; we are architecturally guaranteed a reseed
  * after no more than 512 128-bit chunks of random data.  This also
-- 
cgit v1.1


From 82fa9637a2ba285bcc7c5050c73010b2c1b3d803 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 10 Oct 2013 17:18:16 -0700
Subject: x86, kaslr: Select random position from e820 maps

Counts available alignment positions across all e820 maps, and chooses
one randomly for the new kernel base address, making sure not to collide
with unsafe memory areas.

Signed-off-by: Kees Cook <keescook@chromium.org>
Link: http://lkml.kernel.org/r/1381450698-28710-5-git-send-email-keescook@chromium.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/compressed/aslr.c | 193 +++++++++++++++++++++++++++++++++++++++-
 arch/x86/boot/compressed/misc.c |  10 +--
 arch/x86/boot/compressed/misc.h |   8 ++
 3 files changed, 202 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 14b24e0..0595798 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -3,6 +3,7 @@
 #ifdef CONFIG_RANDOMIZE_BASE
 #include <asm/msr.h>
 #include <asm/archrandom.h>
+#include <asm/e820.h>
 
 #define I8254_PORT_CONTROL	0x43
 #define I8254_PORT_COUNTER0	0x40
@@ -55,20 +56,210 @@ static unsigned long get_random_long(void)
 	return random;
 }
 
+struct mem_vector {
+	unsigned long start;
+	unsigned long size;
+};
+
+#define MEM_AVOID_MAX 5
+struct mem_vector mem_avoid[MEM_AVOID_MAX];
+
+static bool mem_contains(struct mem_vector *region, struct mem_vector *item)
+{
+	/* Item at least partially before region. */
+	if (item->start < region->start)
+		return false;
+	/* Item at least partially after region. */
+	if (item->start + item->size > region->start + region->size)
+		return false;
+	return true;
+}
+
+static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
+{
+	/* Item one is entirely before item two. */
+	if (one->start + one->size <= two->start)
+		return false;
+	/* Item one is entirely after item two. */
+	if (one->start >= two->start + two->size)
+		return false;
+	return true;
+}
+
+static void mem_avoid_init(unsigned long input, unsigned long input_size,
+			   unsigned long output, unsigned long output_size)
+{
+	u64 initrd_start, initrd_size;
+	u64 cmd_line, cmd_line_size;
+	unsigned long unsafe, unsafe_len;
+	char *ptr;
+
+	/*
+	 * Avoid the region that is unsafe to overlap during
+	 * decompression (see calculations at top of misc.c).
+	 */
+	unsafe_len = (output_size >> 12) + 32768 + 18;
+	unsafe = (unsigned long)input + input_size - unsafe_len;
+	mem_avoid[0].start = unsafe;
+	mem_avoid[0].size = unsafe_len;
+
+	/* Avoid initrd. */
+	initrd_start  = (u64)real_mode->ext_ramdisk_image << 32;
+	initrd_start |= real_mode->hdr.ramdisk_image;
+	initrd_size  = (u64)real_mode->ext_ramdisk_size << 32;
+	initrd_size |= real_mode->hdr.ramdisk_size;
+	mem_avoid[1].start = initrd_start;
+	mem_avoid[1].size = initrd_size;
+
+	/* Avoid kernel command line. */
+	cmd_line  = (u64)real_mode->ext_cmd_line_ptr << 32;
+	cmd_line |= real_mode->hdr.cmd_line_ptr;
+	/* Calculate size of cmd_line. */
+	ptr = (char *)(unsigned long)cmd_line;
+	for (cmd_line_size = 0; ptr[cmd_line_size++]; )
+		;
+	mem_avoid[2].start = cmd_line;
+	mem_avoid[2].size = cmd_line_size;
+
+	/* Avoid heap memory. */
+	mem_avoid[3].start = (unsigned long)free_mem_ptr;
+	mem_avoid[3].size = BOOT_HEAP_SIZE;
+
+	/* Avoid stack memory. */
+	mem_avoid[4].start = (unsigned long)free_mem_end_ptr;
+	mem_avoid[4].size = BOOT_STACK_SIZE;
+}
+
+/* Does this memory vector overlap a known avoided area? */
+bool mem_avoid_overlap(struct mem_vector *img)
+{
+	int i;
+
+	for (i = 0; i < MEM_AVOID_MAX; i++) {
+		if (mem_overlaps(img, &mem_avoid[i]))
+			return true;
+	}
+
+	return false;
+}
+
+unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / CONFIG_PHYSICAL_ALIGN];
+unsigned long slot_max = 0;
+
+static void slots_append(unsigned long addr)
+{
+	/* Overflowing the slots list should be impossible. */
+	if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET /
+			CONFIG_PHYSICAL_ALIGN)
+		return;
+
+	slots[slot_max++] = addr;
+}
+
+static unsigned long slots_fetch_random(void)
+{
+	/* Handle case of no slots stored. */
+	if (slot_max == 0)
+		return 0;
+
+	return slots[get_random_long() % slot_max];
+}
+
+static void process_e820_entry(struct e820entry *entry,
+			       unsigned long minimum,
+			       unsigned long image_size)
+{
+	struct mem_vector region, img;
+
+	/* Skip non-RAM entries. */
+	if (entry->type != E820_RAM)
+		return;
+
+	/* Ignore entries entirely above our maximum. */
+	if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
+		return;
+
+	/* Ignore entries entirely below our minimum. */
+	if (entry->addr + entry->size < minimum)
+		return;
+
+	region.start = entry->addr;
+	region.size = entry->size;
+
+	/* Potentially raise address to minimum location. */
+	if (region.start < minimum)
+		region.start = minimum;
+
+	/* Potentially raise address to meet alignment requirements. */
+	region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
+
+	/* Did we raise the address above the bounds of this e820 region? */
+	if (region.start > entry->addr + entry->size)
+		return;
+
+	/* Reduce size by any delta from the original address. */
+	region.size -= region.start - entry->addr;
+
+	/* Reduce maximum size to fit end of image within maximum limit. */
+	if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
+		region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start;
+
+	/* Walk each aligned slot and check for avoided areas. */
+	for (img.start = region.start, img.size = image_size ;
+	     mem_contains(&region, &img) ;
+	     img.start += CONFIG_PHYSICAL_ALIGN) {
+		if (mem_avoid_overlap(&img))
+			continue;
+		slots_append(img.start);
+	}
+}
+
+static unsigned long find_random_addr(unsigned long minimum,
+				      unsigned long size)
+{
+	int i;
+	unsigned long addr;
+
+	/* Make sure minimum is aligned. */
+	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+
+	/* Verify potential e820 positions, appending to slots list. */
+	for (i = 0; i < real_mode->e820_entries; i++) {
+		process_e820_entry(&real_mode->e820_map[i], minimum, size);
+	}
+
+	return slots_fetch_random();
+}
+
 unsigned char *choose_kernel_location(unsigned char *input,
 				      unsigned long input_size,
 				      unsigned char *output,
 				      unsigned long output_size)
 {
 	unsigned long choice = (unsigned long)output;
+	unsigned long random;
 
 	if (cmdline_find_option_bool("nokaslr")) {
 		debug_putstr("KASLR disabled...\n");
 		goto out;
 	}
 
-	/* XXX: choose random location. */
+	/* Record the various known unsafe memory ranges. */
+	mem_avoid_init((unsigned long)input, input_size,
+		       (unsigned long)output, output_size);
+
+	/* Walk e820 and find a random address. */
+	random = find_random_addr(choice, output_size);
+	if (!random) {
+		debug_putstr("KASLR could not find suitable E820 region...\n");
+		goto out;
+	}
+
+	/* Always enforce the minimum. */
+	if (random < choice)
+		goto out;
 
+	choice = random;
 out:
 	return (unsigned char *)choice;
 }
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 7138768..196eaf3 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -112,14 +112,8 @@ struct boot_params *real_mode;		/* Pointer to real-mode data */
 void *memset(void *s, int c, size_t n);
 void *memcpy(void *dest, const void *src, size_t n);
 
-#ifdef CONFIG_X86_64
-#define memptr long
-#else
-#define memptr unsigned
-#endif
-
-static memptr free_mem_ptr;
-static memptr free_mem_end_ptr;
+memptr free_mem_ptr;
+memptr free_mem_end_ptr;
 
 static char *vidmem;
 static int vidport;
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 0782eb0..24e3e56 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -23,7 +23,15 @@
 #define BOOT_BOOT_H
 #include "../ctype.h"
 
+#ifdef CONFIG_X86_64
+#define memptr long
+#else
+#define memptr unsigned
+#endif
+
 /* misc.c */
+extern memptr free_mem_ptr;
+extern memptr free_mem_end_ptr;
 extern struct boot_params *real_mode;		/* Pointer to real-mode data */
 void __putstr(const char *s);
 #define error_putstr(__x)  __putstr(__x)
-- 
cgit v1.1


From f32360ef6608434a032dc7ad262d45e9693c27f3 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 10 Oct 2013 17:18:17 -0700
Subject: x86, kaslr: Report kernel offset on panic

When the system panics, include the kernel offset in the report to assist
in debugging.

Signed-off-by: Kees Cook <keescook@chromium.org>
Link: http://lkml.kernel.org/r/1381450698-28710-6-git-send-email-keescook@chromium.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/setup.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f0de629..1708862 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -824,6 +824,20 @@ static void __init trim_low_memory_range(void)
 }
 	
 /*
+ * Dump out kernel offset information on panic.
+ */
+static int
+dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
+{
+	pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
+		 "(relocation range: 0x%lx-0x%lx)\n",
+		 (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
+		 __START_KERNEL_map, MODULES_VADDR-1);
+
+	return 0;
+}
+
+/*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
  * for initialization.  Note, the efi init code path is determined by the
@@ -1242,3 +1256,15 @@ void __init i386_reserve_resources(void)
 }
 
 #endif /* CONFIG_X86_32 */
+
+static struct notifier_block kernel_offset_notifier = {
+	.notifier_call = dump_kernel_offset
+};
+
+static int __init register_kernel_offset_dumper(void)
+{
+	atomic_notifier_chain_register(&panic_notifier_list,
+					&kernel_offset_notifier);
+	return 0;
+}
+__initcall(register_kernel_offset_dumper);
-- 
cgit v1.1


From 6145cfe394a7f138f6b64491c5663f97dba12450 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 10 Oct 2013 17:18:18 -0700
Subject: x86, kaslr: Raise the maximum virtual address to -1 GiB on x86_64

On 64-bit, this raises the maximum location to -1 GiB (from -1.5 GiB),
the upper limit currently, since the kernel fixmap page mappings need
to be moved to use the other 1 GiB (which would be the theoretical
limit when building with -mcmodel=kernel).

Signed-off-by: Kees Cook <keescook@chromium.org>
Link: http://lkml.kernel.org/r/1381450698-28710-7-git-send-email-keescook@chromium.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/Kconfig                        | 16 +++++++++++++---
 arch/x86/include/asm/page_64_types.h    | 15 ++++++++++++---
 arch/x86/include/asm/pgtable_64_types.h |  2 +-
 arch/x86/mm/init_32.c                   |  3 +++
 4 files changed, 29 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 992701d..51f4399 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1746,13 +1746,23 @@ config RANDOMIZE_BASE
 config RANDOMIZE_BASE_MAX_OFFSET
 	hex "Maximum ASLR offset allowed"
 	depends on RANDOMIZE_BASE
-	default "0x10000000"
-	range 0x0 0x10000000
+	range 0x0 0x20000000 if X86_32
+	default "0x20000000" if X86_32
+	range 0x0 0x40000000 if X86_64
+	default "0x40000000" if X86_64
 	---help---
 	 Determines the maximal offset in bytes that will be applied to the
 	 kernel when Address Space Layout Randomization (ASLR) is active.
 	 Must be less than or equal to the actual physical memory on the
-	 system. This must be a power of two.
+	 system. This must be a multiple of CONFIG_PHYSICAL_ALIGN.
+
+	 On 32-bit this is limited to 512MiB.
+
+	 On 64-bit this is limited by how the kernel fixmap page table is
+	 positioned, so this cannot be larger that 1GiB currently. Normally
+	 there is a 512MiB to 1.5GiB split between kernel and modules. When
+	 this is raised above the 512MiB default, the modules area will
+	 shrink to compensate, up to the current maximum 1GiB to 1GiB split.
 
 # Relocation on x86 needs some additional build support
 config X86_NEED_RELOCS
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 43dcd80..8de6d9c 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -39,9 +39,18 @@
 #define __VIRTUAL_MASK_SHIFT	47
 
 /*
- * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
- * arch/x86/kernel/head_64.S), and it is mapped here:
+ * Kernel image size is limited to 1GiB due to the fixmap living in the
+ * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
+ * 512MiB by default, leaving 1.5GiB for modules once the page tables
+ * are fully set up. If kernel ASLR is configured, it can extend the
+ * kernel page table mapping, reducing the size of the modules area.
  */
-#define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
+#define KERNEL_IMAGE_SIZE_DEFAULT      (512 * 1024 * 1024)
+#if defined(CONFIG_RANDOMIZE_BASE) && \
+	CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT
+#define KERNEL_IMAGE_SIZE   CONFIG_RANDOMIZE_BASE_MAX_OFFSET
+#else
+#define KERNEL_IMAGE_SIZE      KERNEL_IMAGE_SIZE_DEFAULT
+#endif
 
 #endif /* _ASM_X86_PAGE_64_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 2d88344..c883bf7 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -58,7 +58,7 @@ typedef struct { pteval_t pte; } pte_t;
 #define VMALLOC_START    _AC(0xffffc90000000000, UL)
 #define VMALLOC_END      _AC(0xffffe8ffffffffff, UL)
 #define VMEMMAP_START	 _AC(0xffffea0000000000, UL)
-#define MODULES_VADDR    _AC(0xffffffffa0000000, UL)
+#define MODULES_VADDR    (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 #define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 4287f1f..5bdc543 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -806,6 +806,9 @@ void __init mem_init(void)
 	BUILD_BUG_ON(VMALLOC_START			>= VMALLOC_END);
 #undef high_memory
 #undef __FIXADDR_TOP
+#ifdef CONFIG_RANDOMIZE_BASE
+	BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
+#endif
 
 #ifdef CONFIG_HIGHMEM
 	BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE	> FIXADDR_START);
-- 
cgit v1.1


From 6e6a4932b0f569b1a5bb4fcbf5dde1b1a42f01bb Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Sun, 13 Oct 2013 04:08:56 -0700
Subject: x86, boot: Rename get_flags() and check_flags() to *_cpuflags()

When a function is used in more than one file it may not be possible
to immediately tell from context what the intended meaning is.  As
such, it is more important that the naming be self-evident.  Thus,
change get_flags() to get_cpuflags().

For consistency, change check_flags() to check_cpuflags() even though
it is only used in cpucheck.c.

Link: http://lkml.kernel.org/r/1381450698-28710-2-git-send-email-keescook@chromium.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/compressed/cpuflags.c |  2 +-
 arch/x86/boot/cpucheck.c            | 14 +++++++-------
 arch/x86/boot/cpuflags.c            |  2 +-
 arch/x86/boot/cpuflags.h            |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c
index 931cba6..aa31346 100644
--- a/arch/x86/boot/compressed/cpuflags.c
+++ b/arch/x86/boot/compressed/cpuflags.c
@@ -4,7 +4,7 @@
 
 bool has_cpuflag(int flag)
 {
-	get_flags();
+	get_cpuflags();
 
 	return test_bit(flag, cpu.flags);
 }
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index e1f3c16..100a9a1 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -68,7 +68,7 @@ static int is_transmeta(void)
 }
 
 /* Returns a bitmask of which words we have error bits in */
-static int check_flags(void)
+static int check_cpuflags(void)
 {
 	u32 err;
 	int i;
@@ -101,8 +101,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
 	if (has_eflag(X86_EFLAGS_AC))
 		cpu.level = 4;
 
-	get_flags();
-	err = check_flags();
+	get_cpuflags();
+	err = check_cpuflags();
 
 	if (test_bit(X86_FEATURE_LM, cpu.flags))
 		cpu.level = 64;
@@ -121,8 +121,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
 		eax &= ~(1 << 15);
 		asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
 
-		get_flags();	/* Make sure it really did something */
-		err = check_flags();
+		get_cpuflags();	/* Make sure it really did something */
+		err = check_cpuflags();
 	} else if (err == 0x01 &&
 		   !(err_flags[0] & ~(1 << X86_FEATURE_CX8)) &&
 		   is_centaur() && cpu.model >= 6) {
@@ -137,7 +137,7 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
 		asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
 
 		set_bit(X86_FEATURE_CX8, cpu.flags);
-		err = check_flags();
+		err = check_cpuflags();
 	} else if (err == 0x01 && is_transmeta()) {
 		/* Transmeta might have masked feature bits in word 0 */
 
@@ -152,7 +152,7 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
 		    : : "ecx", "ebx");
 		asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
 
-		err = check_flags();
+		err = check_cpuflags();
 	}
 
 	if (err_flags_ptr)
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index b02544a..a9fcb7c 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -65,7 +65,7 @@ static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d)
 	);
 }
 
-void get_flags(void)
+void get_cpuflags(void)
 {
 	u32 max_intel_level, max_amd_level;
 	u32 tfms;
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
index 9bb4e25..ea97697 100644
--- a/arch/x86/boot/cpuflags.h
+++ b/arch/x86/boot/cpuflags.h
@@ -14,6 +14,6 @@ extern struct cpu_features cpu;
 extern u32 cpu_vendor[3];
 
 int has_eflag(unsigned long mask);
-void get_flags(void);
+void get_cpuflags(void);
 
 #endif
-- 
cgit v1.1


From aec58bafaf89279522c44ec8ca9211eabb2b6976 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 15 Oct 2013 23:43:14 -0700
Subject: x86/relocs: Add percpu fixup for GNU ld 2.23

The GNU linker tries to put __per_cpu_load into the percpu area,
resulting in a lack of its relocation. Force this symbol to be
relocated. Seen starting with GNU ld 2.23 and later.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Michael Davidson <md@google.com>
Cc: Cong Ding <dinggnu@gmail.com>
Link: http://lkml.kernel.org/r/20131016064314.GA2739@www.outflux.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/tools/relocs.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 71a2533..11f9285 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -729,6 +729,7 @@ static void percpu_init(void)
  *
  * The GNU linker incorrectly associates:
  *	__init_begin
+ *	__per_cpu_load
  *
  * The "gold" linker incorrectly associates:
  *	init_per_cpu__irq_stack_union
@@ -738,6 +739,7 @@ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
 {
 	return (sym->st_shndx == per_cpu_shndx) &&
 		strcmp(symname, "__init_begin") &&
+		strcmp(symname, "__per_cpu_load") &&
 		strncmp(symname, "init_per_cpu_", 13);
 }
 
-- 
cgit v1.1


From f4fccac05f7f6bacb8e481a84d175e85ffcf9fe2 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 31 Oct 2013 17:24:59 +0100
Subject: x86/efi: Simplify EFI_DEBUG

... and lose one #ifdef .. #endif sandwich.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/platform/efi/efi.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 1d3372a..f396163 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -51,7 +51,7 @@
 #include <asm/x86_init.h>
 #include <asm/rtc.h>
 
-#define EFI_DEBUG	1
+#define EFI_DEBUG
 
 #define EFI_MIN_RESERVE 5120
 
@@ -398,9 +398,9 @@ int __init efi_memblock_x86_reserve_range(void)
 	return 0;
 }
 
-#if EFI_DEBUG
 static void __init print_efi_memmap(void)
 {
+#ifdef EFI_DEBUG
 	efi_memory_desc_t *md;
 	void *p;
 	int i;
@@ -415,8 +415,8 @@ static void __init print_efi_memmap(void)
 			md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
 			(md->num_pages >> (20 - EFI_PAGE_SHIFT)));
 	}
-}
 #endif  /*  EFI_DEBUG  */
+}
 
 void __init efi_reserve_boot_services(void)
 {
@@ -696,10 +696,7 @@ void __init efi_init(void)
 		x86_platform.set_wallclock = efi_set_rtc_mmss;
 	}
 #endif
-
-#if EFI_DEBUG
 	print_efi_memmap();
-#endif
 }
 
 void __init efi_late_init(void)
-- 
cgit v1.1


From 0fd64c23fdf556e9e68580cff03b3505797bbf53 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 31 Oct 2013 17:25:00 +0100
Subject: x86/mm/pageattr: Lookup address in an arbitrary PGD

This is preparatory work in order to be able to map pages into a
specified PGD and not implicitly and only into init_mm.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/mm/pageattr.c | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index bb32480..c53de62 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -30,6 +30,7 @@
  */
 struct cpa_data {
 	unsigned long	*vaddr;
+	pgd_t		*pgd;
 	pgprot_t	mask_set;
 	pgprot_t	mask_clr;
 	int		numpages;
@@ -322,17 +323,9 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 	return prot;
 }
 
-/*
- * Lookup the page table entry for a virtual address. Return a pointer
- * to the entry and the level of the mapping.
- *
- * Note: We return pud and pmd either when the entry is marked large
- * or when the present bit is not set. Otherwise we would return a
- * pointer to a nonexisting mapping.
- */
-pte_t *lookup_address(unsigned long address, unsigned int *level)
+static pte_t *__lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+				      unsigned int *level)
 {
-	pgd_t *pgd = pgd_offset_k(address);
 	pud_t *pud;
 	pmd_t *pmd;
 
@@ -361,8 +354,31 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
 
 	return pte_offset_kernel(pmd, address);
 }
+
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
+pte_t *lookup_address(unsigned long address, unsigned int *level)
+{
+        return __lookup_address_in_pgd(pgd_offset_k(address), address, level);
+}
 EXPORT_SYMBOL_GPL(lookup_address);
 
+static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
+				  unsigned int *level)
+{
+        if (cpa->pgd)
+		return __lookup_address_in_pgd(cpa->pgd + pgd_index(address),
+					       address, level);
+
+        return lookup_address(address, level);
+}
+
 /*
  * This is necessary because __pa() does not work on some
  * kinds of memory, like vmalloc() or the alloc_remap()
-- 
cgit v1.1


From f3f729661e8db476ac427a97de015307aebb7404 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 31 Oct 2013 17:25:01 +0100
Subject: x86/mm/pageattr: Add a PGD pagetable populating function

This allocates, if necessary, and populates the corresponding PGD entry
with a PUD page. The next population level is a dummy macro which will
be removed by the next patch and it is added here to keep the patch
small and easily reviewable but not break bisection, at the same time.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/mm/pageattr.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index c53de62..4b47ae0 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -666,6 +666,45 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	return 0;
 }
 
+#define populate_pud(cpa, addr, pgd, pgprot)	(-1)
+
+/*
+ * Restrictions for kernel page table do not necessarily apply when mapping in
+ * an alternate PGD.
+ */
+static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
+{
+	pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
+	bool allocd_pgd = false;
+	pgd_t *pgd_entry;
+	pud_t *pud = NULL;	/* shut up gcc */
+	int ret;
+
+	pgd_entry = cpa->pgd + pgd_index(addr);
+
+	/*
+	 * Allocate a PUD page and hand it down for mapping.
+	 */
+	if (pgd_none(*pgd_entry)) {
+		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+		if (!pud)
+			return -1;
+
+		set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
+		allocd_pgd = true;
+	}
+
+	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
+	pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);
+
+	ret = populate_pud(cpa, addr, pgd_entry, pgprot);
+	if (ret < 0)
+		return ret;
+
+	cpa->numpages = ret;
+	return 0;
+}
+
 static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
 			       int primary)
 {
-- 
cgit v1.1


From 4b23538d88c87d9c693ad87c8c808e92a505a6e6 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 31 Oct 2013 17:25:02 +0100
Subject: x86/mm/pageattr: Add a PUD pagetable populating function

Add the next level of the pagetable populating function, we handle
chunks around a 1G boundary by mapping them with the lower level
functions - otherwise we use 1G pages for the mappings, thus using as
less amount of pagetable pages as possible.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/mm/pageattr.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 4b47ae0..81deca7 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -666,7 +666,92 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	return 0;
 }
 
-#define populate_pud(cpa, addr, pgd, pgprot)	(-1)
+static int alloc_pmd_page(pud_t *pud)
+{
+	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+	if (!pmd)
+		return -1;
+
+	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+	return 0;
+}
+
+#define populate_pmd(cpa, start, end, pages, pud, pgprot)	(-1)
+
+static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
+			pgprot_t pgprot)
+{
+	pud_t *pud;
+	unsigned long end;
+	int cur_pages = 0;
+
+	end = start + (cpa->numpages << PAGE_SHIFT);
+
+	/*
+	 * Not on a Gb page boundary? => map everything up to it with
+	 * smaller pages.
+	 */
+	if (start & (PUD_SIZE - 1)) {
+		unsigned long pre_end;
+		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+
+		pre_end   = min_t(unsigned long, end, next_page);
+		cur_pages = (pre_end - start) >> PAGE_SHIFT;
+		cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
+
+		pud = pud_offset(pgd, start);
+
+		/*
+		 * Need a PMD page?
+		 */
+		if (pud_none(*pud))
+			if (alloc_pmd_page(pud))
+				return -1;
+
+		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
+					 pud, pgprot);
+		if (cur_pages < 0)
+			return cur_pages;
+
+		start = pre_end;
+	}
+
+	/* We mapped them all? */
+	if (cpa->numpages == cur_pages)
+		return cur_pages;
+
+	pud = pud_offset(pgd, start);
+
+	/*
+	 * Map everything starting from the Gb boundary, possibly with 1G pages
+	 */
+	while (end - start >= PUD_SIZE) {
+		set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
+
+		start	  += PUD_SIZE;
+		cpa->pfn  += PUD_SIZE;
+		cur_pages += PUD_SIZE >> PAGE_SHIFT;
+		pud++;
+	}
+
+	/* Map trailing leftover */
+	if (start < end) {
+		int tmp;
+
+		pud = pud_offset(pgd, start);
+		if (pud_none(*pud))
+			if (alloc_pmd_page(pud))
+				return -1;
+
+		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
+				   pud, pgprot);
+		if (tmp < 0)
+			return cur_pages;
+
+		cur_pages += tmp;
+	}
+	return cur_pages;
+}
 
 /*
  * Restrictions for kernel page table do not necessarily apply when mapping in
-- 
cgit v1.1


From f900a4b8ab0f462d89a9fcb6173cac1403415b16 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 31 Oct 2013 17:25:03 +0100
Subject: x86/mm/pageattr: Add a PMD pagetable populating function

Handle PMD-level mappings the same as PUD ones.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/mm/pageattr.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 81deca7..968398b 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -666,6 +666,16 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	return 0;
 }
 
+static int alloc_pte_page(pmd_t *pmd)
+{
+	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+	if (!pte)
+		return -1;
+
+	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+	return 0;
+}
+
 static int alloc_pmd_page(pud_t *pud)
 {
 	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
@@ -676,7 +686,77 @@ static int alloc_pmd_page(pud_t *pud)
 	return 0;
 }
 
-#define populate_pmd(cpa, start, end, pages, pud, pgprot)	(-1)
+#define populate_pte(cpa, start, end, pages, pmd, pgprot)	do {} while (0)
+
+static int populate_pmd(struct cpa_data *cpa,
+			unsigned long start, unsigned long end,
+			unsigned num_pages, pud_t *pud, pgprot_t pgprot)
+{
+	unsigned int cur_pages = 0;
+	pmd_t *pmd;
+
+	/*
+	 * Not on a 2M boundary?
+	 */
+	if (start & (PMD_SIZE - 1)) {
+		unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
+		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+
+		pre_end   = min_t(unsigned long, pre_end, next_page);
+		cur_pages = (pre_end - start) >> PAGE_SHIFT;
+		cur_pages = min_t(unsigned int, num_pages, cur_pages);
+
+		/*
+		 * Need a PTE page?
+		 */
+		pmd = pmd_offset(pud, start);
+		if (pmd_none(*pmd))
+			if (alloc_pte_page(pmd))
+				return -1;
+
+		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
+
+		start = pre_end;
+	}
+
+	/*
+	 * We mapped them all?
+	 */
+	if (num_pages == cur_pages)
+		return cur_pages;
+
+	while (end - start >= PMD_SIZE) {
+
+		/*
+		 * We cannot use a 1G page so allocate a PMD page if needed.
+		 */
+		if (pud_none(*pud))
+			if (alloc_pmd_page(pud))
+				return -1;
+
+		pmd = pmd_offset(pud, start);
+
+		set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
+
+		start	  += PMD_SIZE;
+		cpa->pfn  += PMD_SIZE;
+		cur_pages += PMD_SIZE >> PAGE_SHIFT;
+	}
+
+	/*
+	 * Map trailing 4K pages.
+	 */
+	if (start < end) {
+		pmd = pmd_offset(pud, start);
+		if (pmd_none(*pmd))
+			if (alloc_pte_page(pmd))
+				return -1;
+
+		populate_pte(cpa, start, end, num_pages - cur_pages,
+			     pmd, pgprot);
+	}
+	return num_pages;
+}
 
 static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
 			pgprot_t pgprot)
-- 
cgit v1.1


From c6b6f363f7b24aa448994e3a65c4d5b3116acfcc Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 31 Oct 2013 17:25:04 +0100
Subject: x86/mm/pageattr: Add a PTE pagetable populating function

Handle last level by unconditionally writing the PTEs into the PTE page
while paying attention to the NX bit.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/mm/pageattr.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 968398b..2a1308a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -686,7 +686,27 @@ static int alloc_pmd_page(pud_t *pud)
 	return 0;
 }
 
-#define populate_pte(cpa, start, end, pages, pmd, pgprot)	do {} while (0)
+static void populate_pte(struct cpa_data *cpa,
+			 unsigned long start, unsigned long end,
+			 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
+{
+	pte_t *pte;
+
+	pte = pte_offset_kernel(pmd, start);
+
+	while (num_pages-- && start < end) {
+
+		/* deal with the NX bit */
+		if (!(pgprot_val(pgprot) & _PAGE_NX))
+			cpa->pfn &= ~_PAGE_NX;
+
+		set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
+
+		start	 += PAGE_SIZE;
+		cpa->pfn += PAGE_SIZE;
+		pte++;
+	}
+}
 
 static int populate_pmd(struct cpa_data *cpa,
 			unsigned long start, unsigned long end,
-- 
cgit v1.1


From 0bb8aeee7b73b21e09d3ea12f2120d974f70b669 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 31 Oct 2013 17:25:05 +0100
Subject: x86/mm/pageattr: Add a PUD error unwinding path

In case we encounter an error during the mapping of a region, we want to
unwind what we've established so far exactly the way we did the mapping.
This is the PUD part kept deliberately small for easier review.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/mm/pageattr.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 2a1308a..1cbdbbc 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -666,6 +666,51 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	return 0;
 }
 
+#define unmap_pmd_range(pud, start, pre_end)		do {} while (0)
+
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+	pud_t *pud = pud_offset(pgd, start);
+
+	/*
+	 * Not on a GB page boundary?
+	 */
+	if (start & (PUD_SIZE - 1)) {
+		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+		unsigned long pre_end	= min_t(unsigned long, end, next_page);
+
+		unmap_pmd_range(pud, start, pre_end);
+
+		start = pre_end;
+		pud++;
+	}
+
+	/*
+	 * Try to unmap in 1G chunks?
+	 */
+	while (end - start >= PUD_SIZE) {
+
+		if (pud_large(*pud))
+			pud_clear(pud);
+		else
+			unmap_pmd_range(pud, start, start + PUD_SIZE);
+
+		start += PUD_SIZE;
+		pud++;
+	}
+
+	/*
+	 * 2M leftovers?
+	 */
+	if (start < end)
+		unmap_pmd_range(pud, start, end);
+
+	/*
+	 * No need to try to free the PUD page because we'll free it in
+	 * populate_pgd's error path
+	 */
+}
+
 static int alloc_pte_page(pmd_t *pmd)
 {
 	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
@@ -883,9 +928,20 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
 	pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);
 
 	ret = populate_pud(cpa, addr, pgd_entry, pgprot);
-	if (ret < 0)
-		return ret;
+	if (ret < 0) {
+		unmap_pud_range(pgd_entry, addr,
+				addr + (cpa->numpages << PAGE_SHIFT));
 
+		if (allocd_pgd) {
+			/*
+			 * If I allocated this PUD page, I can just as well
+			 * free it in this error path.
+			 */
+			pgd_clear(pgd_entry);
+			free_page((unsigned long)pud);
+		}
+		return ret;
+	}
 	cpa->numpages = ret;
 	return 0;
 }
-- 
cgit v1.1


From 52a628fb454d13d944bb3c8a89a314cc3affa417 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 31 Oct 2013 17:25:06 +0100
Subject: x86/mm/pageattr: Add last levels of error path

We try to free the pagetable pages once we've unmapped our portion.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/mm/pageattr.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1cbdbbc..db8ace2 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -666,7 +666,99 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	return 0;
 }
 
-#define unmap_pmd_range(pud, start, pre_end)		do {} while (0)
+static bool try_to_free_pte_page(pte_t *pte)
+{
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++)
+		if (!pte_none(pte[i]))
+			return false;
+
+	free_page((unsigned long)pte);
+	return true;
+}
+
+static bool try_to_free_pmd_page(pmd_t *pmd)
+{
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++)
+		if (!pmd_none(pmd[i]))
+			return false;
+
+	free_page((unsigned long)pmd);
+	return true;
+}
+
+static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+{
+	pte_t *pte = pte_offset_kernel(pmd, start);
+
+	while (start < end) {
+		set_pte(pte, __pte(0));
+
+		start += PAGE_SIZE;
+		pte++;
+	}
+
+	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+		pmd_clear(pmd);
+		return true;
+	}
+	return false;
+}
+
+static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+			      unsigned long start, unsigned long end)
+{
+	if (unmap_pte_range(pmd, start, end))
+		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+			pud_clear(pud);
+}
+
+static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+{
+	pmd_t *pmd = pmd_offset(pud, start);
+
+	/*
+	 * Not on a 2MB page boundary?
+	 */
+	if (start & (PMD_SIZE - 1)) {
+		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+		unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+		__unmap_pmd_range(pud, pmd, start, pre_end);
+
+		start = pre_end;
+		pmd++;
+	}
+
+	/*
+	 * Try to unmap in 2M chunks.
+	 */
+	while (end - start >= PMD_SIZE) {
+		if (pmd_large(*pmd))
+			pmd_clear(pmd);
+		else
+			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+
+		start += PMD_SIZE;
+		pmd++;
+	}
+
+	/*
+	 * 4K leftovers?
+	 */
+	if (start < end)
+		return __unmap_pmd_range(pud, pmd, start, end);
+
+	/*
+	 * Try again to free the PMD page if haven't succeeded above.
+	 */
+	if (!pud_none(*pud))
+		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+			pud_clear(pud);
+}
 
 static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
 {
-- 
cgit v1.1


From 82f0712ca0f947170e785300b5c39d9c25e2f6ff Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 31 Oct 2013 17:25:07 +0100
Subject: x86/mm/cpa: Map in an arbitrary pgd

Add the ability to map pages in an arbitrary pgd. This wires in the
remaining stuff so that there's a new interface with which you can map a
region into an arbitrary PGD.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/mm/pageattr.c | 53 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index db8ace2..b3b19f4 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -453,7 +453,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 * Check for races, another CPU might have split this page
 	 * up already:
 	 */
-	tmp = lookup_address(address, &level);
+	tmp = _lookup_address_cpa(cpa, address, &level);
 	if (tmp != kpte)
 		goto out_unlock;
 
@@ -559,7 +559,8 @@ out_unlock:
 }
 
 static int
-__split_large_page(pte_t *kpte, unsigned long address, struct page *base)
+__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
+		   struct page *base)
 {
 	pte_t *pbase = (pte_t *)page_address(base);
 	unsigned long pfn, pfninc = 1;
@@ -572,7 +573,7 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)
 	 * Check for races, another CPU might have split this page
 	 * up for us already:
 	 */
-	tmp = lookup_address(address, &level);
+	tmp = _lookup_address_cpa(cpa, address, &level);
 	if (tmp != kpte) {
 		spin_unlock(&pgd_lock);
 		return 1;
@@ -648,7 +649,8 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)
 	return 0;
 }
 
-static int split_large_page(pte_t *kpte, unsigned long address)
+static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
+			    unsigned long address)
 {
 	struct page *base;
 
@@ -660,7 +662,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	if (!base)
 		return -ENOMEM;
 
-	if (__split_large_page(kpte, address, base))
+	if (__split_large_page(cpa, kpte, address, base))
 		__free_page(base);
 
 	return 0;
@@ -1041,6 +1043,9 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
 static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
 			       int primary)
 {
+	if (cpa->pgd)
+		return populate_pgd(cpa, vaddr);
+
 	/*
 	 * Ignore all non primary paths.
 	 */
@@ -1085,7 +1090,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
 	else
 		address = *cpa->vaddr;
 repeat:
-	kpte = lookup_address(address, &level);
+	kpte = _lookup_address_cpa(cpa, address, &level);
 	if (!kpte)
 		return __cpa_process_fault(cpa, address, primary);
 
@@ -1149,7 +1154,7 @@ repeat:
 	/*
 	 * We have to split the large page:
 	 */
-	err = split_large_page(kpte, address);
+	err = split_large_page(cpa, kpte, address);
 	if (!err) {
 		/*
 	 	 * Do a global flush tlb after splitting the large page
@@ -1298,6 +1303,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 	int ret, cache, checkalias;
 	unsigned long baddr = 0;
 
+	memset(&cpa, 0, sizeof(cpa));
+
 	/*
 	 * Check, if we are requested to change a not supported
 	 * feature:
@@ -1744,6 +1751,7 @@ static int __set_pages_p(struct page *page, int numpages)
 {
 	unsigned long tempaddr = (unsigned long) page_address(page);
 	struct cpa_data cpa = { .vaddr = &tempaddr,
+				.pgd = NULL,
 				.numpages = numpages,
 				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
 				.mask_clr = __pgprot(0),
@@ -1762,6 +1770,7 @@ static int __set_pages_np(struct page *page, int numpages)
 {
 	unsigned long tempaddr = (unsigned long) page_address(page);
 	struct cpa_data cpa = { .vaddr = &tempaddr,
+				.pgd = NULL,
 				.numpages = numpages,
 				.mask_set = __pgprot(0),
 				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
@@ -1822,6 +1831,36 @@ bool kernel_page_present(struct page *page)
 
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
+int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
+			    unsigned numpages, unsigned long page_flags)
+{
+	int retval = -EINVAL;
+
+	struct cpa_data cpa = {
+		.vaddr = &address,
+		.pfn = pfn,
+		.pgd = pgd,
+		.numpages = numpages,
+		.mask_set = __pgprot(0),
+		.mask_clr = __pgprot(0),
+		.flags = 0,
+	};
+
+	if (!(__supported_pte_mask & _PAGE_NX))
+		goto out;
+
+	if (!(page_flags & _PAGE_NX))
+		cpa.mask_clr = __pgprot(_PAGE_NX);
+
+	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
+
+	retval = __change_page_attr_set_clr(&cpa, 0);
+	__flush_tlb_all();
+
+out:
+	return retval;
+}
+
 /*
  * The testcases use internal knowledge of the implementation that shouldn't
  * be exposed to the rest of the kernel. Include these directly here.
-- 
cgit v1.1


From d2f7cbe7b26a74dbbbf8f325b2a6fd01bc34032c Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 31 Oct 2013 17:25:08 +0100
Subject: x86/efi: Runtime services virtual mapping

We map the EFI regions needed for runtime services non-contiguously,
with preserved alignment on virtual addresses starting from -4G down
for a total max space of 64G. This way, we provide for stable runtime
services addresses across kernels so that a kexec'd kernel can still use
them.

Thus, they're mapped in a separate pagetable so that we don't pollute
the kernel namespace.

Add an efi= kernel command line parameter for passing miscellaneous
options and chicken bits from the command line.

While at it, add a chicken bit called "efi=old_map" which can be used as
a fallback to the old runtime services mapping method in case there's
some b0rkage with a particular EFI implementation (haha, it is hard to
hold up the sarcasm here...).

Also, add the UEFI RT VA space to Documentation/x86/x86_64/mm.txt.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/include/asm/efi.h           |  64 ++++++++++++++------
 arch/x86/include/asm/pgtable_types.h |   3 +-
 arch/x86/platform/efi/efi.c          |  94 +++++++++++++++++++++---------
 arch/x86/platform/efi/efi_32.c       |   9 ++-
 arch/x86/platform/efi/efi_64.c       | 109 +++++++++++++++++++++++++++++++++++
 arch/x86/platform/efi/efi_stub_64.S  |  54 +++++++++++++++++
 6 files changed, 286 insertions(+), 47 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 65c6e6e..89a05b0 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -1,6 +1,24 @@
 #ifndef _ASM_X86_EFI_H
 #define _ASM_X86_EFI_H
 
+/*
+ * We map the EFI regions needed for runtime services non-contiguously,
+ * with preserved alignment on virtual addresses starting from -4G down
+ * for a total max space of 64G. This way, we provide for stable runtime
+ * services addresses across kernels so that a kexec'd kernel can still
+ * use them.
+ *
+ * This is the main reason why we're doing stable VA mappings for RT
+ * services.
+ *
+ * This flag is used in conjuction with a chicken bit called
+ * "efi=old_map" which can be used as a fallback to the old runtime
+ * services mapping method in case there's some b0rkage with a
+ * particular EFI implementation (haha, it is hard to hold up the
+ * sarcasm here...).
+ */
+#define EFI_OLD_MEMMAP		EFI_ARCH_1
+
 #ifdef CONFIG_X86_32
 
 #define EFI_LOADER_SIGNATURE	"EL32"
@@ -69,24 +87,31 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
 	efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3),		\
 		  (u64)(a4), (u64)(a5), (u64)(a6))
 
+#define _efi_call_virtX(x, f, ...)					\
+({									\
+	efi_status_t __s;						\
+									\
+	efi_sync_low_kernel_mappings();					\
+	preempt_disable();						\
+	__s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__);	\
+	preempt_enable();						\
+	__s;								\
+})
+
 #define efi_call_virt0(f)				\
-	efi_call0((efi.systab->runtime->f))
-#define efi_call_virt1(f, a1)					\
-	efi_call1((efi.systab->runtime->f), (u64)(a1))
-#define efi_call_virt2(f, a1, a2)					\
-	efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2))
-#define efi_call_virt3(f, a1, a2, a3)					\
-	efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
-		  (u64)(a3))
-#define efi_call_virt4(f, a1, a2, a3, a4)				\
-	efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
-		  (u64)(a3), (u64)(a4))
-#define efi_call_virt5(f, a1, a2, a3, a4, a5)				\
-	efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
-		  (u64)(a3), (u64)(a4), (u64)(a5))
-#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)			\
-	efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
-		  (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
+	_efi_call_virtX(0, f)
+#define efi_call_virt1(f, a1)				\
+	_efi_call_virtX(1, f, (u64)(a1))
+#define efi_call_virt2(f, a1, a2)			\
+	_efi_call_virtX(2, f, (u64)(a1), (u64)(a2))
+#define efi_call_virt3(f, a1, a2, a3)			\
+	_efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3))
+#define efi_call_virt4(f, a1, a2, a3, a4)		\
+	_efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4))
+#define efi_call_virt5(f, a1, a2, a3, a4, a5)		\
+	_efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5))
+#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)	\
+	_efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
 
 extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
 				 u32 type, u64 attribute);
@@ -95,12 +120,17 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
 
 extern int add_efi_memmap;
 extern unsigned long x86_efi_facility;
+extern struct efi_scratch efi_scratch;
 extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
 extern int efi_memblock_x86_reserve_range(void);
 extern void efi_call_phys_prelog(void);
 extern void efi_call_phys_epilog(void);
 extern void efi_unmap_memmap(void);
 extern void efi_memory_uc(u64 addr, unsigned long size);
+extern void __init efi_map_region(efi_memory_desc_t *md);
+extern void efi_sync_low_kernel_mappings(void);
+extern void efi_setup_page_tables(void);
+extern void __init old_map_region(efi_memory_desc_t *md);
 
 #ifdef CONFIG_EFI
 
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f4843e0..028e28b 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -379,7 +379,8 @@ static inline void update_page_count(int level, unsigned long pages) { }
  */
 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
 extern phys_addr_t slow_virt_to_phys(void *__address);
-
+extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
+				   unsigned numpages, unsigned long page_flags);
 #endif	/* !__ASSEMBLY__ */
 
 #endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index f396163..b453069 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -12,6 +12,8 @@
  *	Bibo Mao <bibo.mao@intel.com>
  *	Chandramouli Narayanan <mouli@linux.intel.com>
  *	Huang Ying <ying.huang@intel.com>
+ * Copyright (C) 2013 SuSE Labs
+ *	Borislav Petkov <bp@suse.de> - runtime services VA mapping
  *
  * Copied from efi_32.c to eliminate the duplicated code between EFI
  * 32/64 support code. --ying 2007-10-26
@@ -745,21 +747,56 @@ void efi_memory_uc(u64 addr, unsigned long size)
 	set_memory_uc(addr, npages);
 }
 
+void __init old_map_region(efi_memory_desc_t *md)
+{
+	u64 start_pfn, end_pfn, end;
+	unsigned long size;
+	void *va;
+
+	start_pfn = PFN_DOWN(md->phys_addr);
+	size	  = md->num_pages << PAGE_SHIFT;
+	end	  = md->phys_addr + size;
+	end_pfn   = PFN_UP(end);
+
+	if (pfn_range_is_mapped(start_pfn, end_pfn)) {
+		va = __va(md->phys_addr);
+
+		if (!(md->attribute & EFI_MEMORY_WB))
+			efi_memory_uc((u64)(unsigned long)va, size);
+	} else
+		va = efi_ioremap(md->phys_addr, size,
+				 md->type, md->attribute);
+
+	md->virt_addr = (u64) (unsigned long) va;
+	if (!va)
+		pr_err("ioremap of 0x%llX failed!\n",
+		       (unsigned long long)md->phys_addr);
+}
+
 /*
  * This function will switch the EFI runtime services to virtual mode.
- * Essentially, look through the EFI memmap and map every region that
- * has the runtime attribute bit set in its memory descriptor and update
- * that memory descriptor with the virtual address obtained from ioremap().
- * This enables the runtime services to be called without having to
+ * Essentially, we look through the EFI memmap and map every region that
+ * has the runtime attribute bit set in its memory descriptor into the
+ * ->trampoline_pgd page table using a top-down VA allocation scheme.
+ *
+ * The old method which used to update that memory descriptor with the
+ * virtual address obtained from ioremap() is still supported when the
+ * kernel is booted with efi=old_map on its command line. Same old
+ * method enabled the runtime services to be called without having to
  * thunk back into physical mode for every invocation.
+ *
+ * The new method does a pagetable switch in a preemption-safe manner
+ * so that we're in a different address space when calling a runtime
+ * function. For function arguments passing we do copy the PGDs of the
+ * kernel page table into ->trampoline_pgd prior to each call.
  */
 void __init efi_enter_virtual_mode(void)
 {
 	efi_memory_desc_t *md, *prev_md = NULL;
-	efi_status_t status;
+	void *p, *new_memmap = NULL;
 	unsigned long size;
-	u64 end, systab, start_pfn, end_pfn;
-	void *p, *va, *new_memmap = NULL;
+	efi_status_t status;
+	u64 end, systab;
 	int count = 0;
 
 	efi.systab = NULL;
@@ -768,7 +805,6 @@ void __init efi_enter_virtual_mode(void)
 	 * We don't do virtual mode, since we don't do runtime services, on
 	 * non-native EFI
 	 */
-
 	if (!efi_is_native()) {
 		efi_unmap_memmap();
 		return;
@@ -799,6 +835,7 @@ void __init efi_enter_virtual_mode(void)
 			continue;
 		}
 		prev_md = md;
+
 	}
 
 	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
@@ -808,33 +845,18 @@ void __init efi_enter_virtual_mode(void)
 		    md->type != EFI_BOOT_SERVICES_DATA)
 			continue;
 
+		efi_map_region(md);
+
 		size = md->num_pages << EFI_PAGE_SHIFT;
 		end = md->phys_addr + size;
 
-		start_pfn = PFN_DOWN(md->phys_addr);
-		end_pfn = PFN_UP(end);
-		if (pfn_range_is_mapped(start_pfn, end_pfn)) {
-			va = __va(md->phys_addr);
-
-			if (!(md->attribute & EFI_MEMORY_WB))
-				efi_memory_uc((u64)(unsigned long)va, size);
-		} else
-			va = efi_ioremap(md->phys_addr, size,
-					 md->type, md->attribute);
-
-		md->virt_addr = (u64) (unsigned long) va;
-
-		if (!va) {
-			pr_err("ioremap of 0x%llX failed!\n",
-			       (unsigned long long)md->phys_addr);
-			continue;
-		}
-
 		systab = (u64) (unsigned long) efi_phys.systab;
 		if (md->phys_addr <= systab && systab < end) {
 			systab += md->virt_addr - md->phys_addr;
+
 			efi.systab = (efi_system_table_t *) (unsigned long) systab;
 		}
+
 		new_memmap = krealloc(new_memmap,
 				      (count + 1) * memmap.desc_size,
 				      GFP_KERNEL);
@@ -845,6 +867,9 @@ void __init efi_enter_virtual_mode(void)
 
 	BUG_ON(!efi.systab);
 
+	efi_setup_page_tables();
+	efi_sync_low_kernel_mappings();
+
 	status = phys_efi_set_virtual_address_map(
 		memmap.desc_size * count,
 		memmap.desc_size,
@@ -877,7 +902,8 @@ void __init efi_enter_virtual_mode(void)
 	efi.query_variable_info = virt_efi_query_variable_info;
 	efi.update_capsule = virt_efi_update_capsule;
 	efi.query_capsule_caps = virt_efi_query_capsule_caps;
-	if (__supported_pte_mask & _PAGE_NX)
+
+	if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX))
 		runtime_code_page_mkexec();
 
 	kfree(new_memmap);
@@ -1007,3 +1033,15 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
 	return EFI_SUCCESS;
 }
 EXPORT_SYMBOL_GPL(efi_query_variable_store);
+
+static int __init parse_efi_cmdline(char *str)
+{
+	if (*str == '=')
+		str++;
+
+	if (!strncmp(str, "old_map", 7))
+		set_bit(EFI_OLD_MEMMAP, &x86_efi_facility);
+
+	return 0;
+}
+early_param("efi", parse_efi_cmdline);
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 40e4469..e94557c 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -37,9 +37,16 @@
  * claim EFI runtime service handler exclusively and to duplicate a memory in
  * low memory space say 0 - 3G.
  */
-
 static unsigned long efi_rt_eflags;
 
+void efi_sync_low_kernel_mappings(void) {}
+void efi_setup_page_tables(void) {}
+
+void __init efi_map_region(efi_memory_desc_t *md)
+{
+	old_map_region(md);
+}
+
 void efi_call_phys_prelog(void)
 {
 	struct desc_ptr gdt_descr;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 39a0e7f..bf286c3 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -38,10 +38,28 @@
 #include <asm/efi.h>
 #include <asm/cacheflush.h>
 #include <asm/fixmap.h>
+#include <asm/realmode.h>
 
 static pgd_t *save_pgd __initdata;
 static unsigned long efi_flags __initdata;
 
+/*
+ * We allocate runtime services regions bottom-up, starting from -4G, i.e.
+ * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
+ */
+static u64 efi_va	= -4 * (1UL << 30);
+#define EFI_VA_END	(-68 * (1UL << 30))
+
+/*
+ * Scratch space used for switching the pagetable in the EFI stub
+ */
+struct efi_scratch {
+	u64 r15;
+	u64 prev_cr3;
+	pgd_t *efi_pgt;
+	bool use_pgd;
+};
+
 static void __init early_code_mapping_set_exec(int executable)
 {
 	efi_memory_desc_t *md;
@@ -65,6 +83,9 @@ void __init efi_call_phys_prelog(void)
 	int pgd;
 	int n_pgds;
 
+	if (!efi_enabled(EFI_OLD_MEMMAP))
+		return;
+
 	early_code_mapping_set_exec(1);
 	local_irq_save(efi_flags);
 
@@ -86,6 +107,10 @@ void __init efi_call_phys_epilog(void)
 	 */
 	int pgd;
 	int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
+
+	if (!efi_enabled(EFI_OLD_MEMMAP))
+		return;
+
 	for (pgd = 0; pgd < n_pgds; pgd++)
 		set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
 	kfree(save_pgd);
@@ -94,6 +119,90 @@ void __init efi_call_phys_epilog(void)
 	early_code_mapping_set_exec(0);
 }
 
+/*
+ * Add low kernel mappings for passing arguments to EFI functions.
+ */
+void efi_sync_low_kernel_mappings(void)
+{
+	unsigned num_pgds;
+	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
+
+	if (efi_enabled(EFI_OLD_MEMMAP))
+		return;
+
+	num_pgds = pgd_index(MODULES_END - 1) - pgd_index(PAGE_OFFSET);
+
+	memcpy(pgd + pgd_index(PAGE_OFFSET),
+		init_mm.pgd + pgd_index(PAGE_OFFSET),
+		sizeof(pgd_t) * num_pgds);
+}
+
+void efi_setup_page_tables(void)
+{
+	efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
+
+	if (!efi_enabled(EFI_OLD_MEMMAP))
+		efi_scratch.use_pgd = true;
+}
+
+static void __init __map_region(efi_memory_desc_t *md, u64 va)
+{
+	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
+	unsigned long pf = 0, size;
+	u64 end;
+
+	if (!(md->attribute & EFI_MEMORY_WB))
+		pf |= _PAGE_PCD;
+
+	size = md->num_pages << PAGE_SHIFT;
+	end  = va + size;
+
+	if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf))
+		pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
+			   md->phys_addr, va);
+}
+
+void __init efi_map_region(efi_memory_desc_t *md)
+{
+	unsigned long size = md->num_pages << PAGE_SHIFT;
+	u64 pa = md->phys_addr;
+
+	if (efi_enabled(EFI_OLD_MEMMAP))
+		return old_map_region(md);
+
+	/*
+	 * Make sure the 1:1 mappings are present as a catch-all for b0rked
+	 * firmware which doesn't update all internal pointers after switching
+	 * to virtual mode and would otherwise crap on us.
+	 */
+	__map_region(md, md->phys_addr);
+
+	efi_va -= size;
+
+	/* Is PA 2M-aligned? */
+	if (!(pa & (PMD_SIZE - 1))) {
+		efi_va &= PMD_MASK;
+	} else {
+		u64 pa_offset = pa & (PMD_SIZE - 1);
+		u64 prev_va = efi_va;
+
+		/* get us the same offset within this 2M page */
+		efi_va = (efi_va & PMD_MASK) + pa_offset;
+
+		if (efi_va > prev_va)
+			efi_va -= PMD_SIZE;
+	}
+
+	if (efi_va < EFI_VA_END) {
+		pr_warn(FW_WARN "VA address range overflow!\n");
+		return;
+	}
+
+	/* Do the VA map */
+	__map_region(md, efi_va);
+	md->virt_addr = efi_va;
+}
+
 void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
 				 u32 type, u64 attribute)
 {
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 4c07cca..88073b1 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -34,10 +34,47 @@
 	mov %rsi, %cr0;			\
 	mov (%rsp), %rsp
 
+	/* stolen from gcc */
+	.macro FLUSH_TLB_ALL
+	movq %r15, efi_scratch(%rip)
+	movq %r14, efi_scratch+8(%rip)
+	movq %cr4, %r15
+	movq %r15, %r14
+	andb $0x7f, %r14b
+	movq %r14, %cr4
+	movq %r15, %cr4
+	movq efi_scratch+8(%rip), %r14
+	movq efi_scratch(%rip), %r15
+	.endm
+
+	.macro SWITCH_PGT
+	cmpb $0, efi_scratch+24(%rip)
+	je 1f
+	movq %r15, efi_scratch(%rip)		# r15
+	# save previous CR3
+	movq %cr3, %r15
+	movq %r15, efi_scratch+8(%rip)		# prev_cr3
+	movq efi_scratch+16(%rip), %r15		# EFI pgt
+	movq %r15, %cr3
+	1:
+	.endm
+
+	.macro RESTORE_PGT
+	cmpb $0, efi_scratch+24(%rip)
+	je 2f
+	movq efi_scratch+8(%rip), %r15
+	movq %r15, %cr3
+	movq efi_scratch(%rip), %r15
+	FLUSH_TLB_ALL
+	2:
+	.endm
+
 ENTRY(efi_call0)
 	SAVE_XMM
 	subq $32, %rsp
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $32, %rsp
 	RESTORE_XMM
 	ret
@@ -47,7 +84,9 @@ ENTRY(efi_call1)
 	SAVE_XMM
 	subq $32, %rsp
 	mov  %rsi, %rcx
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $32, %rsp
 	RESTORE_XMM
 	ret
@@ -57,7 +96,9 @@ ENTRY(efi_call2)
 	SAVE_XMM
 	subq $32, %rsp
 	mov  %rsi, %rcx
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $32, %rsp
 	RESTORE_XMM
 	ret
@@ -68,7 +109,9 @@ ENTRY(efi_call3)
 	subq $32, %rsp
 	mov  %rcx, %r8
 	mov  %rsi, %rcx
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $32, %rsp
 	RESTORE_XMM
 	ret
@@ -80,7 +123,9 @@ ENTRY(efi_call4)
 	mov %r8, %r9
 	mov %rcx, %r8
 	mov %rsi, %rcx
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $32, %rsp
 	RESTORE_XMM
 	ret
@@ -93,7 +138,9 @@ ENTRY(efi_call5)
 	mov %r8, %r9
 	mov %rcx, %r8
 	mov %rsi, %rcx
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $48, %rsp
 	RESTORE_XMM
 	ret
@@ -109,8 +156,15 @@ ENTRY(efi_call6)
 	mov %r8, %r9
 	mov %rcx, %r8
 	mov %rsi, %rcx
+	SWITCH_PGT
 	call *%rdi
+	RESTORE_PGT
 	addq $48, %rsp
 	RESTORE_XMM
 	ret
 ENDPROC(efi_call6)
+
+	.data
+ENTRY(efi_scratch)
+	.fill 3,8,0
+	.byte 0
-- 
cgit v1.1


From ee41143027706d9f342dfe05487a00b20887fde7 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 31 Oct 2013 17:25:09 +0100
Subject: x86/efi: Check krealloc return value

Check it just in case. We might just as well panic there because runtime
won't be functioning anyway.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/platform/efi/efi.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index b453069..3fac4de 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -860,6 +860,9 @@ void __init efi_enter_virtual_mode(void)
 		new_memmap = krealloc(new_memmap,
 				      (count + 1) * memmap.desc_size,
 				      GFP_KERNEL);
+		if (!new_memmap)
+			goto err_out;
+
 		memcpy(new_memmap + (count * memmap.desc_size), md,
 		       memmap.desc_size);
 		count++;
@@ -914,6 +917,11 @@ void __init efi_enter_virtual_mode(void)
 			 EFI_VARIABLE_BOOTSERVICE_ACCESS |
 			 EFI_VARIABLE_RUNTIME_ACCESS,
 			 0, NULL);
+
+	return;
+
+ err_out:
+	pr_err("Error reallocating memory, EFI runtime non-functional!\n");
 }
 
 /*
-- 
cgit v1.1


From a653f3563c51c7bb7de63d607bef09d3baddaeb8 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 11 Nov 2013 14:28:39 -0800
Subject: x86, kaslr: Mix entropy sources together as needed

Depending on availability, mix the RDRAND and RDTSC entropy together with
XOR. Only when neither is available should the i8254 be used. Update
the Kconfig documentation to reflect this. Additionally, since bits
used for entropy is masked elsewhere, drop the needless masking in
the get_random_long(). Similarly, use the entire TSC, not just the low
32 bits.

Finally, to improve the starting entropy, do a simple hashing of a
build-time versions string and the boot-time boot_params structure for
some additional level of unpredictability.

Signed-off-by: Kees Cook <keescook@chromium.org>
Link: http://lkml.kernel.org/r/20131111222839.GA28616@www.outflux.net
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig                | 14 +++++---
 arch/x86/boot/compressed/aslr.c | 73 +++++++++++++++++++++++++++++++----------
 2 files changed, 65 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 51f4399..596cd9e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1735,13 +1735,17 @@ config RANDOMIZE_BASE
 	   deters exploit attempts relying on knowledge of the location
 	   of kernel internals.
 
-	   Entropy is generated using the RDRAND instruction if it
-	   is supported.  If not, then RDTSC is used, if supported. If
-	   neither RDRAND nor RDTSC are supported, then no randomness
-	   is introduced.
+	   Entropy is generated using the RDRAND instruction if it is
+	   supported. If RDTSC is supported, it is used as well. If
+	   neither RDRAND nor RDTSC are supported, then randomness is
+	   read from the i8254 timer.
 
 	   The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET,
-	   and aligned according to PHYSICAL_ALIGN.
+	   and aligned according to PHYSICAL_ALIGN. Since the kernel is
+	   built using 2GiB addressing, and PHYSICAL_ALGIN must be at a
+	   minimum of 2MiB, only 10 bits of entropy is theoretically
+	   possible. At best, due to page table layouts, 64-bit can use
+	   9 bits of entropy and 32-bit uses 8 bits.
 
 config RANDOMIZE_BASE_MAX_OFFSET
 	hex "Maximum ASLR offset allowed"
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 0595798..8746487 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -5,6 +5,17 @@
 #include <asm/archrandom.h>
 #include <asm/e820.h>
 
+#include <generated/compile.h>
+#include <linux/module.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+#include <generated/utsrelease.h>
+#include <linux/version.h>
+
+/* Simplified build-specific string for starting entropy. */
+static const char *build_str = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
+		LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
+
 #define I8254_PORT_CONTROL	0x43
 #define I8254_PORT_COUNTER0	0x40
 #define I8254_CMD_READBACK	0xC0
@@ -25,34 +36,62 @@ static inline u16 i8254(void)
 	return timer;
 }
 
+static unsigned long rotate_xor(unsigned long hash, const void *area,
+				size_t size)
+{
+	size_t i;
+	unsigned long *ptr = (unsigned long *)area;
+
+	for (i = 0; i < size / sizeof(hash); i++) {
+		/* Rotate by odd number of bits and XOR. */
+		hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
+		hash ^= ptr[i];
+	}
+
+	return hash;
+}
+
+/* Attempt to create a simple but unpredictable starting entropy. */
+static unsigned long get_random_boot(void)
+{
+	unsigned long hash = 0;
+
+	hash = rotate_xor(hash, build_str, sizeof(build_str));
+	hash = rotate_xor(hash, real_mode, sizeof(*real_mode));
+
+	return hash;
+}
+
 static unsigned long get_random_long(void)
 {
-	unsigned long random;
+	unsigned long raw, random = get_random_boot();
+	bool use_i8254 = true;
+
+	debug_putstr("KASLR using");
 
 	if (has_cpuflag(X86_FEATURE_RDRAND)) {
-		debug_putstr("KASLR using RDRAND...\n");
-		if (rdrand_long(&random))
-			return random;
+		debug_putstr(" RDRAND");
+		if (rdrand_long(&raw)) {
+			random ^= raw;
+			use_i8254 = false;
+		}
 	}
 
 	if (has_cpuflag(X86_FEATURE_TSC)) {
-		uint32_t raw;
+		debug_putstr(" RDTSC");
+		rdtscll(raw);
 
-		debug_putstr("KASLR using RDTSC...\n");
-		rdtscl(raw);
+		random ^= raw;
+		use_i8254 = false;
+	}
 
-		/* Only use the low bits of rdtsc. */
-		random = raw & 0xffff;
-	} else {
-		debug_putstr("KASLR using i8254...\n");
-		random = i8254();
+	if (use_i8254) {
+		debug_putstr(" i8254");
+		random ^= i8254();
 	}
 
-	/* Extend timer bits poorly... */
-	random |= (random << 16);
-#ifdef CONFIG_X86_64
-	random |= (random << 32);
-#endif
+	debug_putstr("...\n");
+
 	return random;
 }
 
-- 
cgit v1.1


From e8236c4d9338d52d0f2fcecc0b792ac0542e4ee9 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 Nov 2013 22:45:20 -0800
Subject: x86, kaslr: Add a circular multiply for better bit diffusion

If we don't have RDRAND (in which case nothing else *should* matter),
most sources have a highly biased entropy distribution.  Use a
circular multiply to diffuse the entropic bits.  A circular multiply
is a good operation for this: it is cheap on standard hardware and
because it is symmetric (unlike an ordinary multiply) it doesn't
introduce its own bias.

Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/20131111222839.GA28616@www.outflux.net
---
 arch/x86/boot/compressed/aslr.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 8746487..38a07cc 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -64,6 +64,11 @@ static unsigned long get_random_boot(void)
 
 static unsigned long get_random_long(void)
 {
+#ifdef CONFIG_X86_64
+	const unsigned long mix_const = 0x5d6008cbf3848dd3UL;
+#else
+	const unsigned long mix_const = 0x3f39e593UL;
+#endif
 	unsigned long raw, random = get_random_boot();
 	bool use_i8254 = true;
 
@@ -90,6 +95,12 @@ static unsigned long get_random_long(void)
 		random ^= i8254();
 	}
 
+	/* Circular multiply for better bit diffusion */
+	asm("mul %3"
+	    : "=a" (random), "=d" (raw)
+	    : "a" (random), "rm" (mix_const));
+	random += raw;
+
 	debug_putstr("...\n");
 
 	return random;
-- 
cgit v1.1


From 327f7d72454aecdc7a4a1c847a291a3f224b730f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 12 Nov 2013 08:56:07 -0800
Subject: x86, kaslr: Use char array to gain sizeof sanity

The build_str needs to be char [] not char * for the sizeof() to report
the string length.

Reported-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: http://lkml.kernel.org/r/20131112165607.GA5921@www.outflux.net
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/aslr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 38a07cc..84be175 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -13,7 +13,7 @@
 #include <linux/version.h>
 
 /* Simplified build-specific string for starting entropy. */
-static const char *build_str = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
+static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
 		LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
 
 #define I8254_PORT_CONTROL	0x43
-- 
cgit v1.1


From f4cb1cc18f364d761d5614eb6293cccc6647f259 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Sat, 16 Nov 2013 12:37:01 -0800
Subject: x86-64, copy_user: Remove zero byte check before copy user buffer.

Operation of rep movsb instruction handles zero byte copy. As pointed out by
Linus, there is no need to check zero size in kernel. Removing this redundant
check saves a few cycles in copy user functions.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Link: http://lkml.kernel.org/r/1384634221-6006-1-git-send-email-fenghua.yu@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/lib/copy_user_64.S | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index a30ca15..ffe4eb9 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -236,8 +236,6 @@ ENDPROC(copy_user_generic_unrolled)
 ENTRY(copy_user_generic_string)
 	CFI_STARTPROC
 	ASM_STAC
-	andl %edx,%edx
-	jz 4f
 	cmpl $8,%edx
 	jb 2f		/* less than 8 bytes, go to byte copy loop */
 	ALIGN_DESTINATION
@@ -249,7 +247,7 @@ ENTRY(copy_user_generic_string)
 2:	movl %edx,%ecx
 3:	rep
 	movsb
-4:	xorl %eax,%eax
+	xorl %eax,%eax
 	ASM_CLAC
 	ret
 
@@ -279,12 +277,10 @@ ENDPROC(copy_user_generic_string)
 ENTRY(copy_user_enhanced_fast_string)
 	CFI_STARTPROC
 	ASM_STAC
-	andl %edx,%edx
-	jz 2f
 	movl %edx,%ecx
 1:	rep
 	movsb
-2:	xorl %eax,%eax
+	xorl %eax,%eax
 	ASM_CLAC
 	ret
 
-- 
cgit v1.1


From 5305ca10e7f44333e46c1ce57fb87306cb437891 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Fri, 15 Nov 2013 14:14:00 -0800
Subject: x86/mm: Unify pte_to_pgoff() and pgoff_to_pte() helpers

Use unified pte_bitop() helper to manipulate bits in pte/pgoff
bitfields and convert pte_to_pgoff()/pgoff_to_pte() to inlines.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable-2level.h | 100 ++++++++++++++++++++--------------
 1 file changed, 59 insertions(+), 41 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 3bf2dd0..0d193e2 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -55,6 +55,13 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
 #endif
 
+/* Bit manipulation helper on pte/pgoff entry */
+static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift,
+				      unsigned long mask, unsigned int leftshift)
+{
+	return ((value >> rightshift) & mask) << leftshift;
+}
+
 #ifdef CONFIG_MEM_SOFT_DIRTY
 
 /*
@@ -71,31 +78,34 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
 #define PTE_FILE_BITS2		(PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
 #define PTE_FILE_BITS3		(PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1)
 
-#define pte_to_pgoff(pte)						\
-	((((pte).pte_low >> (PTE_FILE_SHIFT1))				\
-	  & ((1U << PTE_FILE_BITS1) - 1)))				\
-	+ ((((pte).pte_low >> (PTE_FILE_SHIFT2))			\
-	    & ((1U << PTE_FILE_BITS2) - 1))				\
-	   << (PTE_FILE_BITS1))						\
-	+ ((((pte).pte_low >> (PTE_FILE_SHIFT3))			\
-	    & ((1U << PTE_FILE_BITS3) - 1))				\
-	   << (PTE_FILE_BITS1 + PTE_FILE_BITS2))			\
-	+ ((((pte).pte_low >> (PTE_FILE_SHIFT4)))			\
-	    << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))
-
-#define pgoff_to_pte(off)						\
-	((pte_t) { .pte_low =						\
-	 ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1)	\
-	 + ((((off) >> PTE_FILE_BITS1)					\
-	     & ((1U << PTE_FILE_BITS2) - 1))				\
-	    << PTE_FILE_SHIFT2)						\
-	 + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2))		\
-	     & ((1U << PTE_FILE_BITS3) - 1))				\
-	    << PTE_FILE_SHIFT3)						\
-	 + ((((off) >>							\
-	      (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)))	\
-	    << PTE_FILE_SHIFT4)						\
-	 + _PAGE_FILE })
+#define PTE_FILE_MASK1		((1U << PTE_FILE_BITS1) - 1)
+#define PTE_FILE_MASK2		((1U << PTE_FILE_BITS2) - 1)
+#define PTE_FILE_MASK3		((1U << PTE_FILE_BITS3) - 1)
+
+#define PTE_FILE_LSHIFT2	(PTE_FILE_BITS1)
+#define PTE_FILE_LSHIFT3	(PTE_FILE_BITS1 + PTE_FILE_BITS2)
+#define PTE_FILE_LSHIFT4	(PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)
+
+static __always_inline pgoff_t pte_to_pgoff(pte_t pte)
+{
+	return (pgoff_t)
+		(pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1,  0)		    +
+		 pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2,  PTE_FILE_LSHIFT2) +
+		 pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3,  PTE_FILE_LSHIFT3) +
+		 pte_bitop(pte.pte_low, PTE_FILE_SHIFT4,           -1UL,  PTE_FILE_LSHIFT4));
+}
+
+static __always_inline pte_t pgoff_to_pte(pgoff_t off)
+{
+	return (pte_t){
+		.pte_low =
+			pte_bitop(off,                0, PTE_FILE_MASK1,  PTE_FILE_SHIFT1) +
+			pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2,  PTE_FILE_SHIFT2) +
+			pte_bitop(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3,  PTE_FILE_SHIFT3) +
+			pte_bitop(off, PTE_FILE_LSHIFT4,           -1UL,  PTE_FILE_SHIFT4) +
+			_PAGE_FILE,
+	};
+}
 
 #else /* CONFIG_MEM_SOFT_DIRTY */
 
@@ -115,22 +125,30 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
 #define PTE_FILE_BITS1		(PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
 #define PTE_FILE_BITS2		(PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
 
-#define pte_to_pgoff(pte)						\
-	((((pte).pte_low >> PTE_FILE_SHIFT1)				\
-	  & ((1U << PTE_FILE_BITS1) - 1))				\
-	 + ((((pte).pte_low >> PTE_FILE_SHIFT2)				\
-	     & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1)		\
-	 + (((pte).pte_low >> PTE_FILE_SHIFT3)				\
-	    << (PTE_FILE_BITS1 + PTE_FILE_BITS2)))
-
-#define pgoff_to_pte(off)						\
-	((pte_t) { .pte_low =						\
-	 (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1)	\
-	 + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1))	\
-	    << PTE_FILE_SHIFT2)						\
-	 + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2))		\
-	    << PTE_FILE_SHIFT3)						\
-	 + _PAGE_FILE })
+#define PTE_FILE_MASK1		((1U << PTE_FILE_BITS1) - 1)
+#define PTE_FILE_MASK2		((1U << PTE_FILE_BITS2) - 1)
+
+#define PTE_FILE_LSHIFT2	(PTE_FILE_BITS1)
+#define PTE_FILE_LSHIFT3	(PTE_FILE_BITS1 + PTE_FILE_BITS2)
+
+static __always_inline pgoff_t pte_to_pgoff(pte_t pte)
+{
+	return (pgoff_t)
+		(pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1,  0)		    +
+		 pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2,  PTE_FILE_LSHIFT2) +
+		 pte_bitop(pte.pte_low, PTE_FILE_SHIFT3,           -1UL,  PTE_FILE_LSHIFT3));
+}
+
+static __always_inline pte_t pgoff_to_pte(pgoff_t off)
+{
+	return (pte_t){
+		.pte_low =
+			pte_bitop(off,                0, PTE_FILE_MASK1,  PTE_FILE_SHIFT1) +
+			pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2,  PTE_FILE_SHIFT2) +
+			pte_bitop(off, PTE_FILE_LSHIFT3,           -1UL,  PTE_FILE_SHIFT3) +
+			_PAGE_FILE,
+	};
+}
 
 #endif /* CONFIG_MEM_SOFT_DIRTY */
 
-- 
cgit v1.1


From fd8526ad14c182605e42b64646344b95befd9f94 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 19 Nov 2013 15:17:50 +0200
Subject: x86/mm: Implement ASLR for hugetlb mappings

Matthew noticed that hugetlb mappings don't participate in ASLR on x86-64:

  %  for i in `seq 3`; do
  > tools/testing/selftests/vm/map_hugetlb | grep address
  > done
  Returned address is 0x2aaaaac00000
  Returned address is 0x2aaaaac00000
  Returned address is 0x2aaaaac00000

/proc/PID/maps entries for the mapping are always the same
(except inode number):

  2aaaaac00000-2aaabac00000 rw-p 00000000 00:0c 8200              /anon_hugepage (deleted)
  2aaaaac00000-2aaabac00000 rw-p 00000000 00:0c 256               /anon_hugepage (deleted)
  2aaaaac00000-2aaabac00000 rw-p 00000000 00:0c 7180              /anon_hugepage (deleted)

The reason is the generic hugetlb_get_unmapped_area() function
which is used on x86-64.  It doesn't support randomization and
use bottom-up unmapped area lookup, instead of usual top-down
on x86-64.

x86 has arch-specific hugetlb_get_unmapped_area(), but it's used
only on x86-32.

Let's use arch-specific hugetlb_get_unmapped_area() on x86-64
too. That adds ASLR and switches hugetlb mappings to use top-down
unmapped area lookup:

  %  for i in `seq 3`; do
  > tools/testing/selftests/vm/map_hugetlb | grep address
  > done
  Returned address is 0x7f4f08a00000
  Returned address is 0x7fdda4200000
  Returned address is 0x7febe0000000

/proc/PID/maps entries:

  7f4f08a00000-7f4f18a00000 rw-p 00000000 00:0c 1168              /anon_hugepage (deleted)
  7fdda4200000-7fddb4200000 rw-p 00000000 00:0c 7092              /anon_hugepage (deleted)
  7febe0000000-7febf0000000 rw-p 00000000 00:0c 7183              /anon_hugepage (deleted)

Unmapped area lookup policy for hugetlb mappings is consistent
with normal mappings now -- the only difference is alignment
requirements for huge pages.

libhugetlbfs test-suite didn't detect any regressions with the
patch applied (although it shows few failures on my machine
regardless the patch).

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mel Gorman <mgorman@suse.de>
Link: http://lkml.kernel.org/r/20131119131750.EA45CE0090@blue.fi.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/page.h    | 1 +
 arch/x86/include/asm/page_32.h | 4 ----
 arch/x86/mm/hugetlbpage.c      | 9 +++------
 3 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index c878924..775873d 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -71,6 +71,7 @@ extern bool __virt_addr_valid(unsigned long kaddr);
 #include <asm-generic/getorder.h>
 
 #define __HAVE_ARCH_GATE_AREA 1
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 
 #endif	/* __KERNEL__ */
 #endif /* _ASM_X86_PAGE_H */
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index 4d550d0..904f528 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -5,10 +5,6 @@
 
 #ifndef __ASSEMBLY__
 
-#ifdef CONFIG_HUGETLB_PAGE
-#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-#endif
-
 #define __phys_addr_nodebug(x)	((x) - PAGE_OFFSET)
 #ifdef CONFIG_DEBUG_VIRTUAL
 extern unsigned long __phys_addr(unsigned long);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 9d980d8..8c9f647 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -87,9 +87,7 @@ int pmd_huge_support(void)
 }
 #endif
 
-/* x86_64 also uses this file */
-
-#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+#ifdef CONFIG_HUGETLB_PAGE
 static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 		unsigned long addr, unsigned long len,
 		unsigned long pgoff, unsigned long flags)
@@ -99,7 +97,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 
 	info.flags = 0;
 	info.length = len;
-	info.low_limit = TASK_UNMAPPED_BASE;
+	info.low_limit = current->mm->mmap_legacy_base;
 	info.high_limit = TASK_SIZE;
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
@@ -172,8 +170,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		return hugetlb_get_unmapped_area_topdown(file, addr, len,
 				pgoff, flags);
 }
-
-#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
+#endif /* CONFIG_HUGETLB_PAGE */
 
 #ifdef CONFIG_X86_64
 static __init int setup_hugepagesz(char *opt)
-- 
cgit v1.1


From 661c80192d21269c7fc566f1d547510b0c867677 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 20 Nov 2013 12:50:51 -0800
Subject: x86-64, copy_user: Use leal to produce 32-bit results

When we are using lea to produce a 32-bit result, we can use the leal
form, rather than using leaq and worry about truncation elsewhere.

Make the leal explicit, both to be more obvious and since that is what
gcc generates and thus is less likely to trigger obscure gas bugs.

Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1384634221-6006-1-git-send-email-fenghua.yu@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/lib/copy_user_64.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index ffe4eb9..dee945d 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -186,7 +186,7 @@ ENTRY(copy_user_generic_unrolled)
 30:	shll $6,%ecx
 	addl %ecx,%edx
 	jmp 60f
-40:	lea (%rdx,%rcx,8),%rdx
+40:	leal (%rdx,%rcx,8),%edx
 	jmp 60f
 50:	movl %ecx,%edx
 60:	jmp copy_user_handle_tail /* ecx is zerorest also */
@@ -252,7 +252,7 @@ ENTRY(copy_user_generic_string)
 	ret
 
 	.section .fixup,"ax"
-11:	lea (%rdx,%rcx,8),%rcx
+11:	leal (%rdx,%rcx,8),%ecx
 12:	movl %ecx,%edx		/* ecx is zerorest also */
 	jmp copy_user_handle_tail
 	.previous
-- 
cgit v1.1


From 4788e5b4b2338f85fa42a712a182d8afd65d7c58 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Tue, 12 Nov 2013 17:58:50 +0100
Subject: perf/x86: Add Intel RAPL PMU support

This patch adds a new uncore PMU to expose the Intel
RAPL energy consumption counters. Up to 3 counters,
each counting a particular RAPL event are exposed.

The RAPL counters are available on Intel SandyBridge,
IvyBridge, Haswell. The server skus add a 3rd counter.

The following events are available and exposed in sysfs:

  - power/energy-cores: power consumption of all cores on socket
  - power/energy-pkg: power consumption of all cores + LLc cache
  - power/energy-dram: power consumption of DRAM (servers only)

For each event both the unit (Joules) and scale (2^-32 J)
is exposed in sysfs for use by perf stat and other tools.
The files are:

	/sys/devices/power/events/energy-*.unit
	/sys/devices/power/events/energy-*.scale

The RAPL PMU is uncore by nature and is implemented such
that it only works in system-wide mode. Measuring only
one CPU per socket is sufficient. The /sys/devices/power/cpumask
file can be used by tools to figure out which CPUs to monitor
by default. For instance, on a 2-socket system, 2 CPUs
(one on each socket) will be shown.

All the counters measure in the same unit (exposed via sysfs).
The perf_events API exposes all RAPL counters as 64-bit integers
counting in unit of 1/2^32 Joules (about 0.23 nJ). User level tools
must convert the counts by multiplying them by 2^-32 to obtain
Joules. The reason for this is that the kernel avoids
doing floating point math whenever possible because it is
expensive (user floating-point state must be saved). The method
used avoids kernel floating-point usage. There is no loss of
precision. Thanks to PeterZ for suggesting this approach.

To convert the raw count in Watt:
   W = C * 2.3 / (1e10 * time)
or ldexp(C, -32).

RAPL PMU is a new standalone PMU which registers with the
perf_event core subsystem. The PMU type (attr->type) is
dynamically allocated and is available from /sys/device/power/type.

Sampling is not supported by the RAPL PMU. There is no
privilege level filtering either.

Signed-off-by: Stephane Eranian <eranian@google.com>
Reviewed-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: acme@redhat.com
Cc: jolsa@redhat.com
Cc: zheng.z.yan@intel.com
Cc: bp@alien8.de
Link: http://lkml.kernel.org/r/1384275531-10892-4-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/Makefile                |   2 +-
 arch/x86/kernel/cpu/perf_event_intel_rapl.c | 591 ++++++++++++++++++++++++++++
 2 files changed, 592 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_rapl.c

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 47b56a7..6359506 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_amd_iommu.o
 endif
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore.o perf_event_intel_rapl.o
 endif
 
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
new file mode 100644
index 0000000..cfcd386
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -0,0 +1,591 @@
+/*
+ * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
+ * Copyright (C) 2013 Google, Inc., Stephane Eranian
+ *
+ * Intel RAPL interface is specified in the IA-32 Manual Vol3b
+ * section 14.7.1 (September 2013)
+ *
+ * RAPL provides more controls than just reporting energy consumption
+ * however here we only expose the 3 energy consumption free running
+ * counters (pp0, pkg, dram).
+ *
+ * Each of those counters increments in a power unit defined by the
+ * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
+ * but it can vary.
+ *
+ * Counter to rapl events mappings:
+ *
+ *  pp0 counter: consumption of all physical cores (power plane 0)
+ * 	  event: rapl_energy_cores
+ *    perf code: 0x1
+ *
+ *  pkg counter: consumption of the whole processor package
+ *	  event: rapl_energy_pkg
+ *    perf code: 0x2
+ *
+ * dram counter: consumption of the dram domain (servers only)
+ *	  event: rapl_energy_dram
+ *    perf code: 0x3
+ *
+ * We manage those counters as free running (read-only). They may be
+ * use simultaneously by other tools, such as turbostat.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it does not make sense and is not
+ * supported by the RAPL hardware.
+ *
+ * Because we want to avoid floating-point operations in the kernel,
+ * the events are all reported in fixed point arithmetic (32.32).
+ * Tools must adjust the counts to convert them to Watts using
+ * the duration of the measurement. Tools may use a function such as
+ * ldexp(raw_count, -32);
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+/*
+ * RAPL energy status counters
+ */
+#define RAPL_IDX_PP0_NRG_STAT	0	/* all cores */
+#define INTEL_RAPL_PP0		0x1	/* pseudo-encoding */
+#define RAPL_IDX_PKG_NRG_STAT	1	/* entire package */
+#define INTEL_RAPL_PKG		0x2	/* pseudo-encoding */
+#define RAPL_IDX_RAM_NRG_STAT	2	/* DRAM */
+#define INTEL_RAPL_RAM		0x3	/* pseudo-encoding */
+
+/* Clients have PP0, PKG */
+#define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\
+			 1<<RAPL_IDX_PKG_NRG_STAT)
+
+/* Servers have PP0, PKG, RAM */
+#define RAPL_IDX_SRV	(1<<RAPL_IDX_PP0_NRG_STAT|\
+			 1<<RAPL_IDX_PKG_NRG_STAT|\
+			 1<<RAPL_IDX_RAM_NRG_STAT)
+
+/*
+ * event code: LSB 8 bits, passed in attr->config
+ * any other bit is reserved
+ */
+#define RAPL_EVENT_MASK	0xFFULL
+
+#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)		\
+static ssize_t __rapl_##_var##_show(struct kobject *kobj,	\
+				struct kobj_attribute *attr,	\
+				char *page)			\
+{								\
+	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);		\
+	return sprintf(page, _format "\n");			\
+}								\
+static struct kobj_attribute format_attr_##_var =		\
+	__ATTR(_name, 0444, __rapl_##_var##_show, NULL)
+
+#define RAPL_EVENT_DESC(_name, _config)				\
+{								\
+	.attr	= __ATTR(_name, 0444, rapl_event_show, NULL),	\
+	.config	= _config,					\
+}
+
+#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
+
+struct rapl_pmu {
+	spinlock_t	 lock;
+	int		 hw_unit;  /* 1/2^hw_unit Joule */
+	int		 n_active; /* number of active events */
+	struct list_head active_list;
+	struct pmu	 *pmu; /* pointer to rapl_pmu_class */
+};
+
+static struct pmu rapl_pmu_class;
+static cpumask_t rapl_cpu_mask;
+static int rapl_cntr_mask;
+
+static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
+static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
+
+static inline u64 rapl_read_counter(struct perf_event *event)
+{
+	u64 raw;
+	rdmsrl(event->hw.event_base, raw);
+	return raw;
+}
+
+static inline u64 rapl_scale(u64 v)
+{
+	/*
+	 * scale delta to smallest unit (1/2^32)
+	 * users must then scale back: count * 1/(1e9*2^32) to get Joules
+	 * or use ldexp(count, -32).
+	 * Watts = Joules/Time delta
+	 */
+	return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit);
+}
+
+static u64 rapl_event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 prev_raw_count, new_raw_count;
+	s64 delta, sdelta;
+	int shift = RAPL_CNTR_WIDTH;
+
+again:
+	prev_raw_count = local64_read(&hwc->prev_count);
+	rdmsrl(event->hw.event_base, new_raw_count);
+
+	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+			    new_raw_count) != prev_raw_count) {
+		cpu_relax();
+		goto again;
+	}
+
+	/*
+	 * Now we have the new raw value and have updated the prev
+	 * timestamp already. We can now calculate the elapsed delta
+	 * (event-)time and add that to the generic event.
+	 *
+	 * Careful, not all hw sign-extends above the physical width
+	 * of the count.
+	 */
+	delta = (new_raw_count << shift) - (prev_raw_count << shift);
+	delta >>= shift;
+
+	sdelta = rapl_scale(delta);
+
+	local64_add(sdelta, &event->count);
+
+	return new_raw_count;
+}
+
+static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
+				   struct perf_event *event)
+{
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+		return;
+
+	event->hw.state = 0;
+
+	list_add_tail(&event->active_entry, &pmu->active_list);
+
+	local64_set(&event->hw.prev_count, rapl_read_counter(event));
+
+	pmu->n_active++;
+}
+
+static void rapl_pmu_event_start(struct perf_event *event, int mode)
+{
+	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+	unsigned long flags;
+
+	spin_lock_irqsave(&pmu->lock, flags);
+	__rapl_pmu_event_start(pmu, event);
+	spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static void rapl_pmu_event_stop(struct perf_event *event, int mode)
+{
+	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pmu->lock, flags);
+
+	/* mark event as deactivated and stopped */
+	if (!(hwc->state & PERF_HES_STOPPED)) {
+		WARN_ON_ONCE(pmu->n_active <= 0);
+		pmu->n_active--;
+
+		list_del(&event->active_entry);
+
+		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+		hwc->state |= PERF_HES_STOPPED;
+	}
+
+	/* check if update of sw counter is necessary */
+	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+		/*
+		 * Drain the remaining delta count out of a event
+		 * that we are disabling:
+		 */
+		rapl_event_update(event);
+		hwc->state |= PERF_HES_UPTODATE;
+	}
+
+	spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static int rapl_pmu_event_add(struct perf_event *event, int mode)
+{
+	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pmu->lock, flags);
+
+	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+	if (mode & PERF_EF_START)
+		__rapl_pmu_event_start(pmu, event);
+
+	spin_unlock_irqrestore(&pmu->lock, flags);
+
+	return 0;
+}
+
+static void rapl_pmu_event_del(struct perf_event *event, int flags)
+{
+	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int rapl_pmu_event_init(struct perf_event *event)
+{
+	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
+	int bit, msr, ret = 0;
+
+	/* only look at RAPL events */
+	if (event->attr.type != rapl_pmu_class.type)
+		return -ENOENT;
+
+	/* check only supported bits are set */
+	if (event->attr.config & ~RAPL_EVENT_MASK)
+		return -EINVAL;
+
+	/*
+	 * check event is known (determines counter)
+	 */
+	switch (cfg) {
+	case INTEL_RAPL_PP0:
+		bit = RAPL_IDX_PP0_NRG_STAT;
+		msr = MSR_PP0_ENERGY_STATUS;
+		break;
+	case INTEL_RAPL_PKG:
+		bit = RAPL_IDX_PKG_NRG_STAT;
+		msr = MSR_PKG_ENERGY_STATUS;
+		break;
+	case INTEL_RAPL_RAM:
+		bit = RAPL_IDX_RAM_NRG_STAT;
+		msr = MSR_DRAM_ENERGY_STATUS;
+		break;
+	default:
+		return -EINVAL;
+	}
+	/* check event supported */
+	if (!(rapl_cntr_mask & (1 << bit)))
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest  ||
+	    event->attr.sample_period) /* no sampling */
+		return -EINVAL;
+
+	/* must be done before validate_group */
+	event->hw.event_base = msr;
+	event->hw.config = cfg;
+	event->hw.idx = bit;
+
+	return ret;
+}
+
+static void rapl_pmu_event_read(struct perf_event *event)
+{
+	rapl_event_update(event);
+}
+
+static ssize_t rapl_get_attr_cpumask(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);
+
+	buf[n++] = '\n';
+	buf[n] = '\0';
+	return n;
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
+
+static struct attribute *rapl_pmu_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL,
+};
+
+static struct attribute_group rapl_pmu_attr_group = {
+	.attrs = rapl_pmu_attrs,
+};
+
+EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+EVENT_ATTR_STR(energy-pkg  , rapl_pkg, "event=0x02");
+EVENT_ATTR_STR(energy-ram  , rapl_ram, "event=0x03");
+
+EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
+EVENT_ATTR_STR(energy-pkg.unit  , rapl_pkg_unit, "Joules");
+EVENT_ATTR_STR(energy-ram.unit  , rapl_ram_unit, "Joules");
+
+/*
+ * we compute in 0.23 nJ increments regardless of MSR
+ */
+EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
+
+static struct attribute *rapl_events_srv_attr[] = {
+	EVENT_PTR(rapl_cores),
+	EVENT_PTR(rapl_pkg),
+	EVENT_PTR(rapl_ram),
+
+	EVENT_PTR(rapl_cores_unit),
+	EVENT_PTR(rapl_pkg_unit),
+	EVENT_PTR(rapl_ram_unit),
+
+	EVENT_PTR(rapl_cores_scale),
+	EVENT_PTR(rapl_pkg_scale),
+	EVENT_PTR(rapl_ram_scale),
+	NULL,
+};
+
+static struct attribute *rapl_events_cln_attr[] = {
+	EVENT_PTR(rapl_cores),
+	EVENT_PTR(rapl_pkg),
+
+	EVENT_PTR(rapl_cores_unit),
+	EVENT_PTR(rapl_pkg_unit),
+
+	EVENT_PTR(rapl_cores_scale),
+	EVENT_PTR(rapl_pkg_scale),
+	NULL,
+};
+
+static struct attribute_group rapl_pmu_events_group = {
+	.name = "events",
+	.attrs = NULL, /* patched at runtime */
+};
+
+DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
+static struct attribute *rapl_formats_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group rapl_pmu_format_group = {
+	.name = "format",
+	.attrs = rapl_formats_attr,
+};
+
+const struct attribute_group *rapl_attr_groups[] = {
+	&rapl_pmu_attr_group,
+	&rapl_pmu_format_group,
+	&rapl_pmu_events_group,
+	NULL,
+};
+
+static struct pmu rapl_pmu_class = {
+	.attr_groups	= rapl_attr_groups,
+	.task_ctx_nr	= perf_invalid_context, /* system-wide only */
+	.event_init	= rapl_pmu_event_init,
+	.add		= rapl_pmu_event_add, /* must have */
+	.del		= rapl_pmu_event_del, /* must have */
+	.start		= rapl_pmu_event_start,
+	.stop		= rapl_pmu_event_stop,
+	.read		= rapl_pmu_event_read,
+};
+
+static void rapl_cpu_exit(int cpu)
+{
+	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
+	int i, phys_id = topology_physical_package_id(cpu);
+	int target = -1;
+
+	/* find a new cpu on same package */
+	for_each_online_cpu(i) {
+		if (i == cpu)
+			continue;
+		if (phys_id == topology_physical_package_id(i)) {
+			target = i;
+			break;
+		}
+	}
+	/*
+	 * clear cpu from cpumask
+	 * if was set in cpumask and still some cpu on package,
+	 * then move to new cpu
+	 */
+	if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
+		cpumask_set_cpu(target, &rapl_cpu_mask);
+
+	WARN_ON(cpumask_empty(&rapl_cpu_mask));
+	/*
+	 * migrate events and context to new cpu
+	 */
+	if (target >= 0)
+		perf_pmu_migrate_context(pmu->pmu, cpu, target);
+}
+
+static void rapl_cpu_init(int cpu)
+{
+	int i, phys_id = topology_physical_package_id(cpu);
+
+	/* check if phys_is is already covered */
+	for_each_cpu(i, &rapl_cpu_mask) {
+		if (phys_id == topology_physical_package_id(i))
+			return;
+	}
+	/* was not found, so add it */
+	cpumask_set_cpu(cpu, &rapl_cpu_mask);
+}
+
+static int rapl_cpu_prepare(int cpu)
+{
+	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
+	int phys_id = topology_physical_package_id(cpu);
+
+	if (pmu)
+		return 0;
+
+	if (phys_id < 0)
+		return -1;
+
+	pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
+	if (!pmu)
+		return -1;
+
+	spin_lock_init(&pmu->lock);
+
+	INIT_LIST_HEAD(&pmu->active_list);
+
+	/*
+	 * grab power unit as: 1/2^unit Joules
+	 *
+	 * we cache in local PMU instance
+	 */
+	rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit);
+	pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL;
+	pmu->pmu = &rapl_pmu_class;
+
+	/* set RAPL pmu for this cpu for now */
+	per_cpu(rapl_pmu, cpu) = pmu;
+	per_cpu(rapl_pmu_to_free, cpu) = NULL;
+
+	return 0;
+}
+
+static void rapl_cpu_kfree(int cpu)
+{
+	struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
+
+	kfree(pmu);
+
+	per_cpu(rapl_pmu_to_free, cpu) = NULL;
+}
+
+static int rapl_cpu_dying(int cpu)
+{
+	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
+
+	if (!pmu)
+		return 0;
+
+	per_cpu(rapl_pmu, cpu) = NULL;
+
+	per_cpu(rapl_pmu_to_free, cpu) = pmu;
+
+	return 0;
+}
+
+static int rapl_cpu_notifier(struct notifier_block *self,
+			     unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+		rapl_cpu_prepare(cpu);
+		break;
+	case CPU_STARTING:
+		rapl_cpu_init(cpu);
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_DYING:
+		rapl_cpu_dying(cpu);
+		break;
+	case CPU_ONLINE:
+	case CPU_DEAD:
+		rapl_cpu_kfree(cpu);
+		break;
+	case CPU_DOWN_PREPARE:
+		rapl_cpu_exit(cpu);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id rapl_cpu_match[] = {
+	[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
+	[1] = {},
+};
+
+static int __init rapl_pmu_init(void)
+{
+	struct rapl_pmu *pmu;
+	int cpu, ret;
+
+	/*
+	 * check for Intel processor family 6
+	 */
+	if (!x86_match_cpu(rapl_cpu_match))
+		return 0;
+
+	/* check supported CPU */
+	switch (boot_cpu_data.x86_model) {
+	case 42: /* Sandy Bridge */
+	case 58: /* Ivy Bridge */
+	case 60: /* Haswell */
+		rapl_cntr_mask = RAPL_IDX_CLN;
+		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
+		break;
+	case 45: /* Sandy Bridge-EP */
+	case 62: /* IvyTown */
+		rapl_cntr_mask = RAPL_IDX_SRV;
+		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+		break;
+
+	default:
+		/* unsupported */
+		return 0;
+	}
+	get_online_cpus();
+
+	for_each_online_cpu(cpu) {
+		rapl_cpu_prepare(cpu);
+		rapl_cpu_init(cpu);
+	}
+
+	perf_cpu_notifier(rapl_cpu_notifier);
+
+	ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
+	if (WARN_ON(ret)) {
+		pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
+		put_online_cpus();
+		return -1;
+	}
+
+	pmu = __get_cpu_var(rapl_pmu);
+
+	pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
+		" API unit is 2^-32 Joules,"
+		" %d fixed counters\n",
+		pmu->hw_unit,
+		hweight32(rapl_cntr_mask));
+
+	put_online_cpus();
+
+	return 0;
+}
+device_initcall(rapl_pmu_init);
-- 
cgit v1.1


From 65661f96d3b32f4b28fef26d21be81d7e173b965 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Tue, 12 Nov 2013 17:58:51 +0100
Subject: perf/x86: Add RAPL hrtimer support

The RAPL PMU counters do not interrupt on overflow.
Therefore, the kernel needs to poll the counters
to avoid missing an overflow. This patch adds
the hrtimer code to do this.

The timer interval is calculated at boot time
based on the power unit used by the HW.

There is one hrtimer per-cpu to handle the case
of multiple simultaneous use across cores on
the same package + hotplug CPU.

Thanks to Maria Dimakopoulou for her contributions
to this patch especially on the math aspects.

Signed-off-by: Stephane Eranian <eranian@google.com>
Reviewed-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
[ Applied 32-bit build fix. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: acme@redhat.com
Cc: jolsa@redhat.com
Cc: zheng.z.yan@intel.com
Cc: bp@alien8.de
Cc: maria.n.dimakopoulou@gmail.com
Link: http://lkml.kernel.org/r/1384275531-10892-5-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_rapl.c | 74 ++++++++++++++++++++++++++++-
 1 file changed, 72 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index cfcd386..bf8e4a7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -96,6 +96,8 @@ struct rapl_pmu {
 	int		 n_active; /* number of active events */
 	struct list_head active_list;
 	struct pmu	 *pmu; /* pointer to rapl_pmu_class */
+	ktime_t		 timer_interval; /* in ktime_t unit */
+	struct hrtimer   hrtimer;
 };
 
 static struct pmu rapl_pmu_class;
@@ -158,6 +160,48 @@ again:
 	return new_raw_count;
 }
 
+static void rapl_start_hrtimer(struct rapl_pmu *pmu)
+{
+	__hrtimer_start_range_ns(&pmu->hrtimer,
+			pmu->timer_interval, 0,
+			HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
+{
+	hrtimer_cancel(&pmu->hrtimer);
+}
+
+static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+	struct perf_event *event;
+	unsigned long flags;
+
+	if (!pmu->n_active)
+		return HRTIMER_NORESTART;
+
+	spin_lock_irqsave(&pmu->lock, flags);
+
+	list_for_each_entry(event, &pmu->active_list, active_entry) {
+		rapl_event_update(event);
+	}
+
+	spin_unlock_irqrestore(&pmu->lock, flags);
+
+	hrtimer_forward_now(hrtimer, pmu->timer_interval);
+
+	return HRTIMER_RESTART;
+}
+
+static void rapl_hrtimer_init(struct rapl_pmu *pmu)
+{
+	struct hrtimer *hr = &pmu->hrtimer;
+
+	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hr->function = rapl_hrtimer_handle;
+}
+
 static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
 				   struct perf_event *event)
 {
@@ -171,6 +215,8 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
 	local64_set(&event->hw.prev_count, rapl_read_counter(event));
 
 	pmu->n_active++;
+	if (pmu->n_active == 1)
+		rapl_start_hrtimer(pmu);
 }
 
 static void rapl_pmu_event_start(struct perf_event *event, int mode)
@@ -195,6 +241,8 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
 	if (!(hwc->state & PERF_HES_STOPPED)) {
 		WARN_ON_ONCE(pmu->n_active <= 0);
 		pmu->n_active--;
+		if (pmu->n_active == 0)
+			rapl_stop_hrtimer(pmu);
 
 		list_del(&event->active_entry);
 
@@ -423,6 +471,9 @@ static void rapl_cpu_exit(int cpu)
 	 */
 	if (target >= 0)
 		perf_pmu_migrate_context(pmu->pmu, cpu, target);
+
+	/* cancel overflow polling timer for CPU */
+	rapl_stop_hrtimer(pmu);
 }
 
 static void rapl_cpu_init(int cpu)
@@ -442,6 +493,7 @@ static int rapl_cpu_prepare(int cpu)
 {
 	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
 	int phys_id = topology_physical_package_id(cpu);
+	u64 ms;
 
 	if (pmu)
 		return 0;
@@ -466,6 +518,22 @@ static int rapl_cpu_prepare(int cpu)
 	pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL;
 	pmu->pmu = &rapl_pmu_class;
 
+	/*
+	 * use reference of 200W for scaling the timeout
+	 * to avoid missing counter overflows.
+	 * 200W = 200 Joules/sec
+	 * divide interval by 2 to avoid lockstep (2 * 100)
+	 * if hw unit is 32, then we use 2 ms 1/200/2
+	 */
+	if (pmu->hw_unit < 32)
+		ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
+	else
+		ms = 2;
+
+	pmu->timer_interval = ms_to_ktime(ms);
+
+	rapl_hrtimer_init(pmu);
+
 	/* set RAPL pmu for this cpu for now */
 	per_cpu(rapl_pmu, cpu) = pmu;
 	per_cpu(rapl_pmu_to_free, cpu) = NULL;
@@ -580,9 +648,11 @@ static int __init rapl_pmu_init(void)
 
 	pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
 		" API unit is 2^-32 Joules,"
-		" %d fixed counters\n",
+		" %d fixed counters"
+		" %llu ms ovfl timer\n",
 		pmu->hw_unit,
-		hweight32(rapl_cntr_mask));
+		hweight32(rapl_cntr_mask),
+		ktime_to_ms(pmu->timer_interval));
 
 	put_online_cpus();
 
-- 
cgit v1.1


From 853d9b18f1e861d37e9b271742329f8c1176eabe Mon Sep 17 00:00:00 2001
From: Levente Kurusa <levex@linux.com>
Date: Fri, 29 Nov 2013 21:28:48 +0100
Subject: x86, mce: Call put_device on device_register failure

This patch adds a call to put_device() when the device_register() call
has failed. This is required so that the last reference to the device is
given up.

Signed-off-by: Levente Kurusa <levex@linux.com>
Link: http://lkml.kernel.org/r/5298F900.9000208@linux.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b3218cd..a389c1d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2272,8 +2272,10 @@ static int mce_device_create(unsigned int cpu)
 	dev->release = &mce_device_release;
 
 	err = device_register(dev);
-	if (err)
+	if (err) {
+		put_device(dev);
 		return err;
+	}
 
 	for (i = 0; mce_device_attrs[i]; i++) {
 		err = device_create_file(dev, mce_device_attrs[i]);
-- 
cgit v1.1


From 191f57c137bcce0e3e9313acb77b2f114d15afbb Mon Sep 17 00:00:00 2001
From: Qiaowei Ren <qiaowei.ren@intel.com>
Date: Sat, 7 Dec 2013 08:20:57 +0800
Subject: x86, cpufeature: Define the Intel MPX feature flag

Define the Intel MPX (Memory Protection Extensions) CPU feature flag
in the cpufeature list.

Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
Link: http://lkml.kernel.org/r/1386375658-2191-2-git-send-email-qiaowei.ren@intel.com
Signed-off-by: Xudong Hao <xudong.hao@intel.com>
Signed-off-by: Liu Jinsong <jinsong.liu@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/cpufeature.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 89270b43..e099f95 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -216,6 +216,7 @@
 #define X86_FEATURE_ERMS	(9*32+ 9) /* Enhanced REP MOVSB/STOSB */
 #define X86_FEATURE_INVPCID	(9*32+10) /* Invalidate Processor Context ID */
 #define X86_FEATURE_RTM		(9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_MPX		(9*32+14) /* Memory Protection Extension */
 #define X86_FEATURE_RDSEED	(9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX		(9*32+19) /* The ADCX and ADOX instructions */
 #define X86_FEATURE_SMAP	(9*32+20) /* Supervisor Mode Access Prevention */
-- 
cgit v1.1


From e7d820a5e549b3eb6c3f9467507566565646a669 Mon Sep 17 00:00:00 2001
From: Qiaowei Ren <qiaowei.ren@intel.com>
Date: Thu, 5 Dec 2013 17:15:34 +0800
Subject: x86, xsave: Support eager-only xsave features, add MPX support

Some features, like Intel MPX, work only if the kernel uses eagerfpu
model.  So we should force eagerfpu on unless the user has explicitly
disabled it.

Add definitions for Intel MPX and add it to the supported list.

[ hpa: renamed XSTATE_FLEXIBLE to XSTATE_LAZY and added comments ]

Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
Link: http://lkml.kernel.org/r/9E0BE1322F2F2246BD820DA9FC397ADE014A6115@SHSMSX102.ccr.corp.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/processor.h | 23 +++++++++++++++++++++++
 arch/x86/include/asm/xsave.h     | 14 ++++++++++----
 arch/x86/kernel/xsave.c          | 10 ++++++++++
 3 files changed, 43 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7b034a4..b7845a1 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -370,6 +370,26 @@ struct ymmh_struct {
 	u32 ymmh_space[64];
 };
 
+struct lwp_struct {
+	u64 lwpcb_addr;
+	u32 flags;
+	u32 buf_head_offset;
+	u64 buf_base;
+	u32 buf_size;
+	u32 filters;
+	u64 saved_event_record[4];
+	u32 event_counter[16];
+};
+
+struct bndregs_struct {
+	u64 bndregs[8];
+} __packed;
+
+struct bndcsr_struct {
+	u64 cfg_reg_u;
+	u64 status_reg;
+} __packed;
+
 struct xsave_hdr_struct {
 	u64 xstate_bv;
 	u64 reserved1[2];
@@ -380,6 +400,9 @@ struct xsave_struct {
 	struct i387_fxsave_struct i387;
 	struct xsave_hdr_struct xsave_hdr;
 	struct ymmh_struct ymmh;
+	struct lwp_struct lwp;
+	struct bndregs_struct bndregs;
+	struct bndcsr_struct bndcsr;
 	/* new processor state extensions will go here */
 } __attribute__ ((packed, aligned (64)));
 
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 0415cda..5547389 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -9,6 +9,8 @@
 #define XSTATE_FP	0x1
 #define XSTATE_SSE	0x2
 #define XSTATE_YMM	0x4
+#define XSTATE_BNDREGS	0x8
+#define XSTATE_BNDCSR	0x10
 
 #define XSTATE_FPSSE	(XSTATE_FP | XSTATE_SSE)
 
@@ -20,10 +22,14 @@
 #define XSAVE_YMM_SIZE	    256
 #define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
 
-/*
- * These are the features that the OS can handle currently.
- */
-#define XCNTXT_MASK	(XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
+/* Supported features which support lazy state saving */
+#define XSTATE_LAZY	(XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
+
+/* Supported features which require eager state saving */
+#define XSTATE_EAGER	(XSTATE_BNDREGS | XSTATE_BNDCSR)
+
+/* All currently supported features */
+#define XCNTXT_MASK	(XSTATE_LAZY | XSTATE_EAGER)
 
 #ifdef CONFIG_X86_64
 #define REX_PREFIX	"0x48, "
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 422fd82..a4b451c 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -562,6 +562,16 @@ static void __init xstate_enable_boot_cpu(void)
 	if (cpu_has_xsaveopt && eagerfpu != DISABLE)
 		eagerfpu = ENABLE;
 
+	if (pcntxt_mask & XSTATE_EAGER) {
+		if (eagerfpu == DISABLE) {
+			pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n",
+					pcntxt_mask & XSTATE_EAGER);
+			pcntxt_mask &= ~XSTATE_EAGER;
+		} else {
+			eagerfpu = ENABLE;
+		}
+	}
+
 	pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
 		pcntxt_mask, xstate_size);
 }
-- 
cgit v1.1


From d8af4ce490e92adfa123ae79899332bd829903e7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 12 Dec 2013 14:46:42 +0100
Subject: x86/traps: Clean up error exception handler definitions

So I was reading the exception handler generation code and got a real
headache looking at the unstructured mess that our DO_ERROR*()
generation code is today.

Make it more readable.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: http://lkml.kernel.org/n/tip-kuabysiykvUJpgus35lhnhvs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/traps.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b857ed8..57409f6 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -211,21 +211,17 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code)	\
 	exception_exit(prev_state);					\
 }
 
-DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV,
-		regs->ip)
-DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
-DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN,
-		regs->ip)
-DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",
-		coprocessor_segment_overrun)
-DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR_INFO(X86_TRAP_DE,     SIGFPE,  "divide error",			divide_error,		     FPE_INTDIV, regs->ip )
+DO_ERROR     (X86_TRAP_OF,     SIGSEGV, "overflow",			overflow					  )
+DO_ERROR     (X86_TRAP_BR,     SIGSEGV, "bounds",			bounds						  )
+DO_ERROR_INFO(X86_TRAP_UD,     SIGILL,  "invalid opcode",		invalid_op,		     ILL_ILLOPN, regs->ip )
+DO_ERROR     (X86_TRAP_OLD_MF, SIGFPE,  "coprocessor segment overrun",	coprocessor_segment_overrun			  )
+DO_ERROR     (X86_TRAP_TS,     SIGSEGV, "invalid TSS",			invalid_TSS					  )
+DO_ERROR     (X86_TRAP_NP,     SIGBUS,  "segment not present",		segment_not_present				  )
 #ifdef CONFIG_X86_32
-DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
+DO_ERROR     (X86_TRAP_SS,     SIGBUS,  "stack segment",		stack_segment					  )
 #endif
-DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,
-		BUS_ADRALN, 0)
+DO_ERROR_INFO(X86_TRAP_AC,     SIGBUS,  "alignment check",		alignment_check,	     BUS_ADRALN, 0	  )
 
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
-- 
cgit v1.1


From f09174c501f8bb259788cc36d5a7aa5b2831fb5e Mon Sep 17 00:00:00 2001
From: Qiaowei Ren <qiaowei.ren@intel.com>
Date: Sat, 14 Dec 2013 14:25:02 +0800
Subject: x86: add user_atomic_cmpxchg_inatomic at uaccess.h

This patch adds user_atomic_cmpxchg_inatomic() to use CMPXCHG
instruction against a user space address.

This generalizes the already existing futex_atomic_cmpxchg_inatomic()
so it can be used in other contexts.  This will be used in the
upcoming support for Intel MPX (Memory Protection Extensions.)

[ hpa: replaced #ifdef inside a macro with IS_ENABLED() ]

Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
Link: http://lkml.kernel.org/r/1387002303-6620-1-git-send-email-qiaowei.ren@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/x86/include/asm/uaccess.h | 92 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 8ec57c0..48ff838 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -525,6 +525,98 @@ extern __must_check long strnlen_user(const char __user *str, long n);
 unsigned long __must_check clear_user(void __user *mem, unsigned long len);
 unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
 
+extern void __cmpxchg_wrong_size(void)
+	__compiletime_error("Bad argument size for cmpxchg");
+
+#define __user_atomic_cmpxchg_inatomic(uval, ptr, old, new, size)	\
+({									\
+	int __ret = 0;							\
+	__typeof__(ptr) __uval = (uval);				\
+	__typeof__(*(ptr)) __old = (old);				\
+	__typeof__(*(ptr)) __new = (new);				\
+	switch (size) {							\
+	case 1:								\
+	{								\
+		asm volatile("\t" ASM_STAC "\n"				\
+			"1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n"		\
+			"2:\t" ASM_CLAC "\n"				\
+			"\t.section .fixup, \"ax\"\n"			\
+			"3:\tmov     %3, %0\n"				\
+			"\tjmp     2b\n"				\
+			"\t.previous\n"					\
+			_ASM_EXTABLE(1b, 3b)				\
+			: "+r" (__ret), "=a" (__old), "+m" (*(ptr))	\
+			: "i" (-EFAULT), "q" (__new), "1" (__old)	\
+			: "memory"					\
+		);							\
+		break;							\
+	}								\
+	case 2:								\
+	{								\
+		asm volatile("\t" ASM_STAC "\n"				\
+			"1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n"		\
+			"2:\t" ASM_CLAC "\n"				\
+			"\t.section .fixup, \"ax\"\n"			\
+			"3:\tmov     %3, %0\n"				\
+			"\tjmp     2b\n"				\
+			"\t.previous\n"					\
+			_ASM_EXTABLE(1b, 3b)				\
+			: "+r" (__ret), "=a" (__old), "+m" (*(ptr))	\
+			: "i" (-EFAULT), "r" (__new), "1" (__old)	\
+			: "memory"					\
+		);							\
+		break;							\
+	}								\
+	case 4:								\
+	{								\
+		asm volatile("\t" ASM_STAC "\n"				\
+			"1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"		\
+			"2:\t" ASM_CLAC "\n"				\
+			"\t.section .fixup, \"ax\"\n"			\
+			"3:\tmov     %3, %0\n"				\
+			"\tjmp     2b\n"				\
+			"\t.previous\n"					\
+			_ASM_EXTABLE(1b, 3b)				\
+			: "+r" (__ret), "=a" (__old), "+m" (*(ptr))	\
+			: "i" (-EFAULT), "r" (__new), "1" (__old)	\
+			: "memory"					\
+		);							\
+		break;							\
+	}								\
+	case 8:								\
+	{								\
+		if (!IS_ENABLED(CONFIG_X86_64))				\
+			__cmpxchg_wrong_size();				\
+									\
+		asm volatile("\t" ASM_STAC "\n"				\
+			"1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n"		\
+			"2:\t" ASM_CLAC "\n"				\
+			"\t.section .fixup, \"ax\"\n"			\
+			"3:\tmov     %3, %0\n"				\
+			"\tjmp     2b\n"				\
+			"\t.previous\n"					\
+			_ASM_EXTABLE(1b, 3b)				\
+			: "+r" (__ret), "=a" (__old), "+m" (*(ptr))	\
+			: "i" (-EFAULT), "r" (__new), "1" (__old)	\
+			: "memory"					\
+		);							\
+		break;							\
+	}								\
+	default:							\
+		__cmpxchg_wrong_size();					\
+	}								\
+	*__uval = __old;						\
+	__ret;								\
+})
+
+#define user_atomic_cmpxchg_inatomic(uval, ptr, old, new)		\
+({									\
+	access_ok(VERIFY_WRITE, (ptr), sizeof(*(ptr))) ?		\
+		__user_atomic_cmpxchg_inatomic((uval), (ptr),		\
+				(old), (new), sizeof(*(ptr))) :		\
+		-EFAULT;						\
+})
+
 /*
  * movsl can be slow when source and dest are not both 8-byte aligned
  */
-- 
cgit v1.1


From 0ee3b6f87d4d748d5362cb47ff33fa1553805cb4 Mon Sep 17 00:00:00 2001
From: Qiaowei Ren <qiaowei.ren@intel.com>
Date: Sat, 14 Dec 2013 14:25:03 +0800
Subject: x86: replace futex_atomic_cmpxchg_inatomic() with
 user_atomic_cmpxchg_inatomic

futex_atomic_cmpxchg_inatomic() is simply the 32-bit implementation of
user_atomic_cmpxchg_inatomic(), which in turn is simply a
generalization of the original code in
futex_atomic_cmpxchg_inatomic().

Use the newly generalized user_atomic_cmpxchg_inatomic() as the futex
implementation, too.

[ hpa: retain the inline in futex.h rather than changing it to a macro ]

Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
Link: http://lkml.kernel.org/r/1387002303-6620-2-git-send-email-qiaowei.ren@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/x86/include/asm/futex.h | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index be27ba1..b4c1f54 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -110,26 +110,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 						u32 oldval, u32 newval)
 {
-	int ret = 0;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
-	asm volatile("\t" ASM_STAC "\n"
-		     "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
-		     "2:\t" ASM_CLAC "\n"
-		     "\t.section .fixup, \"ax\"\n"
-		     "3:\tmov     %3, %0\n"
-		     "\tjmp     2b\n"
-		     "\t.previous\n"
-		     _ASM_EXTABLE(1b, 3b)
-		     : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
-		     : "i" (-EFAULT), "r" (newval), "1" (oldval)
-		     : "memory"
-	);
-
-	*uval = oldval;
-	return ret;
+	return user_atomic_cmpxchg_inatomic(uval, uaddr, oldval, newval);
 }
 
 #endif
-- 
cgit v1.1


From 7fd565e27547c913b83b46d94662103be81a88ec Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 16 Dec 2013 21:20:23 +0100
Subject: perf/x86: enable Haswell Celeron RAPL support

Enable RAPL support for Haswell Celeron (model 69).

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: ak@linux.intel.com
Cc: acme@redhat.com
Cc: jolsa@redhat.com
Cc: zheng.z.yan@intel.com
Cc: bp@alien8.de
Cc: vincent.weaver@maine.edu
Cc: maria.n.dimakopoulou@gmail.com
Cc: peterz@infradead.org
Link: http://lkml.kernel.org/r/1387225224-27799-2-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_rapl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index bf8e4a7..0e3754e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -615,6 +615,7 @@ static int __init rapl_pmu_init(void)
 	case 42: /* Sandy Bridge */
 	case 58: /* Ivy Bridge */
 	case 60: /* Haswell */
+	case 69: /* Haswell-Celeron */
 		rapl_cntr_mask = RAPL_IDX_CLN;
 		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
 		break;
-- 
cgit v1.1


From 7306006f103567fcf595d23dd741da0b8642f4c5 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 18 Dec 2013 15:52:13 -0800
Subject: x86, realmode: Pointer walk cleanups, pull out invariant use of
 __pa()

The pointer arithmetic in this function was really bizarre, where in
fact all we really wanted was a simple pointer array walk.  Use the
much more idiomatic construction for that (*ptr++).

Factor an invariant use of __pa() out of the relocation loop.  At
least on 64 bits it seems gcc isn't capable of doing that
automatically.

Change the scope of a couple of variables to make it extra obvious
that they are extremely local temp variables.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/n/tip-rd908t9c8kvcojdabtmm94mb@git.kernel.org
---
 arch/x86/realmode/init.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index a44f457..bad628a 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -29,12 +29,10 @@ void __init reserve_real_mode(void)
 void __init setup_real_mode(void)
 {
 	u16 real_mode_seg;
-	u32 *rel;
+	const u32 *rel;
 	u32 count;
-	u32 *ptr;
-	u16 *seg;
-	int i;
 	unsigned char *base;
+	unsigned long phys_base;
 	struct trampoline_header *trampoline_header;
 	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
 #ifdef CONFIG_X86_64
@@ -46,23 +44,23 @@ void __init setup_real_mode(void)
 
 	memcpy(base, real_mode_blob, size);
 
-	real_mode_seg = __pa(base) >> 4;
+	phys_base = __pa(base);
+	real_mode_seg = phys_base >> 4;
+
 	rel = (u32 *) real_mode_relocs;
 
 	/* 16-bit segment relocations. */
-	count = rel[0];
-	rel = &rel[1];
-	for (i = 0; i < count; i++) {
-		seg = (u16 *) (base + rel[i]);
+	count = *rel++;
+	while (count--) {
+		u16 *seg = (u16 *) (base + *rel++);
 		*seg = real_mode_seg;
 	}
 
 	/* 32-bit linear relocations. */
-	count = rel[i];
-	rel =  &rel[i + 1];
-	for (i = 0; i < count; i++) {
-		ptr = (u32 *) (base + rel[i]);
-		*ptr += __pa(base);
+	count = *rel++;
+	while (count--) {
+		u32 *ptr = (u32 *) (base + *rel++);
+		*ptr += phys_base;
 	}
 
 	/* Must be perfomed *after* relocation. */
-- 
cgit v1.1


From f3d815cb854b2f6262ade56a4d91a1ed3f1e50c4 Mon Sep 17 00:00:00 2001
From: Lans Zhang <jia.zhang@windriver.com>
Date: Fri, 6 Dec 2013 12:18:30 +0800
Subject: x86/mm/numa: Fix 32-bit kernel NUMA boot

When booting a 32-bit x86 kernel on a NUMA machine, node data
cannot be allocated from local node if the account of memory for
node 0 covers the low memory space entirely:

  [    0.000000] Initmem setup node 0 [mem 0x00000000-0x83fffffff]
  [    0.000000]   NODE_DATA [mem 0x367ed000-0x367edfff]
  [    0.000000] Initmem setup node 1 [mem 0x840000000-0xfffffffff]
  [    0.000000] Cannot find 4096 bytes in node 1
  [    0.000000] 64664MB HIGHMEM available.
  [    0.000000] 871MB LOWMEM available.

To fix this issue, node data is allowed to be allocated from
other nodes if the memory of local node is still not mapped. The
expected result looks like this:

  [    0.000000] Initmem setup node 0 [mem 0x00000000-0x83fffffff]
  [    0.000000]   NODE_DATA [mem 0x367ed000-0x367edfff]
  [    0.000000] Initmem setup node 1 [mem 0x840000000-0xfffffffff]
  [    0.000000]   NODE_DATA [mem 0x367ec000-0x367ecfff]
  [    0.000000]     NODE_DATA(1) on node 0
  [    0.000000] 64664MB HIGHMEM available.
  [    0.000000] 871MB LOWMEM available.

Signed-off-by: Lans Zhang <jia.zhang@windriver.com>
Cc: <andi@firstfloor.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1386303510-18574-1-git-send-email-jia.zhang@windriver.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/numa.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 24aec58..c85da7b 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -211,9 +211,13 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 	 */
 	nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
 	if (!nd_pa) {
-		pr_err("Cannot find %zu bytes in node %d\n",
-		       nd_size, nid);
-		return;
+		nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
+					      MEMBLOCK_ALLOC_ACCESSIBLE);
+		if (!nd_pa) {
+			pr_err("Cannot find %zu bytes in node %d\n",
+			       nd_size, nid);
+			return;
+		}
 	}
 	nd = __va(nd_pa);
 
-- 
cgit v1.1


From 16824255394f55adf31b9a96a9965d8c15bdac4c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 12 Dec 2013 15:08:36 +0100
Subject: x86, acpi, idle: Restructure the mwait idle routines

People seem to delight in writing wrong and broken mwait idle routines;
collapse the lot.

This leaves mwait_play_dead() the sole remaining user of __mwait() and
new __mwait() users are probably doing it wrong.

Also remove __sti_mwait() as its unused.

Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Jacob Jun Pan <jacob.jun.pan@linux.intel.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Len Brown <lenb@kernel.org>
Cc: Rui Zhang <rui.zhang@intel.com>
Acked-by: Rafael Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20131212141654.616820819@infradead.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/mwait.h     | 40 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/processor.h | 23 -----------------------
 arch/x86/kernel/acpi/cstate.c    | 23 -----------------------
 3 files changed, 40 insertions(+), 46 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 2f366d0..361b02e 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_MWAIT_H
 #define _ASM_X86_MWAIT_H
 
+#include <linux/sched.h>
+
 #define MWAIT_SUBSTATE_MASK		0xf
 #define MWAIT_CSTATE_MASK		0xf
 #define MWAIT_SUBSTATE_SIZE		4
@@ -13,4 +15,42 @@
 
 #define MWAIT_ECX_INTERRUPT_BREAK	0x1
 
+static inline void __monitor(const void *eax, unsigned long ecx,
+			     unsigned long edx)
+{
+	/* "monitor %eax, %ecx, %edx;" */
+	asm volatile(".byte 0x0f, 0x01, 0xc8;"
+		     :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void __mwait(unsigned long eax, unsigned long ecx)
+{
+	/* "mwait %eax, %ecx;" */
+	asm volatile(".byte 0x0f, 0x01, 0xc9;"
+		     :: "a" (eax), "c" (ecx));
+}
+
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+{
+	if (!current_set_polling_and_test()) {
+		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
+			clflush((void *)&current_thread_info()->flags);
+
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		if (!need_resched())
+			__mwait(eax, ecx);
+	}
+	__current_clr_polling();
+}
+
 #endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7b034a4..24821f5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -700,29 +700,6 @@ static inline void sync_core(void)
 #endif
 }
 
-static inline void __monitor(const void *eax, unsigned long ecx,
-			     unsigned long edx)
-{
-	/* "monitor %eax, %ecx, %edx;" */
-	asm volatile(".byte 0x0f, 0x01, 0xc8;"
-		     :: "a" (eax), "c" (ecx), "d"(edx));
-}
-
-static inline void __mwait(unsigned long eax, unsigned long ecx)
-{
-	/* "mwait %eax, %ecx;" */
-	asm volatile(".byte 0x0f, 0x01, 0xc9;"
-		     :: "a" (eax), "c" (ecx));
-}
-
-static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
-{
-	trace_hardirqs_on();
-	/* "mwait %eax, %ecx;" */
-	asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
-		     :: "a" (eax), "c" (ecx));
-}
-
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 extern void init_amd_e400_c1e_mask(void);
 
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index d2b7f27..e69182f 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 }
 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
 
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
-	if (!need_resched()) {
-		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
-			clflush((void *)&current_thread_info()->flags);
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
-		if (!need_resched())
-			__mwait(ax, cx);
-	}
-}
-
 void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
 {
 	unsigned int cpu = smp_processor_id();
-- 
cgit v1.1


From 7e98b71920464b8d15fa95c74366416cd3c88861 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Thu, 19 Dec 2013 11:58:16 -0800
Subject: x86, idle: Use static_cpu_has() for CLFLUSH workaround, add barriers

Use static_cpu_has() to conditionalize the CLFLUSH workaround, and add
memory barriers around it since the documentation is explicit that
CLFLUSH is only ordered with respect to MFENCE.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Len Brown <len.brown@intel.com>
Link: http://lkml.kernel.org/r/CA%2B55aFzGxcML7j8CEvQPYzh0W81uVoAAVmGctMOUZ7CZ1yYd2A@mail.gmail.com
---
 arch/x86/include/asm/mwait.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 361b02e..19b71c4 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -43,8 +43,11 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
 static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 {
 	if (!current_set_polling_and_test()) {
-		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
+		if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
+			mb();
 			clflush((void *)&current_thread_info()->flags);
+			mb();
+		}
 
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		if (!need_resched())
-- 
cgit v1.1


From 7d590cca7cd2cce4ed7c47d221d6f90566653ba8 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Thu, 19 Dec 2013 12:30:03 -0800
Subject: x86, idle: Add memory barriers around clflush in mwait_play_dead()

For consistency with mwait_idle_with_hints().  Not sure they help, but
they really won't hurt...

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Len Brown <len.brown@intel.com>
Link: http://lkml.kernel.org/r/CA%2B55aFzGxcML7j8CEvQPYzh0W81uVoAAVmGctMOUZ7CZ1yYd2A@mail.gmail.com
---
 arch/x86/kernel/smpboot.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 85dc05a..f5252c4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1417,7 +1417,9 @@ static inline void mwait_play_dead(void)
 		 * The WBINVD is insufficient due to the spurious-wakeup
 		 * case where we return around the loop.
 		 */
+		mb();
 		clflush(mwait_ptr);
+		mb();
 		__monitor(mwait_ptr, 0, 0);
 		mb();
 		__mwait(eax, 0);
-- 
cgit v1.1


From 19952a92037e752f9d3bbbad552d596f9a56e146 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 19 Dec 2013 11:35:58 -0800
Subject: stackprotector: Unify the HAVE_CC_STACKPROTECTOR logic between
 architectures

Instead of duplicating the CC_STACKPROTECTOR Kconfig and
Makefile logic in each architecture, switch to using
HAVE_CC_STACKPROTECTOR and keep everything in one place. This
retains the x86-specific bug verification scripts.

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Michal Marek <mmarek@suse.cz>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Shawn Guo <shawn.guo@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-mips@linux-mips.org
Cc: linux-arch@vger.kernel.org
Link: http://lkml.kernel.org/r/1387481759-14535-2-git-send-email-keescook@chromium.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig  | 17 +----------------
 arch/x86/Makefile |  8 +++-----
 2 files changed, 4 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0952ecd..838e7c3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -125,6 +125,7 @@ config X86
 	select RTC_LIB
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
+	select HAVE_CC_STACKPROTECTOR
 
 config INSTRUCTION_DECODER
 	def_bool y
@@ -1617,22 +1618,6 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
-config CC_STACKPROTECTOR
-	bool "Enable -fstack-protector buffer overflow detection"
-	---help---
-	  This option turns on the -fstack-protector GCC feature. This
-	  feature puts, at the beginning of functions, a canary value on
-	  the stack just before the return address, and validates
-	  the value just before actually returning.  Stack based buffer
-	  overflows (that need to overwrite this return address) now also
-	  overwrite the canary, which gets detected and the attack is then
-	  neutralized via a kernel panic.
-
-	  This feature requires gcc version 4.2 or above, or a distribution
-	  gcc with the feature backported. Older versions are automatically
-	  detected and for those versions, this configuration option is
-	  ignored. (and a warning is printed during bootup)
-
 source kernel/Kconfig.hz
 
 config KEXEC
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 57d0215..13b22e0 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -89,13 +89,11 @@ else
         KBUILD_CFLAGS += -maccumulate-outgoing-args
 endif
 
+# Make sure compiler does not have buggy stack-protector support.
 ifdef CONFIG_CC_STACKPROTECTOR
 	cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
-        ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y)
-                stackp-y := -fstack-protector
-                KBUILD_CFLAGS += $(stackp-y)
-        else
-                $(warning stack protector enabled but no compiler support)
+        ifneq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y)
+                $(warning stack-protector enabled but compiler support broken)
         endif
 endif
 
-- 
cgit v1.1


From b70fedc15892de8bc78f711d7821dd0916cc5508 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 16 Dec 2013 16:09:51 -0800
Subject: x86, x32: Use __kernel_long_t/__kernel_ulong_t in x86-64 stat.h

Both x32 and x86-64 use the same stat system call interface.  But x32
long is 32-bit.  This patch changes x86 uapi <asm/stat.h> to use
 __kernel_long_t/__kernel_ulong_t in x86-64 stat.

Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
Link: http://lkml.kernel.org/r/CAMe9rOquPtWEro0GQ=Z95pZJ=c7GGkSHynjN4FbiB4p445x-Ng@mail.gmail.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/uapi/asm/stat.h | 42 +++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/uapi/asm/stat.h b/arch/x86/include/uapi/asm/stat.h
index 7b3ddc3..bc03eb5 100644
--- a/arch/x86/include/uapi/asm/stat.h
+++ b/arch/x86/include/uapi/asm/stat.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_STAT_H
 #define _ASM_X86_STAT_H
 
+#include <asm/posix_types.h>
+
 #define STAT_HAVE_NSEC 1
 
 #ifdef __i386__
@@ -78,26 +80,26 @@ struct stat64 {
 #else /* __i386__ */
 
 struct stat {
-	unsigned long	st_dev;
-	unsigned long	st_ino;
-	unsigned long	st_nlink;
-
-	unsigned int	st_mode;
-	unsigned int	st_uid;
-	unsigned int	st_gid;
-	unsigned int	__pad0;
-	unsigned long	st_rdev;
-	long		st_size;
-	long		st_blksize;
-	long		st_blocks;	/* Number 512-byte blocks allocated. */
-
-	unsigned long	st_atime;
-	unsigned long	st_atime_nsec;
-	unsigned long	st_mtime;
-	unsigned long	st_mtime_nsec;
-	unsigned long	st_ctime;
-	unsigned long   st_ctime_nsec;
-	long		__unused[3];
+	__kernel_ulong_t	st_dev;
+	__kernel_ulong_t	st_ino;
+	__kernel_ulong_t	st_nlink;
+
+	unsigned int		st_mode;
+	unsigned int		st_uid;
+	unsigned int		st_gid;
+	unsigned int		__pad0;
+	__kernel_ulong_t	st_rdev;
+	__kernel_long_t		st_size;
+	__kernel_long_t		st_blksize;
+	__kernel_long_t		st_blocks;	/* Number 512-byte blocks allocated. */
+
+	__kernel_ulong_t	st_atime;
+	__kernel_ulong_t	st_atime_nsec;
+	__kernel_ulong_t	st_mtime;
+	__kernel_ulong_t	st_mtime_nsec;
+	__kernel_ulong_t	st_ctime;
+	__kernel_ulong_t	st_ctime_nsec;
+	__kernel_long_t		__unused[3];
 };
 
 /* We don't need to memset the whole thing just to initialize the padding */
-- 
cgit v1.1


From addccbb264e5e0e5762f4893f6df24afad327c8c Mon Sep 17 00:00:00 2001
From: "Chen, Gong" <gong.chen@linux.intel.com>
Date: Mon, 25 Nov 2013 02:15:00 -0500
Subject: ACPI, APEI, GHES: Do not report only correctable errors with SCI

Currently SCI is employed to handle corrected errors - memory corrected
errors, more specifically but in fact SCI still can be used to handle
any errors, e.g. uncorrected or even fatal ones if enabled by the BIOS.
Enable logging for those kinds of errors too.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/1385363701-12387-1-git-send-email-gong.chen@linux.intel.com
[ Boris: massage commit message, rename function arg. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/mcheck/mce-apei.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index de8b60a..a1aef95 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -33,22 +33,28 @@
 #include <linux/acpi.h>
 #include <linux/cper.h>
 #include <acpi/apei.h>
+#include <acpi/ghes.h>
 #include <asm/mce.h>
 
 #include "mce-internal.h"
 
-void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
+void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
 {
 	struct mce m;
 
-	/* Only corrected MC is reported */
-	if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA))
+	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
 		return;
 
 	mce_setup(&m);
 	m.bank = 1;
-	/* Fake a memory read corrected error with unknown channel */
+	/* Fake a memory read error with unknown channel */
 	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+
+	if (severity >= GHES_SEV_RECOVERABLE)
+		m.status |= MCI_STATUS_UC;
+	if (severity >= GHES_SEV_PANIC)
+		m.status |= MCI_STATUS_PCC;
+
 	m.addr = mem_err->physical_addr;
 	mce_log(&m);
 	mce_notify_irq();
-- 
cgit v1.1


From 2da6e57cce14a1c3b0692d6f877b72e185110e2e Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 20 Dec 2013 18:02:13 +0800
Subject: x86/efi: Remove unused variables in __map_region()

variables size and end is useless in this function, thus remove them.

Reported-by: Toshi Kani <toshi.kani@hp.com>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Dave Young <dyoung@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/platform/efi/efi_64.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index bf286c3..c5a6491 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -148,15 +148,11 @@ void efi_setup_page_tables(void)
 static void __init __map_region(efi_memory_desc_t *md, u64 va)
 {
 	pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
-	unsigned long pf = 0, size;
-	u64 end;
+	unsigned long pf = 0;
 
 	if (!(md->attribute & EFI_MEMORY_WB))
 		pf |= _PAGE_PCD;
 
-	size = md->num_pages << PAGE_SHIFT;
-	end  = va + size;
-
 	if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf))
 		pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
 			   md->phys_addr, va);
-- 
cgit v1.1


From 3b2664964bc886ae9d5127c8d3708b1acc0626d2 Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 20 Dec 2013 18:02:14 +0800
Subject: x86/efi: Add a wrapper function efi_map_region_fixed()

Kexec kernel will use saved runtime virtual mapping, so add a new
function efi_map_region_fixed() for directly mapping a md to md->virt.

The md is passed in from 1st kernel, the virtual addr is saved in
md->virt_addr.

Signed-off-by: Dave Young <dyoung@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/include/asm/efi.h     |  1 +
 arch/x86/platform/efi/efi_32.c |  2 ++
 arch/x86/platform/efi/efi_64.c | 10 ++++++++++
 3 files changed, 13 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 89a05b0..9fbaeb2 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -128,6 +128,7 @@ extern void efi_call_phys_epilog(void);
 extern void efi_unmap_memmap(void);
 extern void efi_memory_uc(u64 addr, unsigned long size);
 extern void __init efi_map_region(efi_memory_desc_t *md);
+extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
 extern void efi_sync_low_kernel_mappings(void);
 extern void efi_setup_page_tables(void);
 extern void __init old_map_region(efi_memory_desc_t *md);
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index e94557c..7b3ec6e 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -47,6 +47,8 @@ void __init efi_map_region(efi_memory_desc_t *md)
 	old_map_region(md);
 }
 
+void __init efi_map_region_fixed(efi_memory_desc_t *md) {}
+
 void efi_call_phys_prelog(void)
 {
 	struct desc_ptr gdt_descr;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index c5a6491..ff08cb1 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -199,6 +199,16 @@ void __init efi_map_region(efi_memory_desc_t *md)
 	md->virt_addr = efi_va;
 }
 
+/*
+ * kexec kernel will use efi_map_region_fixed to map efi runtime memory ranges.
+ * md->virt_addr is the original virtual address which had been mapped in kexec
+ * 1st kernel.
+ */
+void __init efi_map_region_fixed(efi_memory_desc_t *md)
+{
+	__map_region(md, md->virt_addr);
+}
+
 void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
 				 u32 type, u64 attribute)
 {
-- 
cgit v1.1


From a7f84f03f660d93574ac88835d056c0d6468aebe Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 20 Dec 2013 18:02:15 +0800
Subject: x86/efi: Fix off-by-one bug in EFI Boot Services reservation

Current code check boot service region with kernel text region by:
start+size >= __pa_symbol(_text)
The end of the above region should be start + size - 1 instead.

I see this problem in ovmf + Fedora 19 grub boot:
text start: 1000000 md start: 800000 md size: 800000

Signed-off-by: Dave Young <dyoung@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Cc: stable@vger.kernel.org
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/platform/efi/efi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index f8ec4da..15e3b5e 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -438,7 +438,7 @@ void __init efi_reserve_boot_services(void)
 		 * - Not within any part of the kernel
 		 * - Not the bios reserved area
 		*/
-		if ((start+size >= __pa_symbol(_text)
+		if ((start + size > __pa_symbol(_text)
 				&& start <= __pa_symbol(_end)) ||
 			!e820_all_mapped(start, start+size, E820_RAM) ||
 			memblock_is_region_reserved(start, size)) {
-- 
cgit v1.1


From 481f75c043cf44ec11c7fbdbbf37d43463f1e719 Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 20 Dec 2013 18:02:16 +0800
Subject: x86/efi: Cleanup efi_enter_virtual_mode() function

Add two small functions:
efi_merge_regions() and efi_map_regions(), efi_enter_virtual_mode()
calls them instead of embedding two long for loop.

Signed-off-by: Dave Young <dyoung@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/platform/efi/efi.c | 132 ++++++++++++++++++++++++++------------------
 1 file changed, 79 insertions(+), 53 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 15e3b5e..4694632 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -773,44 +773,12 @@ void __init old_map_region(efi_memory_desc_t *md)
 		       (unsigned long long)md->phys_addr);
 }
 
-/*
- * This function will switch the EFI runtime services to virtual mode.
- * Essentially, we look through the EFI memmap and map every region that
- * has the runtime attribute bit set in its memory descriptor into the
- * ->trampoline_pgd page table using a top-down VA allocation scheme.
- *
- * The old method which used to update that memory descriptor with the
- * virtual address obtained from ioremap() is still supported when the
- * kernel is booted with efi=old_map on its command line. Same old
- * method enabled the runtime services to be called without having to
- * thunk back into physical mode for every invocation.
- *
- * The new method does a pagetable switch in a preemption-safe manner
- * so that we're in a different address space when calling a runtime
- * function. For function arguments passing we do copy the PGDs of the
- * kernel page table into ->trampoline_pgd prior to each call.
- */
-void __init efi_enter_virtual_mode(void)
+/* Merge contiguous regions of the same type and attribute */
+static void __init efi_merge_regions(void)
 {
+	void *p;
 	efi_memory_desc_t *md, *prev_md = NULL;
-	void *p, *new_memmap = NULL;
-	unsigned long size;
-	efi_status_t status;
-	u64 end, systab;
-	int count = 0;
-
-	efi.systab = NULL;
 
-	/*
-	 * We don't do virtual mode, since we don't do runtime services, on
-	 * non-native EFI
-	 */
-	if (!efi_is_native()) {
-		efi_unmap_memmap();
-		return;
-	}
-
-	/* Merge contiguous regions of the same type and attribute */
 	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
 		u64 prev_size;
 		md = p;
@@ -835,8 +803,31 @@ void __init efi_enter_virtual_mode(void)
 			continue;
 		}
 		prev_md = md;
+	}
+}
+
+static void __init get_systab_virt_addr(efi_memory_desc_t *md)
+{
+	unsigned long size;
+	u64 end, systab;
 
+	size = md->num_pages << EFI_PAGE_SHIFT;
+	end = md->phys_addr + size;
+	systab = (u64)(unsigned long)efi_phys.systab;
+	if (md->phys_addr <= systab && systab < end) {
+		systab += md->virt_addr - md->phys_addr;
+		efi.systab = (efi_system_table_t *)(unsigned long)systab;
 	}
+}
+
+/*
+ * Map efi memory ranges for runtime serivce and update new_memmap with virtual
+ * addresses.
+ */
+static void * __init efi_map_regions(int *count)
+{
+	efi_memory_desc_t *md;
+	void *p, *tmp, *new_memmap = NULL;
 
 	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
 		md = p;
@@ -849,26 +840,64 @@ void __init efi_enter_virtual_mode(void)
 		}
 
 		efi_map_region(md);
+		get_systab_virt_addr(md);
+
+		tmp = krealloc(new_memmap, (*count + 1) * memmap.desc_size,
+			       GFP_KERNEL);
+		if (!tmp)
+			goto out_krealloc;
+		new_memmap = tmp;
+		memcpy(new_memmap + (*count * memmap.desc_size), md,
+		       memmap.desc_size);
+		(*count)++;
+	}
 
-		size = md->num_pages << EFI_PAGE_SHIFT;
-		end = md->phys_addr + size;
+	return new_memmap;
+out_krealloc:
+	kfree(new_memmap);
+	return NULL;
+}
+
+/*
+ * This function will switch the EFI runtime services to virtual mode.
+ * Essentially, we look through the EFI memmap and map every region that
+ * has the runtime attribute bit set in its memory descriptor into the
+ * ->trampoline_pgd page table using a top-down VA allocation scheme.
+ *
+ * The old method which used to update that memory descriptor with the
+ * virtual address obtained from ioremap() is still supported when the
+ * kernel is booted with efi=old_map on its command line. Same old
+ * method enabled the runtime services to be called without having to
+ * thunk back into physical mode for every invocation.
+ *
+ * The new method does a pagetable switch in a preemption-safe manner
+ * so that we're in a different address space when calling a runtime
+ * function. For function arguments passing we do copy the PGDs of the
+ * kernel page table into ->trampoline_pgd prior to each call.
+ */
+void __init efi_enter_virtual_mode(void)
+{
+	efi_status_t status;
+	void *new_memmap = NULL;
+	int count = 0;
 
-		systab = (u64) (unsigned long) efi_phys.systab;
-		if (md->phys_addr <= systab && systab < end) {
-			systab += md->virt_addr - md->phys_addr;
+	efi.systab = NULL;
 
-			efi.systab = (efi_system_table_t *) (unsigned long) systab;
-		}
+	/*
+	 * We don't do virtual mode, since we don't do runtime services, on
+	 * non-native EFI
+	 */
+	if (!efi_is_native()) {
+		efi_unmap_memmap();
+		return;
+	}
 
-		new_memmap = krealloc(new_memmap,
-				      (count + 1) * memmap.desc_size,
-				      GFP_KERNEL);
-		if (!new_memmap)
-			goto err_out;
+	efi_merge_regions();
 
-		memcpy(new_memmap + (count * memmap.desc_size), md,
-		       memmap.desc_size);
-		count++;
+	new_memmap = efi_map_regions(&count);
+	if (!new_memmap) {
+		pr_err("Error reallocating memory, EFI runtime non-functional!\n");
+		return;
 	}
 
 	BUG_ON(!efi.systab);
@@ -922,9 +951,6 @@ void __init efi_enter_virtual_mode(void)
 			 0, NULL);
 
 	return;
-
- err_out:
-	pr_err("Error reallocating memory, EFI runtime non-functional!\n");
 }
 
 /*
-- 
cgit v1.1


From a0998eb15afeffbf52a2c2829318f67df9ac57b8 Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 20 Dec 2013 18:02:17 +0800
Subject: efi: Export more EFI table variables to sysfs

Export fw_vendor, runtime and config table physical addresses to
/sys/firmware/efi/{fw_vendor,runtime,config_table} because kexec kernels
need them.

From EFI spec these 3 variables will be updated to virtual address after
entering virtual mode. But kernel startup code will need the physical
address.

Signed-off-by: Dave Young <dyoung@redhat.com>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/platform/efi/efi.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 4694632..2859107 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -653,6 +653,10 @@ void __init efi_init(void)
 
 	set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility);
 
+	efi.config_table = (unsigned long)efi.systab->tables;
+	efi.fw_vendor	 = (unsigned long)efi.systab->fw_vendor;
+	efi.runtime	 = (unsigned long)efi.systab->runtime;
+
 	/*
 	 * Show what we know for posterity
 	 */
-- 
cgit v1.1


From 926172d46038d7610b6b8d84e40db727cefb482d Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 20 Dec 2013 18:02:18 +0800
Subject: efi: Export EFI runtime memory mapping to sysfs

kexec kernel will need exactly same mapping for EFI runtime memory
ranges. Thus here export the runtime ranges mapping to sysfs,
kexec-tools will assemble them and pass to 2nd kernel via setup_data.

Introducing a new directory /sys/firmware/efi/runtime-map just like
/sys/firmware/memmap. Containing below attribute in each file of that
directory:

attribute  num_pages  phys_addr  type  virt_addr

Signed-off-by: Dave Young <dyoung@redhat.com>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/platform/efi/efi.c | 46 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 2859107..74fe7a7 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -76,6 +76,9 @@ static __initdata efi_config_table_type_t arch_tables[] = {
 	{NULL_GUID, NULL, NULL},
 };
 
+static void *efi_runtime_map;
+static int nr_efi_runtime_map;
+
 /*
  * Returns 1 if 'facility' is enabled, 0 otherwise.
  */
@@ -824,6 +827,39 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
 	}
 }
 
+static int __init save_runtime_map(void)
+{
+	efi_memory_desc_t *md;
+	void *tmp, *p, *q = NULL;
+	int count = 0;
+
+	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+		md = p;
+
+		if (!(md->attribute & EFI_MEMORY_RUNTIME) ||
+		    (md->type == EFI_BOOT_SERVICES_CODE) ||
+		    (md->type == EFI_BOOT_SERVICES_DATA))
+			continue;
+		tmp = krealloc(q, (count + 1) * memmap.desc_size, GFP_KERNEL);
+		if (!tmp)
+			goto out;
+		q = tmp;
+
+		memcpy(q + count * memmap.desc_size, md, memmap.desc_size);
+		count++;
+	}
+
+	efi_runtime_map = q;
+	nr_efi_runtime_map = count;
+	efi_runtime_map_setup(efi_runtime_map, nr_efi_runtime_map,
+			      boot_params.efi_info.efi_memdesc_size);
+
+	return 0;
+out:
+	kfree(q);
+	return -ENOMEM;
+}
+
 /*
  * Map efi memory ranges for runtime serivce and update new_memmap with virtual
  * addresses.
@@ -849,7 +885,7 @@ static void * __init efi_map_regions(int *count)
 		tmp = krealloc(new_memmap, (*count + 1) * memmap.desc_size,
 			       GFP_KERNEL);
 		if (!tmp)
-			goto out_krealloc;
+			goto out;
 		new_memmap = tmp;
 		memcpy(new_memmap + (*count * memmap.desc_size), md,
 		       memmap.desc_size);
@@ -857,7 +893,7 @@ static void * __init efi_map_regions(int *count)
 	}
 
 	return new_memmap;
-out_krealloc:
+out:
 	kfree(new_memmap);
 	return NULL;
 }
@@ -883,7 +919,7 @@ void __init efi_enter_virtual_mode(void)
 {
 	efi_status_t status;
 	void *new_memmap = NULL;
-	int count = 0;
+	int err, count = 0;
 
 	efi.systab = NULL;
 
@@ -904,6 +940,10 @@ void __init efi_enter_virtual_mode(void)
 		return;
 	}
 
+	err = save_runtime_map();
+	if (err)
+		pr_err("Error saving runtime map, efi runtime on kexec non-functional!!\n");
+
 	BUG_ON(!efi.systab);
 
 	efi_setup_page_tables();
-- 
cgit v1.1


From c5fe5d80680e2949ffe102180f5fc6cefc0d145f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 27 Dec 2013 15:30:58 -0800
Subject: x86: Replace assembly access_ok() with a C variant

It turns out that the assembly variant doesn't actually produce that
good code, presumably partly because it creates a long dependency
chain with no scheduling, and partly because we cannot get a flags
result out of gcc (which could be fixed with asm goto, but it turns
out not to be worth it.)

The C code allows gcc to schedule and generate multiple (easily
predictable) branches, and as a side benefit we can really optimize
the case where the size is constant.

Link: http://lkml.kernel.org/r/CA%2B55aFzPBdbfKovMT8Edr4SmE2_=%2BOKJFac9XW2awegogTkVTA@mail.gmail.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/uaccess.h | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 8ec57c0..84ecf1d 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -40,22 +40,28 @@
 /*
  * Test whether a block of memory is a valid user space address.
  * Returns 0 if the range is valid, nonzero otherwise.
- *
- * This is equivalent to the following test:
- * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
- *
- * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
  */
+static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
+{
+	/*
+	 * If we have used "sizeof()" for the size,
+	 * we know it won't overflow the limit (but
+	 * it might overflow the 'addr', so it's
+	 * important to subtract the size from the
+	 * limit, not add it to the address).
+	 */
+	if (__builtin_constant_p(size))
+		return addr > limit - size;
+
+	/* Arbitrary sizes? Be careful about overflow */
+	addr += size;
+	return (addr < size) || (addr > limit);
+}
 
 #define __range_not_ok(addr, size, limit)				\
 ({									\
-	unsigned long flag, roksum;					\
 	__chk_user_ptr(addr);						\
-	asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0"		\
-	    : "=&r" (flag), "=r" (roksum)				\
-	    : "1" (addr), "g" ((long)(size)),				\
-	      "rm" (limit));						\
-	flag;								\
+	__chk_range_not_ok((unsigned long __force)(addr), size, limit); \
 })
 
 /**
-- 
cgit v1.1


From a740576a4abf933de8f50787f24f24456cebd761 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 27 Dec 2013 16:52:47 -0800
Subject: x86: Slightly tweak the access_ok() C variant for better code

gcc can under very specific circumstances realize that the code
sequence:

	foo += bar;
	if (foo < bar) ...

... is equivalent to a carry out from the addition.  Tweak the
implementation of access_ok() (specifically __chk_range_not_ok()) to
make it more likely that gcc will make that connection.  It isn't
fool-proof (sometimes gcc seems to think it can make better code with
lea, and ends up with a second comparison), still, but it seems to be
able to connect the two more frequently this way.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/CA%2B55aFzPBdbfKovMT8Edr4SmE2_=%2BOKJFac9XW2awegogTkVTA@mail.gmail.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/uaccess.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 84ecf1d..6f1bb74 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -41,7 +41,7 @@
  * Test whether a block of memory is a valid user space address.
  * Returns 0 if the range is valid, nonzero otherwise.
  */
-static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
+static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
 {
 	/*
 	 * If we have used "sizeof()" for the size,
@@ -55,7 +55,9 @@ static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, uns
 
 	/* Arbitrary sizes? Be careful about overflow */
 	addr += size;
-	return (addr < size) || (addr > limit);
+	if (addr < size)
+		return true;
+	return addr > limit;
 }
 
 #define __range_not_ok(addr, size, limit)				\
@@ -84,7 +86,7 @@ static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, uns
  * this function, memory access functions may still return -EFAULT.
  */
 #define access_ok(type, addr, size) \
-	(likely(__range_not_ok(addr, size, user_addr_max()) == 0))
+	likely(!__range_not_ok(addr, size, user_addr_max()))
 
 /*
  * The exception table consists of pairs of addresses relative to the
-- 
cgit v1.1


From 1fec0533693cd74f2d1a46edd29449cfee429df0 Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 20 Dec 2013 18:02:19 +0800
Subject: x86/efi: Pass necessary EFI data for kexec via setup_data

Add a new setup_data type SETUP_EFI for kexec use.  Passing the saved
fw_vendor, runtime, config tables and EFI runtime mappings.

When entering virtual mode, directly mapping the EFI runtime regions
which we passed in previously. And skip the step to call
SetVirtualAddressMap().

Specially for HP z420 workstation we need save the smbios physical
address.  The kernel boot sequence proceeds in the following order.
Step 2 requires efi.smbios to be the physical address.  However, I found
that on HP z420 EFI system table has a virtual address of SMBIOS in step
1.  Hence, we need set it back to the physical address with the smbios
in efi_setup_data.  (When it is still the physical address, it simply
sets the same value.)

1. efi_init() - Set efi.smbios from EFI system table
2. dmi_scan_machine() - Temporary map efi.smbios to access SMBIOS table
3. efi_enter_virtual_mode() - Map EFI ranges

Tested on ovmf+qemu, lenovo thinkpad, a dell laptop and an
HP z420 workstation.

Signed-off-by: Dave Young <dyoung@redhat.com>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/include/asm/efi.h            |  12 +++
 arch/x86/include/uapi/asm/bootparam.h |   1 +
 arch/x86/kernel/setup.c               |   3 +
 arch/x86/platform/efi/efi.c           | 158 ++++++++++++++++++++++++++++------
 arch/x86/platform/efi/efi_32.c        |   1 +
 arch/x86/platform/efi/efi_64.c        |   6 ++
 6 files changed, 156 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 9fbaeb2..4d1ba80 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -133,6 +133,18 @@ extern void efi_sync_low_kernel_mappings(void);
 extern void efi_setup_page_tables(void);
 extern void __init old_map_region(efi_memory_desc_t *md);
 
+struct efi_setup_data {
+	u64 fw_vendor;
+	u64 runtime;
+	u64 tables;
+	u64 smbios;
+	u64 reserved[8];
+};
+
+extern u64 efi_setup;
+extern u32 efi_data_len;
+extern void parse_efi_setup(u64 phys_addr, u32 data_len);
+
 #ifdef CONFIG_EFI
 
 static inline bool efi_is_native(void)
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 9c3733c..64fe421 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -6,6 +6,7 @@
 #define SETUP_E820_EXT			1
 #define SETUP_DTB			2
 #define SETUP_PCI			3
+#define SETUP_EFI			4
 
 /* ram_size flags */
 #define RAMDISK_IMAGE_START_MASK	0x07FF
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cb233bc..24536f7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -447,6 +447,9 @@ static void __init parse_setup_data(void)
 		case SETUP_DTB:
 			add_dtb(pa_data);
 			break;
+		case SETUP_EFI:
+			parse_efi_setup(pa_data, data_len);
+			break;
 		default:
 			break;
 		}
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 74fe7a7..9965ff4 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -78,6 +78,8 @@ static __initdata efi_config_table_type_t arch_tables[] = {
 
 static void *efi_runtime_map;
 static int nr_efi_runtime_map;
+u64 efi_setup;		/* efi setup_data physical address */
+u32 efi_data_len;	/* efi setup_data payload length */
 
 /*
  * Returns 1 if 'facility' is enabled, 0 otherwise.
@@ -115,7 +117,6 @@ static int __init setup_storage_paranoia(char *arg)
 }
 early_param("efi_no_storage_paranoia", setup_storage_paranoia);
 
-
 static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
 {
 	unsigned long flags;
@@ -494,18 +495,27 @@ static int __init efi_systab_init(void *phys)
 {
 	if (efi_enabled(EFI_64BIT)) {
 		efi_system_table_64_t *systab64;
+		struct efi_setup_data *data = NULL;
 		u64 tmp = 0;
 
+		if (efi_setup) {
+			data = early_memremap(efi_setup, sizeof(*data));
+			if (!data)
+				return -ENOMEM;
+		}
 		systab64 = early_ioremap((unsigned long)phys,
 					 sizeof(*systab64));
 		if (systab64 == NULL) {
 			pr_err("Couldn't map the system table!\n");
+			if (data)
+				early_iounmap(data, sizeof(*data));
 			return -ENOMEM;
 		}
 
 		efi_systab.hdr = systab64->hdr;
-		efi_systab.fw_vendor = systab64->fw_vendor;
-		tmp |= systab64->fw_vendor;
+		efi_systab.fw_vendor = data ? (unsigned long)data->fw_vendor :
+					      systab64->fw_vendor;
+		tmp |= data ? data->fw_vendor : systab64->fw_vendor;
 		efi_systab.fw_revision = systab64->fw_revision;
 		efi_systab.con_in_handle = systab64->con_in_handle;
 		tmp |= systab64->con_in_handle;
@@ -519,15 +529,20 @@ static int __init efi_systab_init(void *phys)
 		tmp |= systab64->stderr_handle;
 		efi_systab.stderr = systab64->stderr;
 		tmp |= systab64->stderr;
-		efi_systab.runtime = (void *)(unsigned long)systab64->runtime;
-		tmp |= systab64->runtime;
+		efi_systab.runtime = data ?
+				     (void *)(unsigned long)data->runtime :
+				     (void *)(unsigned long)systab64->runtime;
+		tmp |= data ? data->runtime : systab64->runtime;
 		efi_systab.boottime = (void *)(unsigned long)systab64->boottime;
 		tmp |= systab64->boottime;
 		efi_systab.nr_tables = systab64->nr_tables;
-		efi_systab.tables = systab64->tables;
-		tmp |= systab64->tables;
+		efi_systab.tables = data ? (unsigned long)data->tables :
+					   systab64->tables;
+		tmp |= data ? data->tables : systab64->tables;
 
 		early_iounmap(systab64, sizeof(*systab64));
+		if (data)
+			early_iounmap(data, sizeof(*data));
 #ifdef CONFIG_X86_32
 		if (tmp >> 32) {
 			pr_err("EFI data located above 4GB, disabling EFI.\n");
@@ -631,6 +646,71 @@ static int __init efi_memmap_init(void)
 	return 0;
 }
 
+/*
+ * A number of config table entries get remapped to virtual addresses
+ * after entering EFI virtual mode. However, the kexec kernel requires
+ * their physical addresses therefore we pass them via setup_data and
+ * correct those entries to their respective physical addresses here.
+ *
+ * Currently only handles smbios which is necessary for some firmware
+ * implementation.
+ */
+static int __init efi_reuse_config(u64 tables, int nr_tables)
+{
+	int i, sz, ret = 0;
+	void *p, *tablep;
+	struct efi_setup_data *data;
+
+	if (!efi_setup)
+		return 0;
+
+	if (!efi_enabled(EFI_64BIT))
+		return 0;
+
+	data = early_memremap(efi_setup, sizeof(*data));
+	if (!data) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (!data->smbios)
+		goto out_memremap;
+
+	sz = sizeof(efi_config_table_64_t);
+
+	p = tablep = early_memremap(tables, nr_tables * sz);
+	if (!p) {
+		pr_err("Could not map Configuration table!\n");
+		ret = -ENOMEM;
+		goto out_memremap;
+	}
+
+	for (i = 0; i < efi.systab->nr_tables; i++) {
+		efi_guid_t guid;
+
+		guid = ((efi_config_table_64_t *)p)->guid;
+
+		if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID))
+			((efi_config_table_64_t *)p)->table = data->smbios;
+		p += sz;
+	}
+	early_iounmap(tablep, nr_tables * sz);
+
+out_memremap:
+	early_iounmap(data, sizeof(*data));
+out:
+	return ret;
+}
+
+static void get_nr_runtime_map(void)
+{
+	if (!efi_setup)
+		return;
+
+	nr_efi_runtime_map = (efi_data_len - sizeof(struct efi_setup_data)) /
+			     sizeof(efi_memory_desc_t);
+}
+
 void __init efi_init(void)
 {
 	efi_char16_t *c16;
@@ -638,6 +718,7 @@ void __init efi_init(void)
 	int i = 0;
 	void *tmp;
 
+	get_nr_runtime_map();
 #ifdef CONFIG_X86_32
 	if (boot_params.efi_info.efi_systab_hi ||
 	    boot_params.efi_info.efi_memmap_hi) {
@@ -676,6 +757,9 @@ void __init efi_init(void)
 		efi.systab->hdr.revision >> 16,
 		efi.systab->hdr.revision & 0xffff, vendor);
 
+	if (efi_reuse_config(efi.systab->tables, efi.systab->nr_tables))
+		return;
+
 	if (efi_config_init(arch_tables))
 		return;
 
@@ -861,6 +945,23 @@ out:
 }
 
 /*
+ * Map efi regions which were passed via setup_data. The virt_addr is a fixed
+ * addr which was used in first kernel of a kexec boot.
+ */
+static void __init efi_map_regions_fixed(void)
+{
+	void *p;
+	efi_memory_desc_t *md;
+
+	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+		md = p;
+		efi_map_region_fixed(md); /* FIXME: add error handling */
+		get_systab_virt_addr(md);
+	}
+
+}
+
+/*
  * Map efi memory ranges for runtime serivce and update new_memmap with virtual
  * addresses.
  */
@@ -914,6 +1015,10 @@ out:
  * so that we're in a different address space when calling a runtime
  * function. For function arguments passing we do copy the PGDs of the
  * kernel page table into ->trampoline_pgd prior to each call.
+ *
+ * Specially for kexec boot, efi runtime maps in previous kernel should
+ * be passed in via setup_data. In that case runtime ranges will be mapped
+ * to the same virtual addresses as the first kernel.
  */
 void __init efi_enter_virtual_mode(void)
 {
@@ -932,12 +1037,15 @@ void __init efi_enter_virtual_mode(void)
 		return;
 	}
 
-	efi_merge_regions();
-
-	new_memmap = efi_map_regions(&count);
-	if (!new_memmap) {
-		pr_err("Error reallocating memory, EFI runtime non-functional!\n");
-		return;
+	if (efi_setup) {
+		efi_map_regions_fixed();
+	} else {
+		efi_merge_regions();
+		new_memmap = efi_map_regions(&count);
+		if (!new_memmap) {
+			pr_err("Error reallocating memory, EFI runtime non-functional!\n");
+			return;
+		}
 	}
 
 	err = save_runtime_map();
@@ -949,16 +1057,18 @@ void __init efi_enter_virtual_mode(void)
 	efi_setup_page_tables();
 	efi_sync_low_kernel_mappings();
 
-	status = phys_efi_set_virtual_address_map(
-		memmap.desc_size * count,
-		memmap.desc_size,
-		memmap.desc_version,
-		(efi_memory_desc_t *)__pa(new_memmap));
-
-	if (status != EFI_SUCCESS) {
-		pr_alert("Unable to switch EFI into virtual mode "
-			 "(status=%lx)!\n", status);
-		panic("EFI call to SetVirtualAddressMap() failed!");
+	if (!efi_setup) {
+		status = phys_efi_set_virtual_address_map(
+			memmap.desc_size * count,
+			memmap.desc_size,
+			memmap.desc_version,
+			(efi_memory_desc_t *)__pa(new_memmap));
+
+		if (status != EFI_SUCCESS) {
+			pr_alert("Unable to switch EFI into virtual mode (status=%lx)!\n",
+				 status);
+			panic("EFI call to SetVirtualAddressMap() failed!");
+		}
 	}
 
 	/*
@@ -993,8 +1103,6 @@ void __init efi_enter_virtual_mode(void)
 			 EFI_VARIABLE_BOOTSERVICE_ACCESS |
 			 EFI_VARIABLE_RUNTIME_ACCESS,
 			 0, NULL);
-
-	return;
 }
 
 /*
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 7b3ec6e..249b183 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -48,6 +48,7 @@ void __init efi_map_region(efi_memory_desc_t *md)
 }
 
 void __init efi_map_region_fixed(efi_memory_desc_t *md) {}
+void __init parse_efi_setup(u64 phys_addr, u32 data_len) {}
 
 void efi_call_phys_prelog(void)
 {
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ff08cb1..324b651 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -228,3 +228,9 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
 
 	return (void __iomem *)__va(phys_addr);
 }
+
+void __init parse_efi_setup(u64 phys_addr, u32 data_len)
+{
+	efi_setup = phys_addr + sizeof(struct setup_data);
+	efi_data_len = data_len - sizeof(struct setup_data);
+}
-- 
cgit v1.1


From 456a29ddada79198c5965300e04103c40c481f62 Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 20 Dec 2013 18:02:20 +0800
Subject: x86: Add xloadflags bit for EFI runtime support on kexec

Old kexec-tools can not load new kernels. The reason is kexec-tools does
not fill efi_info in x86 setup header previously, thus EFI failed to
initialize.  In new kexec-tools it will by default to fill efi_info and
pass other EFI required infomation to 2nd kernel so kexec kernel EFI
initialization can succeed finally.

To prevent from breaking userspace, add a new xloadflags bit so
kexec-tools can check the flag and switch to old logic.

Signed-off-by: Dave Young <dyoung@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/boot/header.S                | 9 ++++++++-
 arch/x86/include/uapi/asm/bootparam.h | 1 +
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 9ec06a1..ec3b8ba 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -391,7 +391,14 @@ xloadflags:
 #else
 # define XLF23 0
 #endif
-			.word XLF0 | XLF1 | XLF23
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC)
+# define XLF4 XLF_EFI_KEXEC
+#else
+# define XLF4 0
+#endif
+
+			.word XLF0 | XLF1 | XLF23 | XLF4
 
 cmdline_size:   .long   COMMAND_LINE_SIZE-1     #length of the command line,
                                                 #added with boot protocol
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 64fe421..225b098 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -24,6 +24,7 @@
 #define XLF_CAN_BE_LOADED_ABOVE_4G	(1<<1)
 #define XLF_EFI_HANDOVER_32		(1<<2)
 #define XLF_EFI_HANDOVER_64		(1<<3)
+#define XLF_EFI_KEXEC			(1<<4)
 
 #ifndef __ASSEMBLY__
 
-- 
cgit v1.1


From 5039e316dde3fb71c79e95e97c5bca8e4724d8f2 Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 20 Dec 2013 18:02:21 +0800
Subject: x86: Export x86 boot_params to sysfs

kexec-tools use boot_params for getting the 1st kernel hardware_subarch,
the kexec kernel EFI runtime support also needs to read the old efi_info
from boot_params. Currently it exists in debugfs which is not a good
place for such infomation. Per HPA, we should avoid "sploit debugfs".

In this patch /sys/kernel/boot_params are exported, also the setup_data is
exported as a subdirectory. kexec-tools is using debugfs for hardware_subarch
for a long time now so we're not removing it yet.

Structure is like below:

/sys/kernel/boot_params
|__ data                /* boot_params in binary*/
|__ setup_data
|   |__ 0               /* the first setup_data node */
|   |   |__ data        /* setup_data node 0 in binary*/
|   |   |__ type        /* setup_data type of setup_data node 0, hex string */
[snip]
|__ version             /* boot protocal version (in hex, "0x" prefixed)*/

Signed-off-by: Dave Young <dyoung@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/kernel/Makefile |   1 +
 arch/x86/kernel/ksysfs.c | 339 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 340 insertions(+)
 create mode 100644 arch/x86/kernel/ksysfs.c

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9b0a34e..510cca5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-y			+= syscall_$(BITS).o
 obj-$(CONFIG_X86_64)	+= vsyscall_64.o
 obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
+obj-$(CONFIG_SYSFS)	+= ksysfs.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
new file mode 100644
index 0000000..eb53d15
--- /dev/null
+++ b/arch/x86/kernel/ksysfs.c
@@ -0,0 +1,339 @@
+/*
+ * Architecture specific sysfs attributes in /sys/kernel
+ *
+ * Copyright (C) 2007, Intel Corp.
+ *      Huang Ying <ying.huang@intel.com>
+ * Copyright (C) 2013, 2013 Red Hat, Inc.
+ *      Dave Young <dyoung@redhat.com>
+ *
+ * This file is released under the GPLv2
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+#include <asm/setup.h>
+
+static ssize_t version_show(struct kobject *kobj,
+			    struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "0x%04x\n", boot_params.hdr.version);
+}
+
+static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);
+
+static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
+				     struct bin_attribute *bin_attr,
+				     char *buf, loff_t off, size_t count)
+{
+	memcpy(buf, (void *)&boot_params + off, count);
+	return count;
+}
+
+static struct bin_attribute boot_params_data_attr = {
+	.attr = {
+		.name = "data",
+		.mode = S_IRUGO,
+	},
+	.read = boot_params_data_read,
+	.size = sizeof(boot_params),
+};
+
+static struct attribute *boot_params_version_attrs[] = {
+	&boot_params_version_attr.attr,
+	NULL,
+};
+
+static struct bin_attribute *boot_params_data_attrs[] = {
+	&boot_params_data_attr,
+	NULL,
+};
+
+static struct attribute_group boot_params_attr_group = {
+	.attrs = boot_params_version_attrs,
+	.bin_attrs = boot_params_data_attrs,
+};
+
+static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
+{
+	const char *name;
+
+	name = kobject_name(kobj);
+	return kstrtoint(name, 10, nr);
+}
+
+static int get_setup_data_paddr(int nr, u64 *paddr)
+{
+	int i = 0;
+	struct setup_data *data;
+	u64 pa_data = boot_params.hdr.setup_data;
+
+	while (pa_data) {
+		if (nr == i) {
+			*paddr = pa_data;
+			return 0;
+		}
+		data = ioremap_cache(pa_data, sizeof(*data));
+		if (!data)
+			return -ENOMEM;
+
+		pa_data = data->next;
+		iounmap(data);
+		i++;
+	}
+	return -EINVAL;
+}
+
+static int __init get_setup_data_size(int nr, size_t *size)
+{
+	int i = 0;
+	struct setup_data *data;
+	u64 pa_data = boot_params.hdr.setup_data;
+
+	while (pa_data) {
+		data = ioremap_cache(pa_data, sizeof(*data));
+		if (!data)
+			return -ENOMEM;
+		if (nr == i) {
+			*size = data->len;
+			iounmap(data);
+			return 0;
+		}
+
+		pa_data = data->next;
+		iounmap(data);
+		i++;
+	}
+	return -EINVAL;
+}
+
+static ssize_t type_show(struct kobject *kobj,
+			 struct kobj_attribute *attr, char *buf)
+{
+	int nr, ret;
+	u64 paddr;
+	struct setup_data *data;
+
+	ret = kobj_to_setup_data_nr(kobj, &nr);
+	if (ret)
+		return ret;
+
+	ret = get_setup_data_paddr(nr, &paddr);
+	if (ret)
+		return ret;
+	data = ioremap_cache(paddr, sizeof(*data));
+	if (!data)
+		return -ENOMEM;
+
+	ret = sprintf(buf, "0x%x\n", data->type);
+	iounmap(data);
+	return ret;
+}
+
+static ssize_t setup_data_data_read(struct file *fp,
+				    struct kobject *kobj,
+				    struct bin_attribute *bin_attr,
+				    char *buf,
+				    loff_t off, size_t count)
+{
+	int nr, ret = 0;
+	u64 paddr;
+	struct setup_data *data;
+	void *p;
+
+	ret = kobj_to_setup_data_nr(kobj, &nr);
+	if (ret)
+		return ret;
+
+	ret = get_setup_data_paddr(nr, &paddr);
+	if (ret)
+		return ret;
+	data = ioremap_cache(paddr, sizeof(*data));
+	if (!data)
+		return -ENOMEM;
+
+	if (off > data->len) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (count > data->len - off)
+		count = data->len - off;
+
+	if (!count)
+		goto out;
+
+	ret = count;
+	p = ioremap_cache(paddr + sizeof(*data), data->len);
+	if (!p) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	memcpy(buf, p + off, count);
+	iounmap(p);
+out:
+	iounmap(data);
+	return ret;
+}
+
+static struct kobj_attribute type_attr = __ATTR_RO(type);
+
+static struct bin_attribute data_attr = {
+	.attr = {
+		.name = "data",
+		.mode = S_IRUGO,
+	},
+	.read = setup_data_data_read,
+};
+
+static struct attribute *setup_data_type_attrs[] = {
+	&type_attr.attr,
+	NULL,
+};
+
+static struct bin_attribute *setup_data_data_attrs[] = {
+	&data_attr,
+	NULL,
+};
+
+static struct attribute_group setup_data_attr_group = {
+	.attrs = setup_data_type_attrs,
+	.bin_attrs = setup_data_data_attrs,
+};
+
+static int __init create_setup_data_node(struct kobject *parent,
+					 struct kobject **kobjp, int nr)
+{
+	int ret = 0;
+	size_t size;
+	struct kobject *kobj;
+	char name[16]; /* should be enough for setup_data nodes numbers */
+	snprintf(name, 16, "%d", nr);
+
+	kobj = kobject_create_and_add(name, parent);
+	if (!kobj)
+		return -ENOMEM;
+
+	ret = get_setup_data_size(nr, &size);
+	if (ret)
+		goto out_kobj;
+
+	data_attr.size = size;
+	ret = sysfs_create_group(kobj, &setup_data_attr_group);
+	if (ret)
+		goto out_kobj;
+	*kobjp = kobj;
+
+	return 0;
+out_kobj:
+	kobject_put(kobj);
+	return ret;
+}
+
+static void __init cleanup_setup_data_node(struct kobject *kobj)
+{
+	sysfs_remove_group(kobj, &setup_data_attr_group);
+	kobject_put(kobj);
+}
+
+static int __init get_setup_data_total_num(u64 pa_data, int *nr)
+{
+	int ret = 0;
+	struct setup_data *data;
+
+	*nr = 0;
+	while (pa_data) {
+		*nr += 1;
+		data = ioremap_cache(pa_data, sizeof(*data));
+		if (!data) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		pa_data = data->next;
+		iounmap(data);
+	}
+
+out:
+	return ret;
+}
+
+static int __init create_setup_data_nodes(struct kobject *parent)
+{
+	struct kobject *setup_data_kobj, **kobjp;
+	u64 pa_data;
+	int i, j, nr, ret = 0;
+
+	pa_data = boot_params.hdr.setup_data;
+	if (!pa_data)
+		return 0;
+
+	setup_data_kobj = kobject_create_and_add("setup_data", parent);
+	if (!setup_data_kobj) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = get_setup_data_total_num(pa_data, &nr);
+	if (ret)
+		goto out_setup_data_kobj;
+
+	kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL);
+	if (!kobjp) {
+		ret = -ENOMEM;
+		goto out_setup_data_kobj;
+	}
+
+	for (i = 0; i < nr; i++) {
+		ret = create_setup_data_node(setup_data_kobj, kobjp + i, i);
+		if (ret)
+			goto out_clean_nodes;
+	}
+
+	kfree(kobjp);
+	return 0;
+
+out_clean_nodes:
+	for (j = i - 1; j > 0; j--)
+		cleanup_setup_data_node(*(kobjp + j));
+	kfree(kobjp);
+out_setup_data_kobj:
+	kobject_put(setup_data_kobj);
+out:
+	return ret;
+}
+
+static int __init boot_params_ksysfs_init(void)
+{
+	int ret;
+	struct kobject *boot_params_kobj;
+
+	boot_params_kobj = kobject_create_and_add("boot_params",
+						  kernel_kobj);
+	if (!boot_params_kobj) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group);
+	if (ret)
+		goto out_boot_params_kobj;
+
+	ret = create_setup_data_nodes(boot_params_kobj);
+	if (ret)
+		goto out_create_group;
+
+	return 0;
+out_create_group:
+	sysfs_remove_group(boot_params_kobj, &boot_params_attr_group);
+out_boot_params_kobj:
+	kobject_put(boot_params_kobj);
+out:
+	return ret;
+}
+
+arch_initcall(boot_params_ksysfs_init);
-- 
cgit v1.1


From 77ea8c948953a90401e436e7c05973b2d5529804 Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 20 Dec 2013 18:02:22 +0800
Subject: x86: Reserve setup_data ranges late after parsing memmap cmdline

Currently e820_reserve_setup_data() is called before parsing early
params, it works in normal case. But for memmap=exactmap, the final
memory ranges are created after parsing memmap= cmdline params, so the
previous e820_reserve_setup_data() has no effect. For example,
setup_data ranges will still be marked as normal system ram, thus when
later sysfs driver ioremap them kernel will warn about mapping normal
ram.

This patch fix it by moving the e820_reserve_setup_data() callback after
parsing early params so they can be set as reserved ranges and later
ioremap will be fine with it.

Signed-off-by: Dave Young <dyoung@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/kernel/setup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 24536f7..be4b456 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -927,8 +927,6 @@ void __init setup_arch(char **cmdline_p)
 	iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
 	setup_memory_map();
 	parse_setup_data();
-	/* update the e820_saved too */
-	e820_reserve_setup_data();
 
 	copy_edd();
 
@@ -990,6 +988,8 @@ void __init setup_arch(char **cmdline_p)
 		early_dump_pci_devices();
 #endif
 
+	/* update the e820_saved too */
+	e820_reserve_setup_data();
 	finish_e820_parsing();
 
 	if (efi_enabled(EFI_BOOT))
-- 
cgit v1.1


From 518548abd61808ea1e31614ccbdae34d3c32dfa4 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Sat, 21 Dec 2013 16:09:46 +0000
Subject: x86/efi: Delete superfluous global variables

There's no need to save the runtime map details in global variables, the
values are only required to pass to efi_runtime_map_setup().

And because 'nr_efi_runtime_map' isn't needed, get_nr_runtime_map() can
be deleted along with 'efi_data_len'.

Cc: Dave Young <dyoung@redhat.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/platform/efi/efi.c    | 18 +-----------------
 arch/x86/platform/efi/efi_64.c |  1 -
 2 files changed, 1 insertion(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 9965ff4..7ed3ecf 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -76,10 +76,7 @@ static __initdata efi_config_table_type_t arch_tables[] = {
 	{NULL_GUID, NULL, NULL},
 };
 
-static void *efi_runtime_map;
-static int nr_efi_runtime_map;
 u64 efi_setup;		/* efi setup_data physical address */
-u32 efi_data_len;	/* efi setup_data payload length */
 
 /*
  * Returns 1 if 'facility' is enabled, 0 otherwise.
@@ -702,15 +699,6 @@ out:
 	return ret;
 }
 
-static void get_nr_runtime_map(void)
-{
-	if (!efi_setup)
-		return;
-
-	nr_efi_runtime_map = (efi_data_len - sizeof(struct efi_setup_data)) /
-			     sizeof(efi_memory_desc_t);
-}
-
 void __init efi_init(void)
 {
 	efi_char16_t *c16;
@@ -718,7 +706,6 @@ void __init efi_init(void)
 	int i = 0;
 	void *tmp;
 
-	get_nr_runtime_map();
 #ifdef CONFIG_X86_32
 	if (boot_params.efi_info.efi_systab_hi ||
 	    boot_params.efi_info.efi_memmap_hi) {
@@ -933,10 +920,7 @@ static int __init save_runtime_map(void)
 		count++;
 	}
 
-	efi_runtime_map = q;
-	nr_efi_runtime_map = count;
-	efi_runtime_map_setup(efi_runtime_map, nr_efi_runtime_map,
-			      boot_params.efi_info.efi_memdesc_size);
+	efi_runtime_map_setup(q, count, memmap.desc_size);
 
 	return 0;
 out:
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 324b651..6284f15 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -232,5 +232,4 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
 void __init parse_efi_setup(u64 phys_addr, u32 data_len)
 {
 	efi_setup = phys_addr + sizeof(struct setup_data);
-	efi_data_len = data_len - sizeof(struct setup_data);
 }
-- 
cgit v1.1


From 41a34cec2e0df7798ea322ed1480bc3d3facdc8e Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 3 Jan 2014 11:54:31 +0800
Subject: x86: ksysfs.c build fix

kbuild test robot report below error for randconfig:

  arch/x86/kernel/ksysfs.c: In function 'get_setup_data_paddr':
  arch/x86/kernel/ksysfs.c:81:3: error: implicit declaration of function 'ioremap_cache' [-Werror=implicit-function-declaration]
  arch/x86/kernel/ksysfs.c:86:3: error: implicit declaration of function 'iounmap' [-Werror=implicit-function-declaration]

Fix it by including <asm/io.h> in ksysfs.c

Signed-off-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/kernel/ksysfs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index eb53d15..c2bedae 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -17,6 +17,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 
+#include <asm/io.h>
 #include <asm/setup.h>
 
 static ssize_t version_show(struct kobject *kobj,
-- 
cgit v1.1


From 5c12af0c41e3417e1939095325920463b5f8e726 Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 3 Jan 2014 11:56:49 +0800
Subject: x86/efi: parse_efi_setup() build fix

In case without CONFIG_EFI, there will be below build error:

   arch/x86/built-in.o: In function `setup_arch':
  (.init.text+0x9dc): undefined reference to `parse_efi_setup'

Thus fix it by adding blank inline function in asm/efi.h
Also remove an unused declaration for variable efi_data_len.

Signed-off-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/include/asm/efi.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 4d1ba80..3b978c4 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -142,8 +142,6 @@ struct efi_setup_data {
 };
 
 extern u64 efi_setup;
-extern u32 efi_data_len;
-extern void parse_efi_setup(u64 phys_addr, u32 data_len);
 
 #ifdef CONFIG_EFI
 
@@ -153,7 +151,7 @@ static inline bool efi_is_native(void)
 }
 
 extern struct console early_efi_console;
-
+extern void parse_efi_setup(u64 phys_addr, u32 data_len);
 #else
 /*
  * IF EFI is not configured, have the EFI calls return -ENOSYS.
@@ -165,6 +163,7 @@ extern struct console early_efi_console;
 #define efi_call4(_f, _a1, _a2, _a3, _a4)		(-ENOSYS)
 #define efi_call5(_f, _a1, _a2, _a3, _a4, _a5)		(-ENOSYS)
 #define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6)	(-ENOSYS)
+static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {}
 #endif /* CONFIG_EFI */
 
 #endif /* _ASM_X86_EFI_H */
-- 
cgit v1.1


From dd360393f4d948eb518372316e52101cf3b44212 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 23 Dec 2013 14:16:58 +0200
Subject: x86, cpu: Detect more TLB configuration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Intel Software Developer’s Manual covers few more TLB
configurations exposed as CPUID 2 descriptors:

61H Instruction TLB: 4 KByte pages, fully associative, 48 entries
63H Data TLB: 1 GByte pages, 4-way set associative, 4 entries
76H Instruction TLB: 2M/4M pages, fully associative, 8 entries
B5H Instruction TLB: 4KByte pages, 8-way set associative, 64 entries
B6H Instruction TLB: 4KByte pages, 8-way set associative, 128 entries
C1H Shared 2nd-Level TLB: 4 KByte/2MByte pages, 8-way associative, 1024 entries
C2H DTLB DTLB: 2 MByte/$MByte pages, 4-way associative, 16 entries

Let's detect them as well.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Link: http://lkml.kernel.org/r/1387801018-14499-1-git-send-email-kirill.shutemov@linux.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/processor.h |  1 +
 arch/x86/kernel/cpu/common.c     |  7 ++++---
 arch/x86/kernel/cpu/intel.c      | 26 ++++++++++++++++++++++++++
 3 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7b034a4..1dd6260 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -72,6 +72,7 @@ extern u16 __read_mostly tlb_lli_4m[NR_INFO];
 extern u16 __read_mostly tlb_lld_4k[NR_INFO];
 extern u16 __read_mostly tlb_lld_2m[NR_INFO];
 extern u16 __read_mostly tlb_lld_4m[NR_INFO];
+extern u16 __read_mostly tlb_lld_1g[NR_INFO];
 extern s8  __read_mostly tlb_flushall_shift;
 
 /*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6abc172..24b6fd1 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -472,6 +472,7 @@ u16 __read_mostly tlb_lli_4m[NR_INFO];
 u16 __read_mostly tlb_lld_4k[NR_INFO];
 u16 __read_mostly tlb_lld_2m[NR_INFO];
 u16 __read_mostly tlb_lld_4m[NR_INFO];
+u16 __read_mostly tlb_lld_1g[NR_INFO];
 
 /*
  * tlb_flushall_shift shows the balance point in replacing cr3 write
@@ -486,13 +487,13 @@ void cpu_detect_tlb(struct cpuinfo_x86 *c)
 	if (this_cpu->c_detect_tlb)
 		this_cpu->c_detect_tlb(c);
 
-	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
-		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n"	     \
+	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
+		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"
 		"tlb_flushall_shift: %d\n",
 		tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
 		tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
 		tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
-		tlb_flushall_shift);
+		tlb_lld_1g[ENTRIES], tlb_flushall_shift);
 }
 
 void detect_ht(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index ea04b34..5eb7ea5 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -506,6 +506,7 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 #define TLB_DATA0_2M_4M	0x23
 
 #define STLB_4K		0x41
+#define STLB_4K_2M	0x42
 
 static const struct _tlb_table intel_tlb_table[] = {
 	{ 0x01, TLB_INST_4K,		32,	" TLB_INST 4 KByte pages, 4-way set associative" },
@@ -526,13 +527,20 @@ static const struct _tlb_table intel_tlb_table[] = {
 	{ 0x5b, TLB_DATA_4K_4M,		64,	" TLB_DATA 4 KByte and 4 MByte pages" },
 	{ 0x5c, TLB_DATA_4K_4M,		128,	" TLB_DATA 4 KByte and 4 MByte pages" },
 	{ 0x5d, TLB_DATA_4K_4M,		256,	" TLB_DATA 4 KByte and 4 MByte pages" },
+	{ 0x61, TLB_INST_4K,		48,	" TLB_INST 4 KByte pages, full associative" },
+	{ 0x63, TLB_DATA_1G,		4,	" TLB_DATA 1 GByte pages, 4-way set associative" },
+	{ 0x76, TLB_INST_2M_4M,		8,	" TLB_INST 2-MByte or 4-MByte pages, fully associative" },
 	{ 0xb0, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 4-way set associative" },
 	{ 0xb1, TLB_INST_2M_4M,		4,	" TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
 	{ 0xb2, TLB_INST_4K,		64,	" TLB_INST 4KByte pages, 4-way set associative" },
 	{ 0xb3, TLB_DATA_4K,		128,	" TLB_DATA 4 KByte pages, 4-way set associative" },
 	{ 0xb4, TLB_DATA_4K,		256,	" TLB_DATA 4 KByte pages, 4-way associative" },
+	{ 0xb5, TLB_INST_4K,		64,	" TLB_INST 4 KByte pages, 8-way set ssociative" },
+	{ 0xb6, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 8-way set ssociative" },
 	{ 0xba, TLB_DATA_4K,		64,	" TLB_DATA 4 KByte pages, 4-way associative" },
 	{ 0xc0, TLB_DATA_4K_4M,		8,	" TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
+	{ 0xc1, STLB_4K_2M,		1024,	" STLB 4 KByte and 2 MByte pages, 8-way associative" },
+	{ 0xc2, TLB_DATA_2M_4M,		16,	" DTLB 2 MByte/4MByte pages, 4-way associative" },
 	{ 0xca, STLB_4K,		512,	" STLB 4 KByte pages, 4-way associative" },
 	{ 0x00, 0, 0 }
 };
@@ -558,6 +566,20 @@ static void intel_tlb_lookup(const unsigned char desc)
 		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
 			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
 		break;
+	case STLB_4K_2M:
+		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+		break;
 	case TLB_INST_ALL:
 		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
 			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
@@ -603,6 +625,10 @@ static void intel_tlb_lookup(const unsigned char desc)
 		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
 			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
 		break;
+	case TLB_DATA_1G:
+		if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
+		break;
 	}
 }
 
-- 
cgit v1.1


From df90ca969035d3f6c95044e272f75bf417b14245 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 3 Jan 2014 16:45:00 -0500
Subject: x86, sparse: Do not force removal of __user when calling
 copy_to/from_user_nocheck()

Commit ff47ab4ff3cdd "x86: Add 1/2/4/8 byte optimization to 64bit
__copy_{from,to}_user_inatomic" added a "_nocheck" call in between
the copy_to/from_user() and copy_user_generic(). As both the
normal and nocheck versions of theses calls use the proper __user
annotation, a typecast to remove it should not be added.
This causes sparse to spin out the following warnings:

arch/x86/include/asm/uaccess_64.h:207:47: warning: incorrect type in argument 2 (different address spaces)
arch/x86/include/asm/uaccess_64.h:207:47:    expected void const [noderef] <asn:1>*src
arch/x86/include/asm/uaccess_64.h:207:47:    got void const *<noident>
arch/x86/include/asm/uaccess_64.h:207:47: warning: incorrect type in argument 2 (different address spaces)
arch/x86/include/asm/uaccess_64.h:207:47:    expected void const [noderef] <asn:1>*src
arch/x86/include/asm/uaccess_64.h:207:47:    got void const *<noident>
arch/x86/include/asm/uaccess_64.h:207:47: warning: incorrect type in argument 2 (different address spaces)
arch/x86/include/asm/uaccess_64.h:207:47:    expected void const [noderef] <asn:1>*src
arch/x86/include/asm/uaccess_64.h:207:47:    got void const *<noident>
arch/x86/include/asm/uaccess_64.h:207:47: warning: incorrect type in argument 2 (different address spaces)
arch/x86/include/asm/uaccess_64.h:207:47:    expected void const [noderef] <asn:1>*src
arch/x86/include/asm/uaccess_64.h:207:47:    got void const *<noident>

Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20140103164500.5f6478f5@gandalf.local.home
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/uaccess_64.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 190413d..12a26b9 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -204,13 +204,13 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 static __must_check __always_inline int
 __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
 {
-	return __copy_from_user_nocheck(dst, (__force const void *)src, size);
+	return __copy_from_user_nocheck(dst, src, size);
 }
 
 static __must_check __always_inline int
 __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
 {
-	return __copy_to_user_nocheck((__force void *)dst, src, size);
+	return __copy_to_user_nocheck(dst, src, size);
 }
 
 extern long __copy_user_nocache(void *dst, const void __user *src,
-- 
cgit v1.1


From e24df921af484e9b7095ee06d236dabdb3bd9c1c Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Fri, 3 Jan 2014 22:30:42 +0000
Subject: x86, boot: Use .code16 instead of .code16gcc

This reverts commit 28b48688 ("x86, boot: use .code16gcc instead
of .code16").

Versions of binutils older than 2.16 are already not working, so this
workaround is no longer necessary either.  At the same time, some of
the transformations that .code16gcc does can be *extremely*
counterintuitive to a human programmer.

[ hpa: folded ret -> retl and call -> calll fixes from followup patch ]

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Link: http://lkml.kernel.org/r/1388788242.2391.75.camel@shinybook.infradead.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/bioscall.S |  2 +-
 arch/x86/boot/copy.S     | 22 +++++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
index 1dfbf64..5077937 100644
--- a/arch/x86/boot/bioscall.S
+++ b/arch/x86/boot/bioscall.S
@@ -13,7 +13,7 @@
  * touching registers they shouldn't be.
  */
 
-	.code16gcc
+	.code16
 	.text
 	.globl	intcall
 	.type	intcall, @function
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
index 11f272c..1eb7d29 100644
--- a/arch/x86/boot/copy.S
+++ b/arch/x86/boot/copy.S
@@ -14,7 +14,7 @@
  * Memory copy routines
  */
 
-	.code16gcc
+	.code16
 	.text
 
 GLOBAL(memcpy)
@@ -30,7 +30,7 @@ GLOBAL(memcpy)
 	rep; movsb
 	popw	%di
 	popw	%si
-	ret
+	retl
 ENDPROC(memcpy)
 
 GLOBAL(memset)
@@ -45,25 +45,25 @@ GLOBAL(memset)
 	andw	$3, %cx
 	rep; stosb
 	popw	%di
-	ret
+	retl
 ENDPROC(memset)
 
 GLOBAL(copy_from_fs)
 	pushw	%ds
 	pushw	%fs
 	popw	%ds
-	call	memcpy
+	calll	memcpy
 	popw	%ds
-	ret
+	retl
 ENDPROC(copy_from_fs)
 
 GLOBAL(copy_to_fs)
 	pushw	%es
 	pushw	%fs
 	popw	%es
-	call	memcpy
+	calll	memcpy
 	popw	%es
-	ret
+	retl
 ENDPROC(copy_to_fs)
 
 #if 0 /* Not currently used, but can be enabled as needed */
@@ -71,17 +71,17 @@ GLOBAL(copy_from_gs)
 	pushw	%ds
 	pushw	%gs
 	popw	%ds
-	call	memcpy
+	calll	memcpy
 	popw	%ds
-	ret
+	retl
 ENDPROC(copy_from_gs)
 
 GLOBAL(copy_to_gs)
 	pushw	%es
 	pushw	%gs
 	popw	%es
-	call	memcpy
+	calll	memcpy
 	popw	%es
-	ret
+	retl
 ENDPROC(copy_to_gs)
 #endif
-- 
cgit v1.1


From 6f34152f54bcf63a49ea701643c81f5b2168fec8 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Sat, 4 Jan 2014 14:29:08 -0800
Subject: x86, boot: Move intcall() to the .inittext section

The .inittext section tries to aggregate all functions which are
needed to get a message out in the case of a load failure.  However,
putchar() uses intcall(), so intcall() should be in the .inittext
section.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/n/tip-twxm8igouzbmsklmf6lfyq0w@git.kernel.org
---
 arch/x86/boot/bioscall.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
index 5077937..d401b4a 100644
--- a/arch/x86/boot/bioscall.S
+++ b/arch/x86/boot/bioscall.S
@@ -1,6 +1,6 @@
 /* -----------------------------------------------------------------------
  *
- *   Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *   Copyright 2009-2014 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2 or (at your
@@ -14,7 +14,7 @@
  */
 
 	.code16
-	.text
+	.section ".inittext","ax"
 	.globl	intcall
 	.type	intcall, @function
 intcall:
-- 
cgit v1.1


From 663b55b9b39fa9c848cca273ca4e12bf29b32c71 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 6 Jan 2014 19:20:26 -0500
Subject: x86: Delete non-required instances of include <linux/init.h>

None of these files are actually using any __init type directives
and hence don't need to include <linux/init.h>.  Most are just a
left over from __devinit and __cpuinit removal, or simply due to
code getting copied from one driver to the next.

[ hpa: undid incorrect removal from arch/x86/kernel/head_32.S ]

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Link: http://lkml.kernel.org/r/1389054026-12947-1-git-send-email-paul.gortmaker@windriver.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h                           | 1 -
 arch/x86/include/asm/mpspec.h                        | 1 -
 arch/x86/include/asm/processor.h                     | 1 -
 arch/x86/include/asm/ptrace.h                        | 1 -
 arch/x86/include/asm/smp.h                           | 1 -
 arch/x86/include/asm/timer.h                         | 1 -
 arch/x86/kernel/apic/apic_flat_64.c                  | 1 -
 arch/x86/kernel/apic/apic_noop.c                     | 1 -
 arch/x86/kernel/apic/ipi.c                           | 1 -
 arch/x86/kernel/apic/summit_32.c                     | 1 -
 arch/x86/kernel/apic/x2apic_cluster.c                | 1 -
 arch/x86/kernel/apic/x2apic_phys.c                   | 1 -
 arch/x86/kernel/cpu/amd.c                            | 1 -
 arch/x86/kernel/cpu/centaur.c                        | 1 -
 arch/x86/kernel/cpu/cyrix.c                          | 1 -
 arch/x86/kernel/cpu/intel.c                          | 1 -
 arch/x86/kernel/cpu/mcheck/mce_intel.c               | 1 -
 arch/x86/kernel/cpu/mcheck/p5.c                      | 1 -
 arch/x86/kernel/cpu/mcheck/winchip.c                 | 1 -
 arch/x86/kernel/cpu/transmeta.c                      | 1 -
 arch/x86/kernel/cpu/umc.c                            | 1 -
 arch/x86/kernel/crash.c                              | 1 -
 arch/x86/kernel/doublefault.c                        | 1 -
 arch/x86/kernel/hw_breakpoint.c                      | 1 -
 arch/x86/kernel/kgdb.c                               | 1 -
 arch/x86/kernel/machine_kexec_32.c                   | 1 -
 arch/x86/kernel/pci-nommu.c                          | 1 -
 arch/x86/kernel/process_32.c                         | 1 -
 arch/x86/kernel/tsc_sync.c                           | 1 -
 arch/x86/lib/delay.c                                 | 1 -
 arch/x86/mm/kmmio.c                                  | 1 -
 arch/x86/mm/pageattr-test.c                          | 1 -
 arch/x86/pci/fixup.c                                 | 1 -
 arch/x86/platform/intel-mid/early_printk_intel_mid.c | 1 -
 arch/x86/platform/iris/iris.c                        | 1 -
 arch/x86/realmode/rm/reboot.S                        | 1 -
 arch/x86/realmode/rm/trampoline_32.S                 | 1 -
 arch/x86/realmode/rm/trampoline_64.S                 | 1 -
 arch/x86/um/vdso/vdso.S                              | 1 -
 arch/x86/vdso/vdso.S                                 | 1 -
 arch/x86/vdso/vdsox32.S                              | 1 -
 41 files changed, 41 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c696a86..6e4ce2d 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -118,7 +118,6 @@ extern void mce_register_decode_chain(struct notifier_block *nb);
 extern void mce_unregister_decode_chain(struct notifier_block *nb);
 
 #include <linux/percpu.h>
-#include <linux/init.h>
 #include <linux/atomic.h>
 
 extern int mce_p5_enabled;
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 3142a94..3e6b492 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_X86_MPSPEC_H
 #define _ASM_X86_MPSPEC_H
 
-#include <linux/init.h>
 
 #include <asm/mpspec_def.h>
 #include <asm/x86_init.h>
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7b034a4..8ade617 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -27,7 +27,6 @@ struct mm_struct;
 #include <linux/cache.h>
 #include <linux/threads.h>
 #include <linux/math64.h>
-#include <linux/init.h>
 #include <linux/err.h>
 #include <linux/irqflags.h>
 
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 942a086..14fd6fd 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -60,7 +60,6 @@ struct pt_regs {
 
 #endif /* !__i386__ */
 
-#include <linux/init.h>
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt_types.h>
 #endif
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4137890..8cd27e0 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -2,7 +2,6 @@
 #define _ASM_X86_SMP_H
 #ifndef __ASSEMBLY__
 #include <linux/cpumask.h>
-#include <linux/init.h>
 #include <asm/percpu.h>
 
 /*
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 34baa0e..a6f3e77 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -1,6 +1,5 @@
 #ifndef _ASM_X86_TIMER_H
 #define _ASM_X86_TIMER_H
-#include <linux/init.h>
 #include <linux/pm.h>
 #include <linux/percpu.h>
 #include <linux/interrupt.h>
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 00c77cf..5d5b9eb 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -14,7 +14,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/hardirq.h>
 #include <linux/module.h>
 #include <asm/smp.h>
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e145f28..191ce75 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -15,7 +15,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/errno.h>
 #include <asm/fixmap.h>
 #include <asm/mpspec.h>
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 7434d85..6207156 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -1,6 +1,5 @@
 #include <linux/cpumask.h>
 #include <linux/interrupt.h>
-#include <linux/init.h>
 
 #include <linux/mm.h>
 #include <linux/delay.h>
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 77c95c0..00146f9 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -29,7 +29,6 @@
 #define pr_fmt(fmt) "summit: %s: " fmt, __func__
 
 #include <linux/mm.h>
-#include <linux/init.h>
 #include <asm/io.h>
 #include <asm/bios_ebda.h>
 
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 140e29d..cac85ee 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -3,7 +3,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/dmar.h>
 #include <linux/cpu.h>
 
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 562a76d..de231e3 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -3,7 +3,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/dmar.h>
 
 #include <asm/smp.h>
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bca023b..39bc78d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,5 +1,4 @@
 #include <linux/export.h>
-#include <linux/init.h>
 #include <linux/bitops.h>
 #include <linux/elf.h>
 #include <linux/mm.h>
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 8d5652d..8779eda 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,6 +1,5 @@
 #include <linux/bitops.h>
 #include <linux/kernel.h>
-#include <linux/init.h>
 
 #include <asm/processor.h>
 #include <asm/e820.h>
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index d0969c7..aaf152e 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -1,4 +1,3 @@
-#include <linux/init.h>
 #include <linux/bitops.h>
 #include <linux/delay.h>
 #include <linux/pci.h>
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index dc1ec0d..53f5d3c 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,4 +1,3 @@
-#include <linux/init.h>
 #include <linux/kernel.h>
 
 #include <linux/string.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 4cfe045..fb6156f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -6,7 +6,6 @@
  */
 
 #include <linux/gfp.h>
-#include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/percpu.h>
 #include <linux/sched.h>
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 1c044b1..a304298 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -5,7 +5,6 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index e9a701a..7dc5564 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -5,7 +5,6 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/init.h>
 
 #include <asm/processor.h>
 #include <asm/mce.h>
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index aa0430d..3fa0e5a 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,6 +1,5 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
-#include <linux/init.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include "cpu.h"
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index 75c5ad5..ef9c2a0 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -1,5 +1,4 @@
 #include <linux/kernel.h>
-#include <linux/init.h>
 #include <asm/processor.h>
 #include "cpu.h"
 
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 18677a9..a57902e 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -7,7 +7,6 @@
  *
  */
 
-#include <linux/init.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/smp.h>
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index 5d3fe8d..f6dfd93 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -1,6 +1,5 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
-#include <linux/init.h>
 #include <linux/init_task.h>
 #include <linux/fs.h>
 
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index f66ff16..a67b47c 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -38,7 +38,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/hw_breakpoint.h>
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 836f832..7ec1d5f 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -39,7 +39,6 @@
 #include <linux/sched.h>
 #include <linux/delay.h>
 #include <linux/kgdb.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/nmi.h>
 #include <linux/hw_breakpoint.h>
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 5b19e4d..1667b1d 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -9,7 +9,6 @@
 #include <linux/mm.h>
 #include <linux/kexec.h>
 #include <linux/delay.h>
-#include <linux/init.h>
 #include <linux/numa.h>
 #include <linux/ftrace.h>
 #include <linux/suspend.h>
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 871be4a..da15918 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -3,7 +3,6 @@
 #include <linux/dma-mapping.h>
 #include <linux/scatterlist.h>
 #include <linux/string.h>
-#include <linux/init.h>
 #include <linux/gfp.h>
 #include <linux/pci.h>
 #include <linux/mm.h>
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 6f1236c..0de43e9 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -24,7 +24,6 @@
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/reboot.h>
-#include <linux/init.h>
 #include <linux/mc146818rtc.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index adfdf56..2648848 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -16,7 +16,6 @@
  */
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/nmi.h>
 #include <asm/tsc.h>
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 7c3bee6..39d6a3d 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -16,7 +16,6 @@
 #include <linux/timex.h>
 #include <linux/preempt.h>
 #include <linux/delay.h>
-#include <linux/init.h>
 
 #include <asm/processor.h>
 #include <asm/delay.h>
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index e5d5e2c..637ab34 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -11,7 +11,6 @@
 #include <linux/rculist.h>
 #include <linux/spinlock.h>
 #include <linux/hash.h>
-#include <linux/init.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/uaccess.h>
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index d0b1773..461bc82 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -8,7 +8,6 @@
 #include <linux/kthread.h>
 #include <linux/random.h>
 #include <linux/kernel.h>
-#include <linux/init.h>
 #include <linux/mm.h>
 
 #include <asm/cacheflush.h>
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index b046e07..bca9e85 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -5,7 +5,6 @@
 #include <linux/delay.h>
 #include <linux/dmi.h>
 #include <linux/pci.h>
-#include <linux/init.h>
 #include <linux/vgaarb.h>
 #include <asm/pci_x86.h>
 
diff --git a/arch/x86/platform/intel-mid/early_printk_intel_mid.c b/arch/x86/platform/intel-mid/early_printk_intel_mid.c
index 4f702f5..e0bd082 100644
--- a/arch/x86/platform/intel-mid/early_printk_intel_mid.c
+++ b/arch/x86/platform/intel-mid/early_printk_intel_mid.c
@@ -22,7 +22,6 @@
 #include <linux/console.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
-#include <linux/init.h>
 #include <linux/io.h>
 
 #include <asm/fixmap.h>
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index e6cb80f..4d171e8 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -27,7 +27,6 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/delay.h>
-#include <linux/init.h>
 #include <linux/pm.h>
 #include <asm/io.h>
 
diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S
index f932ea6..d66c607 100644
--- a/arch/x86/realmode/rm/reboot.S
+++ b/arch/x86/realmode/rm/reboot.S
@@ -1,5 +1,4 @@
 #include <linux/linkage.h>
-#include <linux/init.h>
 #include <asm/segment.h>
 #include <asm/page_types.h>
 #include <asm/processor-flags.h>
diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S
index c1b2791..48ddd76 100644
--- a/arch/x86/realmode/rm/trampoline_32.S
+++ b/arch/x86/realmode/rm/trampoline_32.S
@@ -20,7 +20,6 @@
  */
 
 #include <linux/linkage.h>
-#include <linux/init.h>
 #include <asm/segment.h>
 #include <asm/page_types.h>
 #include "realmode.h"
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index bb360dc..dac7b20 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -25,7 +25,6 @@
  */
 
 #include <linux/linkage.h>
-#include <linux/init.h>
 #include <asm/pgtable_types.h>
 #include <asm/page_types.h>
 #include <asm/msr.h>
diff --git a/arch/x86/um/vdso/vdso.S b/arch/x86/um/vdso/vdso.S
index 1cb468a..4b4bd4c 100644
--- a/arch/x86/um/vdso/vdso.S
+++ b/arch/x86/um/vdso/vdso.S
@@ -1,4 +1,3 @@
-#include <linux/init.h>
 
 __INITDATA
 
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S
index 01f5e3b..1e13eb8 100644
--- a/arch/x86/vdso/vdso.S
+++ b/arch/x86/vdso/vdso.S
@@ -1,6 +1,5 @@
 #include <asm/page_types.h>
 #include <linux/linkage.h>
-#include <linux/init.h>
 
 __PAGE_ALIGNED_DATA
 
diff --git a/arch/x86/vdso/vdsox32.S b/arch/x86/vdso/vdsox32.S
index d6b9a7f..295f1c7 100644
--- a/arch/x86/vdso/vdsox32.S
+++ b/arch/x86/vdso/vdsox32.S
@@ -1,6 +1,5 @@
 #include <asm/page_types.h>
 #include <linux/linkage.h>
-#include <linux/init.h>
 
 __PAGE_ALIGNED_DATA
 
-- 
cgit v1.1


From 46184415368a6095d5da33991c5e011f1084353d Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 8 Jan 2014 13:27:51 -0800
Subject: arch: x86: New MailBox support driver for Intel SOC's

Current Intel SOC cores use a MailBox Interface (MBI) to provide access to
configuration registers on devices (called units) connected to the system
fabric. This is a support driver that implements access to this interface on
those platforms that can enumerate the device using PCI. Initial support is for
BayTrail, for which port definitons are provided. This is a requirement for
implementing platform specific features (e.g. RAPL driver requires this to
perform platform specific power management using the registers in PUNIT).
Dependant modules should select IOSF_MBI in their respective Kconfig
configuraiton. Serialized access is handled by all exported routines with
spinlocks.

The API includes 3 functions for access to unit registers:

int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)

port:	indicating the unit being accessed
opcode:	the read or write port specific opcode
offset:	the register offset within the port
mdr:	the register data to be read, written, or modified
mask:	bit locations in mdr to change

Returns nonzero on error

Note: GPU code handles access to the GFX unit. Therefore access to that unit
with this driver is disallowed to avoid conflicts.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: http://lkml.kernel.org/r/1389216471-734-1-git-send-email-david.e.box@linux.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
---
 arch/x86/Kconfig                |   8 ++
 arch/x86/include/asm/iosf_mbi.h |  90 ++++++++++++++++
 arch/x86/kernel/Makefile        |   1 +
 arch/x86/kernel/iosf_mbi.c      | 226 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 325 insertions(+)
 create mode 100644 arch/x86/include/asm/iosf_mbi.h
 create mode 100644 arch/x86/kernel/iosf_mbi.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0952ecd..ca5959a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2393,6 +2393,14 @@ config X86_DMA_REMAP
 	bool
 	depends on STA2X11
 
+config IOSF_MBI
+	bool
+	depends on PCI
+	---help---
+	  To be selected by modules requiring access to the Intel OnChip System
+	  Fabric (IOSF) Sideband MailBox Interface (MBI). For MBI platforms
+	  enumerable by PCI.
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
new file mode 100644
index 0000000..8e71c79
--- /dev/null
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -0,0 +1,90 @@
+/*
+ * iosf_mbi.h: Intel OnChip System Fabric MailBox access support
+ */
+
+#ifndef IOSF_MBI_SYMS_H
+#define IOSF_MBI_SYMS_H
+
+#define MBI_MCR_OFFSET		0xD0
+#define MBI_MDR_OFFSET		0xD4
+#define MBI_MCRX_OFFSET		0xD8
+
+#define MBI_RD_MASK		0xFEFFFFFF
+#define MBI_WR_MASK		0X01000000
+
+#define MBI_MASK_HI		0xFFFFFF00
+#define MBI_MASK_LO		0x000000FF
+#define MBI_ENABLE		0xF0
+
+/* Baytrail available units */
+#define BT_MBI_UNIT_AUNIT	0x00
+#define BT_MBI_UNIT_SMC		0x01
+#define BT_MBI_UNIT_CPU		0x02
+#define BT_MBI_UNIT_BUNIT	0x03
+#define BT_MBI_UNIT_PMC		0x04
+#define BT_MBI_UNIT_GFX		0x06
+#define BT_MBI_UNIT_SMI		0x0C
+#define BT_MBI_UNIT_USB		0x43
+#define BT_MBI_UNIT_SATA	0xA3
+#define BT_MBI_UNIT_PCIE	0xA6
+
+/* Baytrail read/write opcodes */
+#define BT_MBI_AUNIT_READ	0x10
+#define BT_MBI_AUNIT_WRITE	0x11
+#define BT_MBI_SMC_READ		0x10
+#define BT_MBI_SMC_WRITE	0x11
+#define BT_MBI_CPU_READ		0x10
+#define BT_MBI_CPU_WRITE	0x11
+#define BT_MBI_BUNIT_READ	0x10
+#define BT_MBI_BUNIT_WRITE	0x11
+#define BT_MBI_PMC_READ		0x06
+#define BT_MBI_PMC_WRITE	0x07
+#define BT_MBI_GFX_READ		0x00
+#define BT_MBI_GFX_WRITE	0x01
+#define BT_MBI_SMIO_READ	0x06
+#define BT_MBI_SMIO_WRITE	0x07
+#define BT_MBI_USB_READ		0x06
+#define BT_MBI_USB_WRITE	0x07
+#define BT_MBI_SATA_READ	0x00
+#define BT_MBI_SATA_WRITE	0x01
+#define BT_MBI_PCIE_READ	0x00
+#define BT_MBI_PCIE_WRITE	0x01
+
+/**
+ * iosf_mbi_read() - MailBox Interface read command
+ * @port:	port indicating subunit being accessed
+ * @opcode:	port specific read or write opcode
+ * @offset:	register address offset
+ * @mdr:	register data to be read
+ *
+ * Locking is handled by spinlock - cannot sleep.
+ * Return: Nonzero on error
+ */
+int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr);
+
+/**
+ * iosf_mbi_write() - MailBox unmasked write command
+ * @port:	port indicating subunit being accessed
+ * @opcode:	port specific read or write opcode
+ * @offset:	register address offset
+ * @mdr:	register data to be written
+ *
+ * Locking is handled by spinlock - cannot sleep.
+ * Return: Nonzero on error
+ */
+int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr);
+
+/**
+ * iosf_mbi_modify() - MailBox masked write command
+ * @port:	port indicating subunit being accessed
+ * @opcode:	port specific read or write opcode
+ * @offset:	register address offset
+ * @mdr:	register data being modified
+ * @mask:	mask indicating bits in mdr to be modified
+ *
+ * Locking is handled by spinlock - cannot sleep.
+ * Return: Nonzero on error
+ */
+int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask);
+
+#endif /* IOSF_MBI_SYMS_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9b0a34e..dbe9bd6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -111,6 +111,7 @@ obj-$(CONFIG_EFI)			+= sysfb_efi.o
 
 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
+obj-$(CONFIG_IOSF_MBI)			+= iosf_mbi.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
new file mode 100644
index 0000000..c3aae66
--- /dev/null
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -0,0 +1,226 @@
+/*
+ * IOSF-SB MailBox Interface Driver
+ * Copyright (c) 2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ *
+ * The IOSF-SB is a fabric bus available on Atom based SOC's that uses a
+ * mailbox interface (MBI) to communicate with mutiple devices. This
+ * driver implements access to this interface for those platforms that can
+ * enumerate the device using PCI.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+
+#include <asm/iosf_mbi.h>
+
+static DEFINE_SPINLOCK(iosf_mbi_lock);
+
+static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
+{
+	return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE;
+}
+
+static struct pci_dev *mbi_pdev;	/* one mbi device */
+
+static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
+{
+	int result;
+
+	if (!mbi_pdev)
+		return -ENODEV;
+
+	if (mcrx) {
+		result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
+						mcrx);
+		if (result < 0)
+			goto fail_read;
+	}
+
+	result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
+	if (result < 0)
+		goto fail_read;
+
+	result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
+	if (result < 0)
+		goto fail_read;
+
+	return 0;
+
+fail_read:
+	dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
+	return result;
+}
+
+static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr)
+{
+	int result;
+
+	if (!mbi_pdev)
+		return -ENODEV;
+
+	result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
+	if (result < 0)
+		goto fail_write;
+
+	if (mcrx) {
+		result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
+						mcrx);
+		if (result < 0)
+			goto fail_write;
+	}
+
+	result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
+	if (result < 0)
+		goto fail_write;
+
+	return 0;
+
+fail_write:
+	dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
+	return result;
+}
+
+int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
+{
+	u32 mcr, mcrx;
+	unsigned long flags;
+	int ret;
+
+	/*Access to the GFX unit is handled by GPU code */
+	if (port == BT_MBI_UNIT_GFX) {
+		WARN_ON(1);
+		return -EPERM;
+	}
+
+	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+	mcrx = offset & MBI_MASK_HI;
+
+	spin_lock_irqsave(&iosf_mbi_lock, flags);
+	ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr);
+	spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_read);
+
+int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
+{
+	u32 mcr, mcrx;
+	unsigned long flags;
+	int ret;
+
+	/*Access to the GFX unit is handled by GPU code */
+	if (port == BT_MBI_UNIT_GFX) {
+		WARN_ON(1);
+		return -EPERM;
+	}
+
+	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+	mcrx = offset & MBI_MASK_HI;
+
+	spin_lock_irqsave(&iosf_mbi_lock, flags);
+	ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr);
+	spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_write);
+
+int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
+{
+	u32 mcr, mcrx;
+	u32 value;
+	unsigned long flags;
+	int ret;
+
+	/*Access to the GFX unit is handled by GPU code */
+	if (port == BT_MBI_UNIT_GFX) {
+		WARN_ON(1);
+		return -EPERM;
+	}
+
+	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+	mcrx = offset & MBI_MASK_HI;
+
+	spin_lock_irqsave(&iosf_mbi_lock, flags);
+
+	/* Read current mdr value */
+	ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value);
+	if (ret < 0) {
+		spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+		return ret;
+	}
+
+	/* Apply mask */
+	value &= ~mask;
+	mdr &= mask;
+	value |= mdr;
+
+	/* Write back */
+	ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value);
+
+	spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_modify);
+
+static int iosf_mbi_probe(struct pci_dev *pdev,
+			  const struct pci_device_id *unused)
+{
+	int ret;
+
+	ret = pci_enable_device(pdev);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "error: could not enable device\n");
+		return ret;
+	}
+
+	mbi_pdev = pci_dev_get(pdev);
+	return 0;
+}
+
+static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) },
+	{ 0, },
+};
+MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
+
+static struct pci_driver iosf_mbi_pci_driver = {
+	.name		= "iosf_mbi_pci",
+	.probe		= iosf_mbi_probe,
+	.id_table	= iosf_mbi_pci_ids,
+};
+
+static int __init iosf_mbi_init(void)
+{
+	return pci_register_driver(&iosf_mbi_pci_driver);
+}
+
+static void __exit iosf_mbi_exit(void)
+{
+	pci_unregister_driver(&iosf_mbi_pci_driver);
+	if (mbi_pdev) {
+		pci_dev_put(mbi_pdev);
+		mbi_pdev = NULL;
+	}
+}
+
+module_init(iosf_mbi_init);
+module_exit(iosf_mbi_exit);
+
+MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
+MODULE_DESCRIPTION("IOSF Mailbox Interface accessor");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.1


From 0c3351d451ae2fa438d5d1ed719fc43354fbffbb Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Thu, 2 Jan 2014 15:11:13 -0800
Subject: seqlock: Use raw_ prefix instead of _no_lockdep
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Linus disliked the _no_lockdep() naming, so instead
use the more-consistent raw_* prefix to the non-lockdep
enabled seqcount methods.

This also adds raw_ methods for the write operations
as well, which will be utilized in a following patch.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Krzysztof Hałasa <khalasa@piap.pl>
Cc: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Cc: Willy Tarreau <w@1wt.eu>
Link: http://lkml.kernel.org/r/1388704274-5278-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/vdso/vclock_gettime.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 2ada505..eb5d7a5 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -178,7 +178,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
 
 	ts->tv_nsec = 0;
 	do {
-		seq = read_seqcount_begin_no_lockdep(&gtod->seq);
+		seq = raw_read_seqcount_begin(&gtod->seq);
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->wall_time_sec;
 		ns = gtod->wall_time_snsec;
@@ -198,7 +198,7 @@ notrace static int do_monotonic(struct timespec *ts)
 
 	ts->tv_nsec = 0;
 	do {
-		seq = read_seqcount_begin_no_lockdep(&gtod->seq);
+		seq = raw_read_seqcount_begin(&gtod->seq);
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->monotonic_time_sec;
 		ns = gtod->monotonic_time_snsec;
@@ -214,7 +214,7 @@ notrace static int do_realtime_coarse(struct timespec *ts)
 {
 	unsigned long seq;
 	do {
-		seq = read_seqcount_begin_no_lockdep(&gtod->seq);
+		seq = raw_read_seqcount_begin(&gtod->seq);
 		ts->tv_sec = gtod->wall_time_coarse.tv_sec;
 		ts->tv_nsec = gtod->wall_time_coarse.tv_nsec;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
@@ -225,7 +225,7 @@ notrace static int do_monotonic_coarse(struct timespec *ts)
 {
 	unsigned long seq;
 	do {
-		seq = read_seqcount_begin_no_lockdep(&gtod->seq);
+		seq = raw_read_seqcount_begin(&gtod->seq);
 		ts->tv_sec = gtod->monotonic_time_coarse.tv_sec;
 		ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
-- 
cgit v1.1


From f228c5b882602697a1adb50d61ff688b0df1eced Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Wed, 8 Jan 2014 11:15:53 +0100
Subject: perf/x86/intel: Add Intel RAPL PP1 energy counter support

This patch adds support for the Intel RAPL energy counter
PP1 (Power Plane 1).

On client processors, it usually corresponds to the
energy consumption of the builtin graphic card. That
is why the sysfs event is called energy-gpu.

New event:
 - name: power/energy-gpu/
 - code: event=0x4
 - unit: 2^-32 Joules

On processors without graphics, this should count 0.
The patch only enables this event on client processors.

Reviewed-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: ak@linux.intel.com
Cc: acme@redhat.com
Cc: jolsa@redhat.com
Cc: zheng.z.yan@intel.com
Cc: bp@alien8.de
Cc: vincent.weaver@maine.edu
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1389176153-3128-3-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_rapl.c | 31 ++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 0e3754e..5ad35ad 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -27,6 +27,10 @@
  *	  event: rapl_energy_dram
  *    perf code: 0x3
  *
+ * dram counter: consumption of the builtin-gpu domain (client only)
+ *	  event: rapl_energy_gpu
+ *    perf code: 0x4
+ *
  * We manage those counters as free running (read-only). They may be
  * use simultaneously by other tools, such as turbostat.
  *
@@ -55,10 +59,13 @@
 #define INTEL_RAPL_PKG		0x2	/* pseudo-encoding */
 #define RAPL_IDX_RAM_NRG_STAT	2	/* DRAM */
 #define INTEL_RAPL_RAM		0x3	/* pseudo-encoding */
+#define RAPL_IDX_PP1_NRG_STAT	3	/* DRAM */
+#define INTEL_RAPL_PP1		0x4	/* pseudo-encoding */
 
 /* Clients have PP0, PKG */
 #define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\
-			 1<<RAPL_IDX_PKG_NRG_STAT)
+			 1<<RAPL_IDX_PKG_NRG_STAT|\
+			 1<<RAPL_IDX_PP1_NRG_STAT)
 
 /* Servers have PP0, PKG, RAM */
 #define RAPL_IDX_SRV	(1<<RAPL_IDX_PP0_NRG_STAT|\
@@ -315,6 +322,10 @@ static int rapl_pmu_event_init(struct perf_event *event)
 		bit = RAPL_IDX_RAM_NRG_STAT;
 		msr = MSR_DRAM_ENERGY_STATUS;
 		break;
+	case INTEL_RAPL_PP1:
+		bit = RAPL_IDX_PP1_NRG_STAT;
+		msr = MSR_PP1_ENERGY_STATUS;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -367,19 +378,22 @@ static struct attribute_group rapl_pmu_attr_group = {
 };
 
 EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
-EVENT_ATTR_STR(energy-pkg  , rapl_pkg, "event=0x02");
-EVENT_ATTR_STR(energy-ram  , rapl_ram, "event=0x03");
+EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
+EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
+EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
 
 EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
-EVENT_ATTR_STR(energy-pkg.unit  , rapl_pkg_unit, "Joules");
-EVENT_ATTR_STR(energy-ram.unit  , rapl_ram_unit, "Joules");
+EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
+EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
+EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
 
 /*
  * we compute in 0.23 nJ increments regardless of MSR
  */
 EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
-EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
-EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
 
 static struct attribute *rapl_events_srv_attr[] = {
 	EVENT_PTR(rapl_cores),
@@ -399,12 +413,15 @@ static struct attribute *rapl_events_srv_attr[] = {
 static struct attribute *rapl_events_cln_attr[] = {
 	EVENT_PTR(rapl_cores),
 	EVENT_PTR(rapl_pkg),
+	EVENT_PTR(rapl_gpu),
 
 	EVENT_PTR(rapl_cores_unit),
 	EVENT_PTR(rapl_pkg_unit),
+	EVENT_PTR(rapl_gpu_unit),
 
 	EVENT_PTR(rapl_cores_scale),
 	EVENT_PTR(rapl_pkg_scale),
+	EVENT_PTR(rapl_gpu_scale),
 	NULL,
 };
 
-- 
cgit v1.1


From 47933ad41a86a4a9b50bed7c9b9bd2ba242aac63 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 6 Nov 2013 14:57:36 +0100
Subject: arch: Introduce smp_load_acquire(), smp_store_release()

A number of situations currently require the heavyweight smp_mb(),
even though there is no need to order prior stores against later
loads.  Many architectures have much cheaper ways to handle these
situations, but the Linux kernel currently has no portable way
to make use of them.

This commit therefore supplies smp_load_acquire() and
smp_store_release() to remedy this situation.  The new
smp_load_acquire() primitive orders the specified load against
any subsequent reads or writes, while the new smp_store_release()
primitive orders the specifed store against any prior reads or
writes.  These primitives allow array-based circular FIFOs to be
implemented without an smp_mb(), and also allow a theoretical
hole in rcu_assign_pointer() to be closed at no additional
expense on most architectures.

In addition, the RCU experience transitioning from explicit
smp_read_barrier_depends() and smp_wmb() to rcu_dereference()
and rcu_assign_pointer(), respectively resulted in substantial
improvements in readability.  It therefore seems likely that
replacing other explicit barriers with smp_load_acquire() and
smp_store_release() will provide similar benefits.  It appears
that roughly half of the explicit barriers in core kernel code
might be so replaced.

[Changelog by PaulMck]

Reviewed-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Will Deacon <will.deacon@arm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Michael Ellerman <michael@ellerman.id.au>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Victor Kaplansky <VICTORK@il.ibm.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: http://lkml.kernel.org/r/20131213150640.908486364@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/barrier.h | 43 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index c6cd358..04a4890 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -92,12 +92,53 @@
 #endif
 #define smp_read_barrier_depends()	read_barrier_depends()
 #define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
-#else
+#else /* !SMP */
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
 #define smp_read_barrier_depends()	do { } while (0)
 #define set_mb(var, value) do { var = value; barrier(); } while (0)
+#endif /* SMP */
+
+#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
+
+/*
+ * For either of these options x86 doesn't have a strong TSO memory
+ * model and we should fall back to full barriers.
+ */
+
+#define smp_store_release(p, v)						\
+do {									\
+	compiletime_assert_atomic_type(*p);				\
+	smp_mb();							\
+	ACCESS_ONCE(*p) = (v);						\
+} while (0)
+
+#define smp_load_acquire(p)						\
+({									\
+	typeof(*p) ___p1 = ACCESS_ONCE(*p);				\
+	compiletime_assert_atomic_type(*p);				\
+	smp_mb();							\
+	___p1;								\
+})
+
+#else /* regular x86 TSO memory ordering */
+
+#define smp_store_release(p, v)						\
+do {									\
+	compiletime_assert_atomic_type(*p);				\
+	barrier();							\
+	ACCESS_ONCE(*p) = (v);						\
+} while (0)
+
+#define smp_load_acquire(p)						\
+({									\
+	typeof(*p) ___p1 = ACCESS_ONCE(*p);				\
+	compiletime_assert_atomic_type(*p);				\
+	barrier();							\
+	___p1;								\
+})
+
 #endif
 
 /*
-- 
cgit v1.1


From 9345005f4eed805308193658d12e4e7e9c261e74 Mon Sep 17 00:00:00 2001
From: Prarit Bhargava <prarit@redhat.com>
Date: Sun, 5 Jan 2014 11:10:52 -0500
Subject: x86/irq: Fix do_IRQ() interrupt warning for cpu hotplug retriggered
 irqs

During heavy CPU-hotplug operations the following spurious kernel warnings
can trigger:

  do_IRQ: No ... irq handler for vector (irq -1)

  [ See: https://bugzilla.kernel.org/show_bug.cgi?id=64831 ]

When downing a cpu it is possible that there are unhandled irqs
left in the APIC IRR register.  The following code path shows
how the problem can occur:

 1. CPU 5 is to go down.

 2. cpu_disable() on CPU 5 executes with interrupt flag cleared
    by local_irq_save() via stop_machine().

 3. IRQ 12 asserts on CPU 5, setting IRR but not ISR because
    interrupt flag is cleared (CPU unabled to handle the irq)

 4. IRQs are migrated off of CPU 5, and the vectors' irqs are set
    to -1. 5. stop_machine() finishes cpu_disable()

 6. cpu_die() for CPU 5 executes in normal context.

 7. CPU 5 attempts to handle IRQ 12 because the IRR is set for
    IRQ 12.  The code attempts to find the vector's IRQ and cannot
    because it has been set to -1. 8. do_IRQ() warning displays
    warning about CPU 5 IRQ 12.

I added a debug printk to output which CPU & vector was
retriggered and discovered that that we are getting bogus
events.  I see a 100% correlation between this debug printk in
fixup_irqs() and the do_IRQ() warning.

This patchset resolves this by adding definitions for
VECTOR_UNDEFINED(-1) and VECTOR_RETRIGGERED(-2) and modifying
the code to use them.

Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=64831
Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Reviewed-by: Rui Wang <rui.y.wang@intel.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Seiji Aguchi <seiji.aguchi@hds.com>
Cc: Yang Zhang <yang.z.zhang@Intel.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: janet.morgan@Intel.com
Cc: tony.luck@Intel.com
Cc: ruiv.wang@gmail.com
Link: http://lkml.kernel.org/r/1388938252-16627-1-git-send-email-prarit@redhat.com
[ Cleaned up the code a bit. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/hw_irq.h  |  3 +++
 arch/x86/kernel/apic/io_apic.c | 18 +++++++++---------
 arch/x86/kernel/irq.c          | 19 +++++++++++++------
 arch/x86/kernel/irqinit.c      |  4 ++--
 4 files changed, 27 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index cba45d9..67d69b8 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -191,6 +191,9 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
 #define trace_interrupt interrupt
 #endif
 
+#define VECTOR_UNDEFINED	-1
+#define VECTOR_RETRIGGERED	-2
+
 typedef int vector_irq_t[NR_VECTORS];
 DECLARE_PER_CPU(vector_irq_t, vector_irq);
 extern void setup_vector_irq(int cpu);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e63a5bd..6df0b66 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1142,9 +1142,10 @@ next:
 		if (test_bit(vector, used_vectors))
 			goto next;
 
-		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
-			if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+			if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED)
 				goto next;
+		}
 		/* Found one! */
 		current_vector = vector;
 		current_offset = offset;
@@ -1183,7 +1184,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 
 	vector = cfg->vector;
 	for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
-		per_cpu(vector_irq, cpu)[vector] = -1;
+		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 
 	cfg->vector = 0;
 	cpumask_clear(cfg->domain);
@@ -1191,11 +1192,10 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 	if (likely(!cfg->move_in_progress))
 		return;
 	for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
-		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
-								vector++) {
+		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 			if (per_cpu(vector_irq, cpu)[vector] != irq)
 				continue;
-			per_cpu(vector_irq, cpu)[vector] = -1;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 			break;
 		}
 	}
@@ -1228,12 +1228,12 @@ void __setup_vector_irq(int cpu)
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
 		irq = per_cpu(vector_irq, cpu)[vector];
-		if (irq < 0)
+		if (irq <= VECTOR_UNDEFINED)
 			continue;
 
 		cfg = irq_cfg(irq);
 		if (!cpumask_test_cpu(cpu, cfg->domain))
-			per_cpu(vector_irq, cpu)[vector] = -1;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 	}
 	raw_spin_unlock(&vector_lock);
 }
@@ -2208,7 +2208,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 		struct irq_cfg *cfg;
 		irq = __this_cpu_read(vector_irq[vector]);
 
-		if (irq == -1)
+		if (irq <= VECTOR_UNDEFINED)
 			continue;
 
 		desc = irq_to_desc(irq);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 22d0687..884d875 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -193,9 +193,13 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	if (!handle_irq(irq, regs)) {
 		ack_APIC_irq();
 
-		if (printk_ratelimit())
-			pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
-				__func__, smp_processor_id(), vector, irq);
+		if (irq != VECTOR_RETRIGGERED) {
+			pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
+					     __func__, smp_processor_id(),
+					     vector, irq);
+		} else {
+			__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+		}
 	}
 
 	irq_exit();
@@ -344,7 +348,7 @@ void fixup_irqs(void)
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 		unsigned int irr;
 
-		if (__this_cpu_read(vector_irq[vector]) < 0)
+		if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
 			continue;
 
 		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -355,11 +359,14 @@ void fixup_irqs(void)
 			data = irq_desc_get_irq_data(desc);
 			chip = irq_data_get_irq_chip(data);
 			raw_spin_lock(&desc->lock);
-			if (chip->irq_retrigger)
+			if (chip->irq_retrigger) {
 				chip->irq_retrigger(data);
+				__this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
+			}
 			raw_spin_unlock(&desc->lock);
 		}
-		__this_cpu_write(vector_irq[vector], -1);
+		if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
+			__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
 	}
 }
 #endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a2a1fbc..7f50156 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -52,7 +52,7 @@ static struct irqaction irq2 = {
 };
 
 DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-	[0 ... NR_VECTORS - 1] = -1,
+	[0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,
 };
 
 int vector_used_by_percpu_irq(unsigned int vector)
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
 	int cpu;
 
 	for_each_online_cpu(cpu) {
-		if (per_cpu(vector_irq, cpu)[vector] != -1)
+		if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)
 			return 1;
 	}
 
-- 
cgit v1.1


From 4f75d8412792777a314ac5c1393a9ed43d695fd1 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 23 Dec 2013 18:05:02 +0100
Subject: x86, mce: Fix mce_start_timer semantics

So mce_start_timer() has a 'cpu' argument which is supposed to mean to
start a timer on that cpu. However, the code currently starts a timer on
the *current* cpu the function runs on and causes the sanity-check in
mce_timer_fn to fire:

	WARNING: CPU: 0 PID: 0 at arch/x86/kernel/cpu/mcheck/mce.c:1286 mce_timer_fn

because it is running on the wrong cpu.

This was triggered by Prarit Bhargava <prarit@redhat.com> by offlining
all the cpus in succession.

Then, we were fiddling with the CMCI storm settings when starting the
timer whereas there's no need for that - if there's storm happening
on this newly restarted cpu, we're going to be in normal CMCI mode
initially and then when the CMCI interrupt starts firing, we're going to
go to the polling mode with the timer real soon.

Signed-off-by: Borislav Petkov <bp@suse.de>
Tested-by: Prarit Bhargava <prarit@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Reviewed-by: Chen, Gong <gong.chen@linux.intel.com>
Link: http://lkml.kernel.org/r/1387722156-5511-1-git-send-email-prarit@redhat.com
---
 arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a389c1d..4d5419b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1638,15 +1638,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 
 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
 {
-	unsigned long iv = mce_adjust_timer(check_interval * HZ);
-
-	__this_cpu_write(mce_next_interval, iv);
+	unsigned long iv = check_interval * HZ;
 
 	if (mca_cfg.ignore_ce || !iv)
 		return;
 
+	per_cpu(mce_next_interval, cpu) = iv;
+
 	t->expires = round_jiffies(jiffies + iv);
-	add_timer_on(t, smp_processor_id());
+	add_timer_on(t, cpu);
 }
 
 static void __mcheck_cpu_init_timer(void)
-- 
cgit v1.1


From 106553aa4ead528b1bab37fb3cc47004a4451acc Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Sun, 12 Jan 2014 15:52:21 +0100
Subject: um, x86: Fix vDSO build

Commit 663b55b9b39 ("x86: Delete non-required instances of include <linux/init.h>")
broke the UML build.

  arch/x86/um/vdso/vdso.S: Assembler messages:
  arch/x86/um/vdso/vdso.S:2: Error: no such instruction: `__initdata'
  arch/x86/um/vdso/vdso.S:9: Error: no such instruction: `__finit'

UML's vDSO needs linux/init.h.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Richard Weinberger <richard@nod.at>
Cc: user-mode-linux-devel@lists.sourceforge.net
Cc: paul.gortmaker@windriver.com
Link: http://lkml.kernel.org/r/1389538341-31383-1-git-send-email-richard@nod.at
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/um/vdso/vdso.S | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/um/vdso/vdso.S b/arch/x86/um/vdso/vdso.S
index 4b4bd4c..1cb468a 100644
--- a/arch/x86/um/vdso/vdso.S
+++ b/arch/x86/um/vdso/vdso.S
@@ -1,3 +1,4 @@
+#include <linux/init.h>
 
 __INITDATA
 
-- 
cgit v1.1


From d50dde5a10f305253cbc3855307f608f8a3c5f73 Mon Sep 17 00:00:00 2001
From: Dario Faggioli <raistlin@linux.it>
Date: Thu, 7 Nov 2013 14:43:36 +0100
Subject: sched: Add new scheduler syscalls to support an extended scheduling
 parameters ABI

Add the syscalls needed for supporting scheduling algorithms
with extended scheduling parameters (e.g., SCHED_DEADLINE).

In general, it makes possible to specify a periodic/sporadic task,
that executes for a given amount of runtime at each instance, and is
scheduled according to the urgency of their own timing constraints,
i.e.:

 - a (maximum/typical) instance execution time,
 - a minimum interval between consecutive instances,
 - a time constraint by which each instance must be completed.

Thus, both the data structure that holds the scheduling parameters of
the tasks and the system calls dealing with it must be extended.
Unfortunately, modifying the existing struct sched_param would break
the ABI and result in potentially serious compatibility issues with
legacy binaries.

For these reasons, this patch:

 - defines the new struct sched_attr, containing all the fields
   that are necessary for specifying a task in the computational
   model described above;

 - defines and implements the new scheduling related syscalls that
   manipulate it, i.e., sched_setattr() and sched_getattr().

Syscalls are introduced for x86 (32 and 64 bits) and ARM only, as a
proof of concept and for developing and testing purposes. Making them
available on other architectures is straightforward.

Since no "user" for these new parameters is introduced in this patch,
the implementation of the new system calls is just identical to their
already existing counterpart. Future patches that implement scheduling
policies able to exploit the new data structure must also take care of
modifying the sched_*attr() calls accordingly with their own purposes.

Signed-off-by: Dario Faggioli <raistlin@linux.it>
[ Rewrote to use sched_attr. ]
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
[ Removed sched_setscheduler2() for now. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/syscalls/syscall_32.tbl | 2 ++
 arch/x86/syscalls/syscall_64.tbl | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index aabfb83..96bc506 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -357,3 +357,5 @@
 348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
 349	i386	kcmp			sys_kcmp
 350	i386	finit_module		sys_finit_module
+351	i386	sched_setattr		sys_sched_setattr
+352	i386	sched_getattr		sys_sched_getattr
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 38ae65d..a12bddc 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -320,6 +320,8 @@
 311	64	process_vm_writev	sys_process_vm_writev
 312	common	kcmp			sys_kcmp
 313	common	finit_module		sys_finit_module
+314	common	sched_setattr		sys_sched_setattr
+315	common	sched_getattr		sys_sched_getattr
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
-- 
cgit v1.1


From 5dd12c2152743747ca9f50ef80281e54cc416dc0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 29 Nov 2013 18:04:39 +0100
Subject: sched/clock, x86: Use mul_u64_u32_shr() for native_sched_clock()

Use mul_u64_u32_shr() so that x86_64 can use a single 64x64->128 mul.

Before:

0000000000000560 <native_sched_clock>:
 560:   44 8b 1d 00 00 00 00    mov    0x0(%rip),%r11d        # 567 <native_sched_clock+0x7>
 567:   55                      push   %rbp
 568:   48 89 e5                mov    %rsp,%rbp
 56b:   45 85 db                test   %r11d,%r11d
 56e:   75 4f                   jne    5bf <native_sched_clock+0x5f>
 570:   0f 31                   rdtsc
 572:   89 c0                   mov    %eax,%eax
 574:   48 c1 e2 20             shl    $0x20,%rdx
 578:   48 c7 c1 00 00 00 00    mov    $0x0,%rcx
 57f:   48 09 c2                or     %rax,%rdx
 582:   48 c7 c7 00 00 00 00    mov    $0x0,%rdi
 589:   65 8b 04 25 00 00 00    mov    %gs:0x0,%eax
 590:   00
 591:   48 98                   cltq
 593:   48 8b 34 c5 00 00 00    mov    0x0(,%rax,8),%rsi
 59a:   00
 59b:   48 89 d0                mov    %rdx,%rax
 59e:   81 e2 ff 03 00 00       and    $0x3ff,%edx
 5a4:   48 c1 e8 0a             shr    $0xa,%rax
 5a8:   48 0f af 14 0e          imul   (%rsi,%rcx,1),%rdx
 5ad:   48 0f af 04 0e          imul   (%rsi,%rcx,1),%rax
 5b2:   5d                      pop    %rbp
 5b3:   48 03 04 3e             add    (%rsi,%rdi,1),%rax
 5b7:   48 c1 ea 0a             shr    $0xa,%rdx
 5bb:   48 01 d0                add    %rdx,%rax
 5be:   c3                      retq

After:

0000000000000550 <native_sched_clock>:
 550:   8b 3d 00 00 00 00       mov    0x0(%rip),%edi        # 556 <native_sched_clock+0x6>
 556:   55                      push   %rbp
 557:   48 89 e5                mov    %rsp,%rbp
 55a:   48 83 e4 f0             and    $0xfffffffffffffff0,%rsp
 55e:   85 ff                   test   %edi,%edi
 560:   75 2c                   jne    58e <native_sched_clock+0x3e>
 562:   0f 31                   rdtsc
 564:   89 c0                   mov    %eax,%eax
 566:   48 c1 e2 20             shl    $0x20,%rdx
 56a:   48 09 c2                or     %rax,%rdx
 56d:   65 48 8b 04 25 00 00    mov    %gs:0x0,%rax
 574:   00 00
 576:   89 c0                   mov    %eax,%eax
 578:   48 f7 e2                mul    %rdx
 57b:   65 48 8b 0c 25 00 00    mov    %gs:0x0,%rcx
 582:   00 00
 584:   c9                      leaveq
 585:   48 0f ac d0 0a          shrd   $0xa,%rdx,%rax
 58a:   48 01 c8                add    %rcx,%rax
 58d:   c3                      retq

                        MAINLINE   POST

    sched_clock_stable: 1	   1
    (cold) sched_clock: 329841     331312
    (cold) local_clock: 301773     310296
    (warm) sched_clock: 38375      38247
    (warm) local_clock: 100371     102713
    (warm) rdtsc:       27340      27289
    sched_clock_stable: 0          0
    (cold) sched_clock: 382634     372706
    (cold) local_clock: 396890     399275
    (warm) sched_clock: 38194      38124
    (warm) local_clock: 143452     148698
    (warm) rdtsc:       27345      27365

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-piu203ses5y1g36bnyw2n16x@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/timer.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 34baa0e..10a78c0 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -4,6 +4,7 @@
 #include <linux/pm.h>
 #include <linux/percpu.h>
 #include <linux/interrupt.h>
+#include <linux/math64.h>
 
 #define TICK_SIZE (tick_nsec / 1000)
 
@@ -57,10 +58,8 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
 
 static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
 {
-	int cpu = smp_processor_id();
-	unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
-	ns += mult_frac(cyc, per_cpu(cyc2ns, cpu),
-			(1UL << CYC2NS_SCALE_FACTOR));
+	unsigned long long ns = this_cpu_read(cyc2ns_offset);
+	ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR);
 	return ns;
 }
 
-- 
cgit v1.1


From 57c67da274f3fab38e08d2c9edf08b89e1d9c71d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 29 Nov 2013 15:39:25 +0100
Subject: sched/clock, x86: Move some cyc2ns() code around

There are no __cycles_2_ns() users outside of arch/x86/kernel/tsc.c,
so move it there.

There are no cycles_2_ns() users.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-01lslnavfgo3kmbo4532zlcj@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/timer.h |  59 -----------------------
 arch/x86/kernel/tsc.c        | 112 +++++++++++++++++++++++--------------------
 2 files changed, 61 insertions(+), 110 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 10a78c0..b4c6676 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -13,66 +13,7 @@ extern int recalibrate_cpu_khz(void);
 
 extern int no_timer_check;
 
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- *  basic equation:
- *		ns = cycles / (freq / ns_per_sec)
- *		ns = cycles * (ns_per_sec / freq)
- *		ns = cycles * (10^9 / (cpu_khz * 10^3))
- *		ns = cycles * (10^6 / cpu_khz)
- *
- *	Then we use scaling math (suggested by george@mvista.com) to get:
- *		ns = cycles * (10^6 * SC / cpu_khz) / SC
- *		ns = cycles * cyc2ns_scale / SC
- *
- *	And since SC is a constant power of two, we can convert the div
- *  into a shift.
- *
- *  We can use khz divisor instead of mhz to keep a better precision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- *  (mathieu.desnoyers@polymtl.ca)
- *
- *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
- *
- * In:
- *
- * ns = cycles * cyc2ns_scale / SC
- *
- * Although we may still have enough bits to store the value of ns,
- * in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
- * leading to an incorrect result.
- *
- * To avoid this, we can decompose 'cycles' into quotient and remainder
- * of division by SC.  Then,
- *
- * ns = (quot * SC + rem) * cyc2ns_scale / SC
- *    = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
- *
- *			- sqazi@google.com
- */
-
 DECLARE_PER_CPU(unsigned long, cyc2ns);
 DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
 
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
-static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
-{
-	unsigned long long ns = this_cpu_read(cyc2ns_offset);
-	ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR);
-	return ns;
-}
-
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-	unsigned long long ns;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	ns = __cycles_2_ns(cyc);
-	local_irq_restore(flags);
-
-	return ns;
-}
-
 #endif /* _ASM_X86_TIMER_H */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 930e5d4..b4a04ac 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -38,6 +38,66 @@ static int __read_mostly tsc_unstable;
 static int __read_mostly tsc_disabled = -1;
 
 int tsc_clocksource_reliable;
+
+/* Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ *  basic equation:
+ *              ns = cycles / (freq / ns_per_sec)
+ *              ns = cycles * (ns_per_sec / freq)
+ *              ns = cycles * (10^9 / (cpu_khz * 10^3))
+ *              ns = cycles * (10^6 / cpu_khz)
+ *
+ *      Then we use scaling math (suggested by george@mvista.com) to get:
+ *              ns = cycles * (10^6 * SC / cpu_khz) / SC
+ *              ns = cycles * cyc2ns_scale / SC
+ *
+ *      And since SC is a constant power of two, we can convert the div
+ *  into a shift.
+ *
+ *  We can use khz divisor instead of mhz to keep a better precision, since
+ *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  (mathieu.desnoyers@polymtl.ca)
+ *
+ *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+DEFINE_PER_CPU(unsigned long, cyc2ns);
+DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+	unsigned long long ns = this_cpu_read(cyc2ns_offset);
+	ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR);
+	return ns;
+}
+
+static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+{
+	unsigned long long tsc_now, ns_now, *offset;
+	unsigned long flags, *scale;
+
+	local_irq_save(flags);
+	sched_clock_idle_sleep_event();
+
+	scale = &per_cpu(cyc2ns, cpu);
+	offset = &per_cpu(cyc2ns_offset, cpu);
+
+	rdtscll(tsc_now);
+	ns_now = cycles_2_ns(tsc_now);
+
+	if (cpu_khz) {
+		*scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
+				cpu_khz / 2) / cpu_khz;
+		*offset = ns_now - mult_frac(tsc_now, *scale,
+					     (1UL << CYC2NS_SCALE_FACTOR));
+	}
+
+	sched_clock_idle_wakeup_event(0);
+	local_irq_restore(flags);
+}
+
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
@@ -62,7 +122,7 @@ u64 native_sched_clock(void)
 	rdtscll(this_offset);
 
 	/* return the value in ns */
-	return __cycles_2_ns(this_offset);
+	return cycles_2_ns(this_offset);
 }
 
 /* We need to define a real function for sched_clock, to override the
@@ -589,56 +649,6 @@ int recalibrate_cpu_khz(void)
 EXPORT_SYMBOL(recalibrate_cpu_khz);
 
 
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- *  basic equation:
- *              ns = cycles / (freq / ns_per_sec)
- *              ns = cycles * (ns_per_sec / freq)
- *              ns = cycles * (10^9 / (cpu_khz * 10^3))
- *              ns = cycles * (10^6 / cpu_khz)
- *
- *      Then we use scaling math (suggested by george@mvista.com) to get:
- *              ns = cycles * (10^6 * SC / cpu_khz) / SC
- *              ns = cycles * cyc2ns_scale / SC
- *
- *      And since SC is a constant power of two, we can convert the div
- *  into a shift.
- *
- *  We can use khz divisor instead of mhz to keep a better precision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- *  (mathieu.desnoyers@polymtl.ca)
- *
- *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-
-DEFINE_PER_CPU(unsigned long, cyc2ns);
-DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
-
-static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
-{
-	unsigned long long tsc_now, ns_now, *offset;
-	unsigned long flags, *scale;
-
-	local_irq_save(flags);
-	sched_clock_idle_sleep_event();
-
-	scale = &per_cpu(cyc2ns, cpu);
-	offset = &per_cpu(cyc2ns_offset, cpu);
-
-	rdtscll(tsc_now);
-	ns_now = __cycles_2_ns(tsc_now);
-
-	if (cpu_khz) {
-		*scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
-				cpu_khz / 2) / cpu_khz;
-		*offset = ns_now - mult_frac(tsc_now, *scale,
-					     (1UL << CYC2NS_SCALE_FACTOR));
-	}
-
-	sched_clock_idle_wakeup_event(0);
-	local_irq_restore(flags);
-}
-
 static unsigned long long cyc2ns_suspend;
 
 void tsc_save_sched_clock_state(void)
-- 
cgit v1.1


From c7a730fa4624092e2d1c0cb7b750816e87c32364 Mon Sep 17 00:00:00 2001
From: Prarit Bhargava <prarit@redhat.com>
Date: Mon, 13 Jan 2014 08:40:20 -0500
Subject: x86/irq: Fix kbuild warning in smp_irq_move_cleanup_interrupt()

Fengguang Wu's 0day kernel build service reported the following build warning:

  arch/x86/kernel/apic/io_apic.c:2211
  smp_irq_move_cleanup_interrupt() warn: always true condition '(irq <= -1) => (0-u32max <= (-1))'

because irq is defined as an unsigned int instead of an int.
Fix this trivial error by redefining irq as a signed int.  The
remaining consumers of the int are okay.

Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Link: http://lkml.kernel.org/r/1389620420-7110-1-git-send-email-prarit@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/io_apic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 6df0b66..a43f068 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2202,7 +2202,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
-		unsigned int irq;
+		int irq;
 		unsigned int irr;
 		struct irq_desc *desc;
 		struct irq_cfg *cfg;
-- 
cgit v1.1


From 20d1c86a57762f0a33a78988e3fc8818316badd4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 29 Nov 2013 15:40:29 +0100
Subject: sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs

Use a ring-buffer like multi-version object structure which allows
always having a coherent object; we use this to avoid having to
disable IRQs while reading sched_clock() and avoids a problem when
getting an NMI while changing the cyc2ns data.

                        MAINLINE   PRE        POST

    sched_clock_stable: 1          1          1
    (cold) sched_clock: 329841     331312     257223
    (cold) local_clock: 301773     310296     309889
    (warm) sched_clock: 38375      38247      25280
    (warm) local_clock: 100371     102713     85268
    (warm) rdtsc:       27340      27289      24247
    sched_clock_stable: 0          0          0
    (cold) sched_clock: 382634     372706     301224
    (cold) local_clock: 396890     399275     399870
    (warm) sched_clock: 38194      38124      25630
    (warm) local_clock: 143452     148698     129629
    (warm) rdtsc:       27345      27365      24307

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-s567in1e5ekq2nlyhn8f987r@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/timer.h     |  23 +++-
 arch/x86/kernel/cpu/perf_event.c |  14 ++-
 arch/x86/kernel/tsc.c            | 229 +++++++++++++++++++++++++++++++++++----
 arch/x86/platform/uv/tlb_uv.c    |  66 ++++++-----
 4 files changed, 276 insertions(+), 56 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index b4c6676..3de54ef 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -13,7 +13,26 @@ extern int recalibrate_cpu_khz(void);
 
 extern int no_timer_check;
 
-DECLARE_PER_CPU(unsigned long, cyc2ns);
-DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
+/*
+ * We use the full linear equation: f(x) = a + b*x, in order to allow
+ * a continuous function in the face of dynamic freq changes.
+ *
+ * Continuity means that when our frequency changes our slope (b); we want to
+ * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t.
+ *
+ * Without an offset (a) the above would not be possible.
+ *
+ * See the comment near cycles_2_ns() for details on how we compute (b).
+ */
+struct cyc2ns_data {
+	u32 cyc2ns_mul;
+	u32 cyc2ns_shift;
+	u64 cyc2ns_offset;
+	u32 __count;
+	/* u32 hole */
+}; /* 24 bytes -- do not grow */
+
+extern struct cyc2ns_data *cyc2ns_read_begin(void);
+extern void cyc2ns_read_end(struct cyc2ns_data *);
 
 #endif /* _ASM_X86_TIMER_H */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8e13293..9f97bd0 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1883,6 +1883,8 @@ static struct pmu pmu = {
 
 void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 {
+	struct cyc2ns_data *data;
+
 	userpg->cap_user_time = 0;
 	userpg->cap_user_time_zero = 0;
 	userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
@@ -1891,13 +1893,17 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 	if (!sched_clock_stable)
 		return;
 
+	data = cyc2ns_read_begin();
+
 	userpg->cap_user_time = 1;
-	userpg->time_mult = this_cpu_read(cyc2ns);
-	userpg->time_shift = CYC2NS_SCALE_FACTOR;
-	userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
+	userpg->time_mult = data->cyc2ns_mul;
+	userpg->time_shift = data->cyc2ns_shift;
+	userpg->time_offset = data->cyc2ns_offset - now;
 
 	userpg->cap_user_time_zero = 1;
-	userpg->time_zero = this_cpu_read(cyc2ns_offset);
+	userpg->time_zero = data->cyc2ns_offset;
+
+	cyc2ns_read_end(data);
 }
 
 /*
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index b4a04ac..92b090b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -39,7 +39,119 @@ static int __read_mostly tsc_disabled = -1;
 
 int tsc_clocksource_reliable;
 
-/* Accelerators for sched_clock()
+/*
+ * Use a ring-buffer like data structure, where a writer advances the head by
+ * writing a new data entry and a reader advances the tail when it observes a
+ * new entry.
+ *
+ * Writers are made to wait on readers until there's space to write a new
+ * entry.
+ *
+ * This means that we can always use an {offset, mul} pair to compute a ns
+ * value that is 'roughly' in the right direction, even if we're writing a new
+ * {offset, mul} pair during the clock read.
+ *
+ * The down-side is that we can no longer guarantee strict monotonicity anymore
+ * (assuming the TSC was that to begin with), because while we compute the
+ * intersection point of the two clock slopes and make sure the time is
+ * continuous at the point of switching; we can no longer guarantee a reader is
+ * strictly before or after the switch point.
+ *
+ * It does mean a reader no longer needs to disable IRQs in order to avoid
+ * CPU-Freq updates messing with his times, and similarly an NMI reader will
+ * no longer run the risk of hitting half-written state.
+ */
+
+struct cyc2ns {
+	struct cyc2ns_data data[2];	/*  0 + 2*24 = 48 */
+	struct cyc2ns_data *head;	/* 48 + 8    = 56 */
+	struct cyc2ns_data *tail;	/* 56 + 8    = 64 */
+}; /* exactly fits one cacheline */
+
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
+
+struct cyc2ns_data *cyc2ns_read_begin(void)
+{
+	struct cyc2ns_data *head;
+
+	preempt_disable();
+
+	head = this_cpu_read(cyc2ns.head);
+	/*
+	 * Ensure we observe the entry when we observe the pointer to it.
+	 * matches the wmb from cyc2ns_write_end().
+	 */
+	smp_read_barrier_depends();
+	head->__count++;
+	barrier();
+
+	return head;
+}
+
+void cyc2ns_read_end(struct cyc2ns_data *head)
+{
+	barrier();
+	/*
+	 * If we're the outer most nested read; update the tail pointer
+	 * when we're done. This notifies possible pending writers
+	 * that we've observed the head pointer and that the other
+	 * entry is now free.
+	 */
+	if (!--head->__count) {
+		/*
+		 * x86-TSO does not reorder writes with older reads;
+		 * therefore once this write becomes visible to another
+		 * cpu, we must be finished reading the cyc2ns_data.
+		 *
+		 * matches with cyc2ns_write_begin().
+		 */
+		this_cpu_write(cyc2ns.tail, head);
+	}
+	preempt_enable();
+}
+
+/*
+ * Begin writing a new @data entry for @cpu.
+ *
+ * Assumes some sort of write side lock; currently 'provided' by the assumption
+ * that cpufreq will call its notifiers sequentially.
+ */
+static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+{
+	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+	struct cyc2ns_data *data = c2n->data;
+
+	if (data == c2n->head)
+		data++;
+
+	/* XXX send an IPI to @cpu in order to guarantee a read? */
+
+	/*
+	 * When we observe the tail write from cyc2ns_read_end(),
+	 * the cpu must be done with that entry and its safe
+	 * to start writing to it.
+	 */
+	while (c2n->tail == data)
+		cpu_relax();
+
+	return data;
+}
+
+static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+{
+	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+
+	/*
+	 * Ensure the @data writes are visible before we publish the
+	 * entry. Matches the data-depencency in cyc2ns_read_begin().
+	 */
+	smp_wmb();
+
+	ACCESS_ONCE(c2n->head) = data;
+}
+
+/*
+ * Accelerators for sched_clock()
  * convert from cycles(64bits) => nanoseconds (64bits)
  *  basic equation:
  *              ns = cycles / (freq / ns_per_sec)
@@ -61,49 +173,106 @@ int tsc_clocksource_reliable;
  *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
 
-DEFINE_PER_CPU(unsigned long, cyc2ns);
-DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
-
 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
 
+static void cyc2ns_data_init(struct cyc2ns_data *data)
+{
+	data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR;
+	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	data->cyc2ns_offset = 0;
+	data->__count = 0;
+}
+
+static void cyc2ns_init(int cpu)
+{
+	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+
+	cyc2ns_data_init(&c2n->data[0]);
+	cyc2ns_data_init(&c2n->data[1]);
+
+	c2n->head = c2n->data;
+	c2n->tail = c2n->data;
+}
+
 static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 {
-	unsigned long long ns = this_cpu_read(cyc2ns_offset);
-	ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR);
+	struct cyc2ns_data *data, *tail;
+	unsigned long long ns;
+
+	/*
+	 * See cyc2ns_read_*() for details; replicated in order to avoid
+	 * an extra few instructions that came with the abstraction.
+	 * Notable, it allows us to only do the __count and tail update
+	 * dance when its actually needed.
+	 */
+
+	preempt_disable();
+	data = this_cpu_read(cyc2ns.head);
+	tail = this_cpu_read(cyc2ns.tail);
+
+	if (likely(data == tail)) {
+		ns = data->cyc2ns_offset;
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+	} else {
+		data->__count++;
+
+		barrier();
+
+		ns = data->cyc2ns_offset;
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+
+		barrier();
+
+		if (!--data->__count)
+			this_cpu_write(cyc2ns.tail, data);
+	}
+	preempt_enable();
+
 	return ns;
 }
 
+/* XXX surely we already have this someplace in the kernel?! */
+#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
+
 static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 {
-	unsigned long long tsc_now, ns_now, *offset;
-	unsigned long flags, *scale;
+	unsigned long long tsc_now, ns_now;
+	struct cyc2ns_data *data;
+	unsigned long flags;
 
 	local_irq_save(flags);
 	sched_clock_idle_sleep_event();
 
-	scale = &per_cpu(cyc2ns, cpu);
-	offset = &per_cpu(cyc2ns_offset, cpu);
+	if (!cpu_khz)
+		goto done;
+
+	data = cyc2ns_write_begin(cpu);
 
 	rdtscll(tsc_now);
 	ns_now = cycles_2_ns(tsc_now);
 
-	if (cpu_khz) {
-		*scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
-				cpu_khz / 2) / cpu_khz;
-		*offset = ns_now - mult_frac(tsc_now, *scale,
-					     (1UL << CYC2NS_SCALE_FACTOR));
-	}
+	/*
+	 * Compute a new multiplier as per the above comment and ensure our
+	 * time function is continuous; see the comment near struct
+	 * cyc2ns_data.
+	 */
+	data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
+	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	data->cyc2ns_offset = ns_now -
+		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+
+	cyc2ns_write_end(cpu, data);
 
+done:
 	sched_clock_idle_wakeup_event(0);
 	local_irq_restore(flags);
 }
-
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
 u64 native_sched_clock(void)
 {
-	u64 this_offset;
+	u64 tsc_now;
 
 	/*
 	 * Fall back to jiffies if there's no TSC available:
@@ -119,10 +288,10 @@ u64 native_sched_clock(void)
 	}
 
 	/* read the Time Stamp Counter: */
-	rdtscll(this_offset);
+	rdtscll(tsc_now);
 
 	/* return the value in ns */
-	return cycles_2_ns(this_offset);
+	return cycles_2_ns(tsc_now);
 }
 
 /* We need to define a real function for sched_clock, to override the
@@ -678,11 +847,21 @@ void tsc_restore_sched_clock_state(void)
 
 	local_irq_save(flags);
 
-	__this_cpu_write(cyc2ns_offset, 0);
+	/*
+	 * We're comming out of suspend, there's no concurrency yet; don't
+	 * bother being nice about the RCU stuff, just write to both
+	 * data fields.
+	 */
+
+	this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
+	this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
+
 	offset = cyc2ns_suspend - sched_clock();
 
-	for_each_possible_cpu(cpu)
-		per_cpu(cyc2ns_offset, cpu) = offset;
+	for_each_possible_cpu(cpu) {
+		per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
+		per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
+	}
 
 	local_irq_restore(flags);
 }
@@ -1005,8 +1184,10 @@ void __init tsc_init(void)
 	 * speed as the bootup CPU. (cpufreq notifiers will fix this
 	 * up if their speed diverges)
 	 */
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
+		cyc2ns_init(cpu);
 		set_cyc2ns_scale(cpu_khz, cpu);
+	}
 
 	if (tsc_disabled > 0)
 		return;
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index efe4d72..dfe605a 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -433,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
 	return;
 }
 
-static inline unsigned long cycles_2_us(unsigned long long cyc)
+/*
+ * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative
+ * number, not an absolute. It converts a duration in cycles to a duration in
+ * ns.
+ */
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 {
+	struct cyc2ns_data *data = cyc2ns_read_begin();
 	unsigned long long ns;
-	unsigned long us;
-	int cpu = smp_processor_id();
 
-	ns =  (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
-	us = ns / 1000;
-	return us;
+	ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+
+	cyc2ns_read_end(data);
+	return ns;
+}
+
+/*
+ * The reverse of the above; converts a duration in ns to a duration in cycles.
+ */ 
+static inline unsigned long long ns_2_cycles(unsigned long long ns)
+{
+	struct cyc2ns_data *data = cyc2ns_read_begin();
+	unsigned long long cyc;
+
+	cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
+
+	cyc2ns_read_end(data);
+	return cyc;
+}
+
+static inline unsigned long cycles_2_us(unsigned long long cyc)
+{
+	return cycles_2_ns(cyc) / NSEC_PER_USEC;
+}
+
+static inline cycles_t sec_2_cycles(unsigned long sec)
+{
+	return ns_2_cycles(sec * NSEC_PER_SEC);
+}
+
+static inline unsigned long long usec_2_cycles(unsigned long usec)
+{
+	return ns_2_cycles(usec * NSEC_PER_USEC);
 }
 
 /*
@@ -668,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc,
 								bcp, try);
 }
 
-static inline cycles_t sec_2_cycles(unsigned long sec)
-{
-	unsigned long ns;
-	cycles_t cyc;
-
-	ns = sec * 1000000000;
-	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
-	return cyc;
-}
-
 /*
  * Our retries are blocked by all destination sw ack resources being
  * in use, and a timeout is pending. In that case hardware immediately
@@ -1327,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data)
 {
 }
 
-static inline unsigned long long usec_2_cycles(unsigned long microsec)
-{
-	unsigned long ns;
-	unsigned long long cyc;
-
-	ns = microsec * 1000;
-	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
-	return cyc;
-}
-
 /*
  * Display the statistics thru /proc/sgi_uv/ptc_statistics
  * 'data' points to the cpu number
-- 
cgit v1.1


From 35af99e646c7f7ea46dc2977601e9e71a51dadd5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 28 Nov 2013 19:38:42 +0100
Subject: sched/clock, x86: Use a static_key for sched_clock_stable

In order to avoid the runtime condition and variable load turn
sched_clock_stable into a static_key.

Also provide a shorter implementation of local_clock() and
cpu_clock(int) when sched_clock_stable==1.

                        MAINLINE   PRE       POST

    sched_clock_stable: 1          1         1
    (cold) sched_clock: 329841     221876    215295
    (cold) local_clock: 301773     234692    220773
    (warm) sched_clock: 38375      25602     25659
    (warm) local_clock: 100371     33265     27242
    (warm) rdtsc:       27340      24214     24208
    sched_clock_stable: 0          0         0
    (cold) sched_clock: 382634     235941    237019
    (cold) local_clock: 396890     297017    294819
    (warm) sched_clock: 38194      25233     25609
    (warm) local_clock: 143452     71234     71232
    (warm) rdtsc:       27345      24245     24243

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-eummbdechzz37mwmpags1gjr@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/amd.c        | 2 +-
 arch/x86/kernel/cpu/intel.c      | 2 +-
 arch/x86/kernel/cpu/perf_event.c | 2 +-
 arch/x86/kernel/tsc.c            | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bca023b..8bc79cd 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -487,7 +487,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
 		if (!check_tsc_unstable())
-			sched_clock_stable = 1;
+			set_sched_clock_stable();
 	}
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index ea04b34..1a439c0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -93,7 +93,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
 		if (!check_tsc_unstable())
-			sched_clock_stable = 1;
+			set_sched_clock_stable();
 	}
 
 	/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9f97bd0..b886451 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1890,7 +1890,7 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 	userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
 	userpg->pmc_width = x86_pmu.cntval_bits;
 
-	if (!sched_clock_stable)
+	if (!sched_clock_stable())
 		return;
 
 	data = cyc2ns_read_begin();
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 92b090b..53c1235 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -822,7 +822,7 @@ static unsigned long long cyc2ns_suspend;
 
 void tsc_save_sched_clock_state(void)
 {
-	if (!sched_clock_stable)
+	if (!sched_clock_stable())
 		return;
 
 	cyc2ns_suspend = sched_clock();
@@ -842,7 +842,7 @@ void tsc_restore_sched_clock_state(void)
 	unsigned long flags;
 	int cpu;
 
-	if (!sched_clock_stable)
+	if (!sched_clock_stable())
 		return;
 
 	local_irq_save(flags);
@@ -984,7 +984,7 @@ void mark_tsc_unstable(char *reason)
 {
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
-		sched_clock_stable = 0;
+		clear_sched_clock_stable();
 		disable_sched_clock_irqtime();
 		pr_info("Marking TSC unstable due to %s\n", reason);
 		/* Change only the rating, when not registered */
-- 
cgit v1.1


From 10b033d434c25a6c9e0f4f4dc2418af1b8236c63 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 28 Nov 2013 19:01:40 +0100
Subject: sched/clock, x86: Avoid a runtime condition in native_sched_clock()

Use a static_key to avoid touching tsc_disabled and a runtime
condition in native_sched_clock() -- less cachelines touched is always
better.

                        MAINLINE   PRE       POST

    sched_clock_stable: 1          1         1
    (cold) sched_clock: 329841     215295    213039
    (cold) local_clock: 301773     220773    216084
    (warm) sched_clock: 38375      25659     25231
    (warm) local_clock: 100371     27242     27601
    (warm) rdtsc:       27340      24208     24203
    sched_clock_stable: 0          0         0
    (cold) sched_clock: 382634     237019    240055
    (cold) local_clock: 396890     294819    299942
    (warm) sched_clock: 38194      25609     25276
    (warm) local_clock: 143452     71232     73232
    (warm) rdtsc:       27345      24243     24244

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-hrz87bo37qke25bty6pnfy4b@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/tsc.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 53c1235..6377fb2 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@
 #include <linux/clocksource.h>
 #include <linux/percpu.h>
 #include <linux/timex.h>
+#include <linux/static_key.h>
 
 #include <asm/hpet.h>
 #include <asm/timer.h>
@@ -37,6 +38,8 @@ static int __read_mostly tsc_unstable;
    erroneous rdtsc usage on !cpu_has_tsc processors */
 static int __read_mostly tsc_disabled = -1;
 
+static struct static_key __use_tsc = STATIC_KEY_INIT;
+
 int tsc_clocksource_reliable;
 
 /*
@@ -282,7 +285,7 @@ u64 native_sched_clock(void)
 	 *   very important for it to be as fast as the platform
 	 *   can achieve it. )
 	 */
-	if (unlikely(tsc_disabled)) {
+	if (!static_key_false(&__use_tsc)) {
 		/* No locking but a rare wrong value is not a big deal: */
 		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 	}
@@ -1193,7 +1196,9 @@ void __init tsc_init(void)
 		return;
 
 	/* now allow native_sched_clock() to use rdtsc */
+
 	tsc_disabled = 0;
+	static_key_slow_inc(&__use_tsc);
 
 	if (!no_sched_irq_time)
 		enable_sched_clock_irqtime();
-- 
cgit v1.1


From 8cb75e0c4ec9786b81439761eac1d18d4a931af3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 20 Nov 2013 12:22:37 +0100
Subject: sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding

With various drivers wanting to inject idle time; we get people
calling idle routines outside of the idle loop proper.

Therefore we need to be extra careful about not missing
TIF_NEED_RESCHED -> PREEMPT_NEED_RESCHED propagations.

While looking at this, I also realized there's a small window in the
existing idle loop where we can miss TIF_NEED_RESCHED; when it hits
right after the tif_need_resched() test at the end of the loop but
right before the need_resched() test at the start of the loop.

So move preempt_fold_need_resched() out of the loop where we're
guaranteed to have TIF_NEED_RESCHED set.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-x9jgh45oeayzajz2mjt0y7d6@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mwait.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 19b71c4..1da25a5 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -53,7 +53,7 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 		if (!need_resched())
 			__mwait(eax, ecx);
 	}
-	__current_clr_polling();
+	current_clr_polling();
 }
 
 #endif /* _ASM_X86_MWAIT_H */
-- 
cgit v1.1


From 5aa3d718f259007121b9366d36315fb8a2983d3d Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 4 Dec 2013 20:50:42 +0100
Subject: x86, ramdisk: Export relocated ramdisk VA

The ramdisk can possibly get relocated if the whole image is not mapped.
And since we're going over it in the microcode loader and fishing out
the relevant microcode patches, we want access it at its new location.
Thus, export it.

Signed-off-by: Borislav Petkov <bp@suse.de>
Tested-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
---
 arch/x86/include/asm/setup.h |  3 ++-
 arch/x86/kernel/setup.c      | 19 ++++++++++---------
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 59bcf4e..d62c9f8 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -3,7 +3,6 @@
 
 #include <uapi/asm/setup.h>
 
-
 #define COMMAND_LINE_SIZE 2048
 
 #include <linux/linkage.h>
@@ -29,6 +28,8 @@
 #include <asm/bootparam.h>
 #include <asm/x86_init.h>
 
+extern u64 relocated_ramdisk;
+
 /* Interrupt control for vSMPowered x86_64 systems */
 #ifdef CONFIG_X86_64
 void vsmp_init(void);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cb233bc..baefc6d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -295,6 +295,8 @@ static void __init reserve_brk(void)
 	_brk_start = 0;
 }
 
+u64 relocated_ramdisk;
+
 #ifdef CONFIG_BLK_DEV_INITRD
 
 static u64 __init get_ramdisk_image(void)
@@ -321,25 +323,24 @@ static void __init relocate_initrd(void)
 	u64 ramdisk_image = get_ramdisk_image();
 	u64 ramdisk_size  = get_ramdisk_size();
 	u64 area_size     = PAGE_ALIGN(ramdisk_size);
-	u64 ramdisk_here;
 	unsigned long slop, clen, mapaddr;
 	char *p, *q;
 
 	/* We need to move the initrd down into directly mapped mem */
-	ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
-						 area_size, PAGE_SIZE);
+	relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+						   area_size, PAGE_SIZE);
 
-	if (!ramdisk_here)
+	if (!relocated_ramdisk)
 		panic("Cannot find place for new RAMDISK of size %lld\n",
-			 ramdisk_size);
+		      ramdisk_size);
 
 	/* Note: this includes all the mem currently occupied by
 	   the initrd, we rely on that fact to keep the data intact. */
-	memblock_reserve(ramdisk_here, area_size);
-	initrd_start = ramdisk_here + PAGE_OFFSET;
+	memblock_reserve(relocated_ramdisk, area_size);
+	initrd_start = relocated_ramdisk + PAGE_OFFSET;
 	initrd_end   = initrd_start + ramdisk_size;
 	printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
-			 ramdisk_here, ramdisk_here + ramdisk_size - 1);
+	       relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
 
 	q = (char *)initrd_start;
 
@@ -363,7 +364,7 @@ static void __init relocate_initrd(void)
 	printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
 		" [mem %#010llx-%#010llx]\n",
 		ramdisk_image, ramdisk_image + ramdisk_size - 1,
-		ramdisk_here, ramdisk_here + ramdisk_size - 1);
+		relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
 }
 
 static void __init early_reserve_initrd(void)
-- 
cgit v1.1


From e1b43e3f13f7157249fb962ccf88b84eb0421fb4 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 4 Dec 2013 12:31:31 +0100
Subject: x86, microcode: Share native MSR accessing variants

We want to use those in AMD's early loading path too. Also, add a
native_wrmsrl variant.

Signed-off-by: Borislav Petkov <bp@suse.de>
Tested-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
---
 arch/x86/include/asm/microcode.h        | 15 +++++++++++++++
 arch/x86/kernel/microcode_intel_early.c | 10 ----------
 2 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index f98bd66..b59827e 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -1,6 +1,21 @@
 #ifndef _ASM_X86_MICROCODE_H
 #define _ASM_X86_MICROCODE_H
 
+#define native_rdmsr(msr, val1, val2)			\
+do {							\
+	u64 __val = native_read_msr((msr));		\
+	(void)((val1) = (u32)__val);			\
+	(void)((val2) = (u32)(__val >> 32));		\
+} while (0)
+
+#define native_wrmsr(msr, low, high)			\
+	native_write_msr(msr, low, high)
+
+#define native_wrmsrl(msr, val)				\
+	native_write_msr((msr),				\
+			 (u32)((u64)(val)),		\
+			 (u32)((u64)(val) >> 32))
+
 struct cpu_signature {
 	unsigned int sig;
 	unsigned int pf;
diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c
index 1575deb..18f7391 100644
--- a/arch/x86/kernel/microcode_intel_early.c
+++ b/arch/x86/kernel/microcode_intel_early.c
@@ -365,16 +365,6 @@ out:
 	return state;
 }
 
-#define native_rdmsr(msr, val1, val2)		\
-do {						\
-	u64 __val = native_read_msr((msr));	\
-	(void)((val1) = (u32)__val);		\
-	(void)((val2) = (u32)(__val >> 32));	\
-} while (0)
-
-#define native_wrmsr(msr, low, high)		\
-	native_write_msr(msr, low, high);
-
 static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 {
 	unsigned int val[2];
-- 
cgit v1.1


From 5335ba5cf475369f88db8e6835764efdcad8ab96 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Fri, 29 Nov 2013 14:58:44 +0100
Subject: x86, microcode, AMD: Fix early ucode loading

The original idea to use the microcode cache for the APs doesn't pan out
because we do memory allocation there very early and with IRQs disabled
and we don't want to involve GFP_ATOMIC allocations. Not if it can be
helped.

Thus, extend the caching of the BSP patch approach to the APs and
iterate over the ucode in the initrd instead of using the cache. We
still save the relevant patches to it but later, right before we
jettison the initrd.

While at it, fix early ucode loading on 32-bit too.

Signed-off-by: Borislav Petkov <bp@suse.de>
Tested-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
---
 arch/x86/include/asm/microcode_amd.h  |   7 +-
 arch/x86/kernel/microcode_amd.c       |  13 +-
 arch/x86/kernel/microcode_amd_early.c | 239 ++++++++++++++++++++++------------
 3 files changed, 170 insertions(+), 89 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index 4c01917..b7b10b8 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -61,11 +61,10 @@ extern int __apply_microcode_amd(struct microcode_amd *mc_amd);
 extern int apply_microcode_amd(int cpu);
 extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
 
+#define PATCH_MAX_SIZE PAGE_SIZE
+extern u8 amd_ucode_patch[PATCH_MAX_SIZE];
+
 #ifdef CONFIG_MICROCODE_AMD_EARLY
-#ifdef CONFIG_X86_32
-#define MPB_MAX_SIZE PAGE_SIZE
-extern u8 amd_bsp_mpb[MPB_MAX_SIZE];
-#endif
 extern void __init load_ucode_amd_bsp(void);
 extern void load_ucode_amd_ap(void);
 extern int __init save_microcode_in_initrd_amd(void);
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c3d4cc9..4a6ff74 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -182,10 +182,10 @@ int __apply_microcode_amd(struct microcode_amd *mc_amd)
 {
 	u32 rev, dummy;
 
-	wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
+	native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
 
 	/* verify patch application was successful */
-	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
 	if (rev != mc_amd->hdr.patch_id)
 		return -1;
 
@@ -332,6 +332,9 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
 	patch->patch_id  = mc_hdr->patch_id;
 	patch->equiv_cpu = proc_id;
 
+	pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+		 __func__, patch->patch_id, proc_id);
+
 	/* ... and add to cache. */
 	update_cache(patch);
 
@@ -390,9 +393,9 @@ enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
 	if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) {
 		struct ucode_patch *p = find_patch(smp_processor_id());
 		if (p) {
-			memset(amd_bsp_mpb, 0, MPB_MAX_SIZE);
-			memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data),
-							   MPB_MAX_SIZE));
+			memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
+			memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
+							       PATCH_MAX_SIZE));
 		}
 	}
 #endif
diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/microcode_amd_early.c
index 6073104..8384c0f 100644
--- a/arch/x86/kernel/microcode_amd_early.c
+++ b/arch/x86/kernel/microcode_amd_early.c
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 Advanced Micro Devices, Inc.
  *
  * Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -15,10 +16,18 @@
 #include <asm/setup.h>
 #include <asm/microcode_amd.h>
 
-static bool ucode_loaded;
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd before jettisoning its contents.
+ */
+static u8 *container;
+static size_t container_size;
+
 static u32 ucode_new_rev;
-static unsigned long ucode_offset;
-static size_t ucode_size;
+u8 amd_ucode_patch[PATCH_MAX_SIZE];
+static u16 this_equiv_id;
+
+struct cpio_data ucode_cpio;
 
 /*
  * Microcode patch container file is prepended to the initrd in cpio format.
@@ -32,9 +41,6 @@ static struct cpio_data __init find_ucode_in_initrd(void)
 	char *path;
 	void *start;
 	size_t size;
-	unsigned long *uoffset;
-	size_t *usize;
-	struct cpio_data cd;
 
 #ifdef CONFIG_X86_32
 	struct boot_params *p;
@@ -47,30 +53,50 @@ static struct cpio_data __init find_ucode_in_initrd(void)
 	path    = (char *)__pa_nodebug(ucode_path);
 	start   = (void *)p->hdr.ramdisk_image;
 	size    = p->hdr.ramdisk_size;
-	uoffset = (unsigned long *)__pa_nodebug(&ucode_offset);
-	usize   = (size_t *)__pa_nodebug(&ucode_size);
 #else
 	path    = ucode_path;
 	start   = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
 	size    = boot_params.hdr.ramdisk_size;
-	uoffset = &ucode_offset;
-	usize   = &ucode_size;
 #endif
 
-	cd = find_cpio_data(path, start, size, &offset);
-	if (!cd.data)
-		return cd;
+	return find_cpio_data(path, start, size, &offset);
+}
 
-	if (*(u32 *)cd.data != UCODE_MAGIC) {
-		cd.data = NULL;
-		cd.size = 0;
-		return cd;
-	}
+static size_t compute_container_size(u8 *data, u32 total_size)
+{
+	size_t size = 0;
+	u32 *header = (u32 *)data;
 
-	*uoffset = (u8 *)cd.data - (u8 *)start;
-	*usize   = cd.size;
+	if (header[0] != UCODE_MAGIC ||
+	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+	    header[2] == 0)                            /* size */
+		return size;
 
-	return cd;
+	size = header[2] + CONTAINER_HDR_SZ;
+	total_size -= size;
+	data += size;
+
+	while (total_size) {
+		u16 patch_size;
+
+		header = (u32 *)data;
+
+		if (header[0] != UCODE_UCODE_TYPE)
+			break;
+
+		/*
+		 * Sanity-check patch size.
+		 */
+		patch_size = header[1];
+		if (patch_size > PATCH_MAX_SIZE)
+			break;
+
+		size	   += patch_size + SECTION_HDR_SIZE;
+		data	   += patch_size + SECTION_HDR_SIZE;
+		total_size -= patch_size + SECTION_HDR_SIZE;
+	}
+
+	return size;
 }
 
 /*
@@ -85,23 +111,22 @@ static struct cpio_data __init find_ucode_in_initrd(void)
 static void apply_ucode_in_initrd(void *ucode, size_t size)
 {
 	struct equiv_cpu_entry *eq;
+	size_t *cont_sz;
 	u32 *header;
-	u8  *data;
+	u8  *data, **cont;
 	u16 eq_id = 0;
 	int offset, left;
-	u32 rev, eax;
+	u32 rev, eax, ebx, ecx, edx;
 	u32 *new_rev;
-	unsigned long *uoffset;
-	size_t *usize;
 
 #ifdef CONFIG_X86_32
 	new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
-	uoffset = (unsigned long *)__pa_nodebug(&ucode_offset);
-	usize   = (size_t *)__pa_nodebug(&ucode_size);
+	cont_sz = (size_t *)__pa_nodebug(&container_size);
+	cont	= (u8 **)__pa_nodebug(&container);
 #else
 	new_rev = &ucode_new_rev;
-	uoffset = &ucode_offset;
-	usize   = &ucode_size;
+	cont_sz = &container_size;
+	cont	= &container;
 #endif
 
 	data   = ucode;
@@ -109,23 +134,37 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
 	header = (u32 *)data;
 
 	/* find equiv cpu table */
-
-	if (header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+	if (header[0] != UCODE_MAGIC ||
+	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
 	    header[2] == 0)                            /* size */
 		return;
 
-	eax = cpuid_eax(0x00000001);
+	eax = 0x00000001;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
 
 	while (left > 0) {
 		eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
 
+		*cont = data;
+
+		/* Advance past the container header */
 		offset = header[2] + CONTAINER_HDR_SZ;
 		data  += offset;
 		left  -= offset;
 
 		eq_id = find_equiv_id(eq, eax);
-		if (eq_id)
+		if (eq_id) {
+			this_equiv_id = eq_id;
+			*cont_sz = compute_container_size(*cont, left + offset);
+
+			/*
+			 * truncate how much we need to iterate over in the
+			 * ucode update loop below
+			 */
+			left = *cont_sz - offset;
 			break;
+		}
 
 		/*
 		 * support multiple container files appended together. if this
@@ -145,19 +184,18 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
 
 		/* mark where the next microcode container file starts */
 		offset    = data - (u8 *)ucode;
-		*uoffset += offset;
-		*usize   -= offset;
 		ucode     = data;
 	}
 
 	if (!eq_id) {
-		*usize = 0;
+		*cont = NULL;
+		*cont_sz = 0;
 		return;
 	}
 
 	/* find ucode and update if needed */
 
-	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
 
 	while (left > 0) {
 		struct microcode_amd *mc;
@@ -168,73 +206,83 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
 			break;
 
 		mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
-		if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id)
-			if (__apply_microcode_amd(mc) == 0) {
+
+		if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
+
+			if (!__apply_microcode_amd(mc)) {
 				rev = mc->hdr.patch_id;
 				*new_rev = rev;
+
+				/* save ucode patch */
+				memcpy(amd_ucode_patch, mc,
+				       min_t(u32, header[1], PATCH_MAX_SIZE));
 			}
+		}
 
 		offset  = header[1] + SECTION_HDR_SIZE;
 		data   += offset;
 		left   -= offset;
 	}
-
-	/* mark where this microcode container file ends */
-	offset  = *usize - (data - (u8 *)ucode);
-	*usize -= offset;
-
-	if (!(*new_rev))
-		*usize = 0;
 }
 
 void __init load_ucode_amd_bsp(void)
 {
-	struct cpio_data cd = find_ucode_in_initrd();
-	if (!cd.data)
+	struct cpio_data cp;
+	void **data;
+	size_t *size;
+
+#ifdef CONFIG_X86_32
+	data =  (void **)__pa_nodebug(&ucode_cpio.data);
+	size = (size_t *)__pa_nodebug(&ucode_cpio.size);
+#else
+	data = &ucode_cpio.data;
+	size = &ucode_cpio.size;
+#endif
+
+	cp = find_ucode_in_initrd();
+	if (!cp.data)
 		return;
 
-	apply_ucode_in_initrd(cd.data, cd.size);
+	*data = cp.data;
+	*size = cp.size;
+
+	apply_ucode_in_initrd(cp.data, cp.size);
 }
 
 #ifdef CONFIG_X86_32
-u8 amd_bsp_mpb[MPB_MAX_SIZE];
-
 /*
  * On 32-bit, since AP's early load occurs before paging is turned on, we
  * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
  * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
- * save_microcode_in_initrd_amd() BSP's patch is copied to amd_bsp_mpb, which
- * is used upon resume from suspend.
+ * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
+ * which is used upon resume from suspend.
  */
 void load_ucode_amd_ap(void)
 {
 	struct microcode_amd *mc;
-	unsigned long *initrd;
-	unsigned long *uoffset;
 	size_t *usize;
-	void *ucode;
+	void **ucode;
 
-	mc = (struct microcode_amd *)__pa(amd_bsp_mpb);
+	mc = (struct microcode_amd *)__pa(amd_ucode_patch);
 	if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
 		__apply_microcode_amd(mc);
 		return;
 	}
 
-	initrd  = (unsigned long *)__pa(&initrd_start);
-	uoffset = (unsigned long *)__pa(&ucode_offset);
-	usize   = (size_t *)__pa(&ucode_size);
+	ucode = (void *)__pa_nodebug(&container);
+	usize = (size_t *)__pa_nodebug(&container_size);
 
-	if (!*usize || !*initrd)
+	if (!*ucode || !*usize)
 		return;
 
-	ucode = (void *)((unsigned long)__pa(*initrd) + *uoffset);
-	apply_ucode_in_initrd(ucode, *usize);
+	apply_ucode_in_initrd(*ucode, *usize);
 }
 
 static void __init collect_cpu_sig_on_bsp(void *arg)
 {
 	unsigned int cpu = smp_processor_id();
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
 	uci->cpu_sig.sig = cpuid_eax(0x00000001);
 }
 #else
@@ -242,36 +290,54 @@ void load_ucode_amd_ap(void)
 {
 	unsigned int cpu = smp_processor_id();
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct equiv_cpu_entry *eq;
+	struct microcode_amd *mc;
 	u32 rev, eax;
+	u16 eq_id;
+
+	/* Exit if called on the BSP. */
+	if (!cpu)
+		return;
+
+	if (!container)
+		return;
 
 	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
-	eax = cpuid_eax(0x00000001);
 
 	uci->cpu_sig.rev = rev;
 	uci->cpu_sig.sig = eax;
 
-	if (cpu && !ucode_loaded) {
-		void *ucode;
+	eax = cpuid_eax(0x00000001);
+	eq  = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
 
-		if (!ucode_size || !initrd_start)
-			return;
+	eq_id = find_equiv_id(eq, eax);
+	if (!eq_id)
+		return;
+
+	if (eq_id == this_equiv_id) {
+		mc = (struct microcode_amd *)amd_ucode_patch;
 
-		ucode = (void *)(initrd_start + ucode_offset);
-		eax   = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
-		if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK)
+		if (mc && rev < mc->hdr.patch_id) {
+			if (!__apply_microcode_amd(mc))
+				ucode_new_rev = mc->hdr.patch_id;
+		}
+
+	} else {
+		if (!ucode_cpio.data)
 			return;
 
-		ucode_loaded = true;
+		/*
+		 * AP has a different equivalence ID than BSP, looks like
+		 * mixed-steppings silicon so go through the ucode blob anew.
+		 */
+		apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size);
 	}
-
-	apply_microcode_amd(cpu);
 }
 #endif
 
 int __init save_microcode_in_initrd_amd(void)
 {
 	enum ucode_state ret;
-	void *ucode;
 	u32 eax;
 
 #ifdef CONFIG_X86_32
@@ -280,22 +346,35 @@ int __init save_microcode_in_initrd_amd(void)
 
 	if (!uci->cpu_sig.sig)
 		smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
+
+	/*
+	 * Take into account the fact that the ramdisk might get relocated
+	 * and therefore we need to recompute the container's position in
+	 * virtual memory space.
+	 */
+	container = (u8 *)(__va((u32)relocated_ramdisk) +
+			   ((u32)container - boot_params.hdr.ramdisk_image));
 #endif
 	if (ucode_new_rev)
 		pr_info("microcode: updated early to new patch_level=0x%08x\n",
 			ucode_new_rev);
 
-	if (ucode_loaded || !ucode_size || !initrd_start)
-		return 0;
+	if (!container)
+		return -EINVAL;
 
-	ucode = (void *)(initrd_start + ucode_offset);
 	eax   = cpuid_eax(0x00000001);
 	eax   = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
 
-	ret = load_microcode_amd(eax, ucode, ucode_size);
+	ret = load_microcode_amd(eax, container, container_size);
 	if (ret != UCODE_OK)
 		return -EINVAL;
 
-	ucode_loaded = true;
+	/*
+	 * This will be freed any msec now, stash patches for the current
+	 * family and switch to patch cache for cpu hotplug, etc later.
+	 */
+	container = NULL;
+	container_size = 0;
+
 	return 0;
 }
-- 
cgit v1.1


From bad5fa631fca5466401cd4a48e30cc1f1cb6101e Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sun, 1 Dec 2013 18:09:58 +0100
Subject: x86, microcode: Move to a proper location

We've grown a bunch of microcode loader files all prefixed with
"microcode_". They should be under cpu/ because this is strictly
CPU-related functionality so do that and drop the prefix since they're
in their own directory now which gives that prefix. :)

While at it, drop MICROCODE_INTEL_LIB config item and stash the
functionality under CONFIG_MICROCODE_INTEL as it was its only user.

Signed-off-by: Borislav Petkov <bp@suse.de>
Tested-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
---
 arch/x86/Kconfig                            |   4 -
 arch/x86/kernel/Makefile                    |   9 -
 arch/x86/kernel/cpu/Makefile                |   1 +
 arch/x86/kernel/cpu/microcode/Makefile      |   7 +
 arch/x86/kernel/cpu/microcode/amd.c         | 492 +++++++++++++++++
 arch/x86/kernel/cpu/microcode/amd_early.c   | 380 ++++++++++++++
 arch/x86/kernel/cpu/microcode/core.c        | 645 +++++++++++++++++++++++
 arch/x86/kernel/cpu/microcode/core_early.c  | 141 +++++
 arch/x86/kernel/cpu/microcode/intel.c       | 333 ++++++++++++
 arch/x86/kernel/cpu/microcode/intel_early.c | 787 ++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/microcode/intel_lib.c   | 174 ++++++
 arch/x86/kernel/microcode_amd.c             | 492 -----------------
 arch/x86/kernel/microcode_amd_early.c       | 380 --------------
 arch/x86/kernel/microcode_core.c            | 645 -----------------------
 arch/x86/kernel/microcode_core_early.c      | 141 -----
 arch/x86/kernel/microcode_intel.c           | 333 ------------
 arch/x86/kernel/microcode_intel_early.c     | 787 ----------------------------
 arch/x86/kernel/microcode_intel_lib.c       | 174 ------
 18 files changed, 2960 insertions(+), 2965 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/microcode/Makefile
 create mode 100644 arch/x86/kernel/cpu/microcode/amd.c
 create mode 100644 arch/x86/kernel/cpu/microcode/amd_early.c
 create mode 100644 arch/x86/kernel/cpu/microcode/core.c
 create mode 100644 arch/x86/kernel/cpu/microcode/core_early.c
 create mode 100644 arch/x86/kernel/cpu/microcode/intel.c
 create mode 100644 arch/x86/kernel/cpu/microcode/intel_early.c
 create mode 100644 arch/x86/kernel/cpu/microcode/intel_lib.c
 delete mode 100644 arch/x86/kernel/microcode_amd.c
 delete mode 100644 arch/x86/kernel/microcode_amd_early.c
 delete mode 100644 arch/x86/kernel/microcode_core.c
 delete mode 100644 arch/x86/kernel/microcode_core_early.c
 delete mode 100644 arch/x86/kernel/microcode_intel.c
 delete mode 100644 arch/x86/kernel/microcode_intel_early.c
 delete mode 100644 arch/x86/kernel/microcode_intel_lib.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0952ecd..01bfb9e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1080,10 +1080,6 @@ config MICROCODE_OLD_INTERFACE
 	def_bool y
 	depends on MICROCODE
 
-config MICROCODE_INTEL_LIB
-	def_bool y
-	depends on MICROCODE_INTEL
-
 config MICROCODE_INTEL_EARLY
 	def_bool n
 
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9b0a34e..4d0094d 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -91,15 +91,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
 
-obj-$(CONFIG_MICROCODE_EARLY)		+= microcode_core_early.o
-obj-$(CONFIG_MICROCODE_INTEL_EARLY)	+= microcode_intel_early.o
-obj-$(CONFIG_MICROCODE_INTEL_LIB)	+= microcode_intel_lib.o
-microcode-y				:= microcode_core.o
-microcode-$(CONFIG_MICROCODE_INTEL)	+= microcode_intel.o
-microcode-$(CONFIG_MICROCODE_AMD)	+= microcode_amd.o
-obj-$(CONFIG_MICROCODE_AMD_EARLY)	+= microcode_amd_early.o
-obj-$(CONFIG_MICROCODE)			+= microcode.o
-
 obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
 
 obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb.o
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 47b56a7..0710eef 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -42,6 +42,7 @@ endif
 
 obj-$(CONFIG_X86_MCE)			+= mcheck/
 obj-$(CONFIG_MTRR)			+= mtrr/
+obj-$(CONFIG_MICROCODE)			+= microcode/
 
 obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o perf_event_amd_ibs.o
 
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
new file mode 100644
index 0000000..285c854
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -0,0 +1,7 @@
+microcode-y				:= core.o
+obj-$(CONFIG_MICROCODE)			+= microcode.o
+microcode-$(CONFIG_MICROCODE_INTEL)	+= intel.o intel_lib.o
+microcode-$(CONFIG_MICROCODE_AMD)	+= amd.o
+obj-$(CONFIG_MICROCODE_EARLY)		+= core_early.o
+obj-$(CONFIG_MICROCODE_INTEL_EARLY)	+= intel_early.o
+obj-$(CONFIG_MICROCODE_AMD_EARLY)	+= amd_early.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
new file mode 100644
index 0000000..4a6ff74
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -0,0 +1,492 @@
+/*
+ *  AMD CPU Microcode Update Driver for Linux
+ *  Copyright (C) 2008-2011 Advanced Micro Devices Inc.
+ *
+ *  Author: Peter Oruba <peter.oruba@amd.com>
+ *
+ *  Based on work by:
+ *  Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *
+ *  Maintainers:
+ *  Andreas Herrmann <herrmann.der.user@googlemail.com>
+ *  Borislav Petkov <bp@alien8.de>
+ *
+ *  This driver allows to upgrade microcode on F10h AMD
+ *  CPUs and later.
+ *
+ *  Licensed under the terms of the GNU General Public
+ *  License version 2. See file COPYING for details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/firmware.h>
+#include <linux/pci_ids.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/microcode_amd.h>
+
+MODULE_DESCRIPTION("AMD Microcode Update Driver");
+MODULE_AUTHOR("Peter Oruba");
+MODULE_LICENSE("GPL v2");
+
+static struct equiv_cpu_entry *equiv_cpu_table;
+
+struct ucode_patch {
+	struct list_head plist;
+	void *data;
+	u32 patch_id;
+	u16 equiv_cpu;
+};
+
+static LIST_HEAD(pcache);
+
+static u16 __find_equiv_id(unsigned int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	return find_equiv_id(equiv_cpu_table, uci->cpu_sig.sig);
+}
+
+static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu)
+{
+	int i = 0;
+
+	BUG_ON(!equiv_cpu_table);
+
+	while (equiv_cpu_table[i].equiv_cpu != 0) {
+		if (equiv_cpu == equiv_cpu_table[i].equiv_cpu)
+			return equiv_cpu_table[i].installed_cpu;
+		i++;
+	}
+	return 0;
+}
+
+/*
+ * a small, trivial cache of per-family ucode patches
+ */
+static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
+{
+	struct ucode_patch *p;
+
+	list_for_each_entry(p, &pcache, plist)
+		if (p->equiv_cpu == equiv_cpu)
+			return p;
+	return NULL;
+}
+
+static void update_cache(struct ucode_patch *new_patch)
+{
+	struct ucode_patch *p;
+
+	list_for_each_entry(p, &pcache, plist) {
+		if (p->equiv_cpu == new_patch->equiv_cpu) {
+			if (p->patch_id >= new_patch->patch_id)
+				/* we already have the latest patch */
+				return;
+
+			list_replace(&p->plist, &new_patch->plist);
+			kfree(p->data);
+			kfree(p);
+			return;
+		}
+	}
+	/* no patch found, add it */
+	list_add_tail(&new_patch->plist, &pcache);
+}
+
+static void free_cache(void)
+{
+	struct ucode_patch *p, *tmp;
+
+	list_for_each_entry_safe(p, tmp, &pcache, plist) {
+		__list_del(p->plist.prev, p->plist.next);
+		kfree(p->data);
+		kfree(p);
+	}
+}
+
+static struct ucode_patch *find_patch(unsigned int cpu)
+{
+	u16 equiv_id;
+
+	equiv_id = __find_equiv_id(cpu);
+	if (!equiv_id)
+		return NULL;
+
+	return cache_find_patch(equiv_id);
+}
+
+static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct ucode_patch *p;
+
+	csig->sig = cpuid_eax(0x00000001);
+	csig->rev = c->microcode;
+
+	/*
+	 * a patch could have been loaded early, set uci->mc so that
+	 * mc_bp_resume() can call apply_microcode()
+	 */
+	p = find_patch(cpu);
+	if (p && (p->patch_id == csig->rev))
+		uci->mc = p->data;
+
+	pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
+
+	return 0;
+}
+
+static unsigned int verify_patch_size(u8 family, u32 patch_size,
+				      unsigned int size)
+{
+	u32 max_size;
+
+#define F1XH_MPB_MAX_SIZE 2048
+#define F14H_MPB_MAX_SIZE 1824
+#define F15H_MPB_MAX_SIZE 4096
+#define F16H_MPB_MAX_SIZE 3458
+
+	switch (family) {
+	case 0x14:
+		max_size = F14H_MPB_MAX_SIZE;
+		break;
+	case 0x15:
+		max_size = F15H_MPB_MAX_SIZE;
+		break;
+	case 0x16:
+		max_size = F16H_MPB_MAX_SIZE;
+		break;
+	default:
+		max_size = F1XH_MPB_MAX_SIZE;
+		break;
+	}
+
+	if (patch_size > min_t(u32, size, max_size)) {
+		pr_err("patch size mismatch\n");
+		return 0;
+	}
+
+	return patch_size;
+}
+
+int __apply_microcode_amd(struct microcode_amd *mc_amd)
+{
+	u32 rev, dummy;
+
+	native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
+
+	/* verify patch application was successful */
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+	if (rev != mc_amd->hdr.patch_id)
+		return -1;
+
+	return 0;
+}
+
+int apply_microcode_amd(int cpu)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	struct microcode_amd *mc_amd;
+	struct ucode_cpu_info *uci;
+	struct ucode_patch *p;
+	u32 rev, dummy;
+
+	BUG_ON(raw_smp_processor_id() != cpu);
+
+	uci = ucode_cpu_info + cpu;
+
+	p = find_patch(cpu);
+	if (!p)
+		return 0;
+
+	mc_amd  = p->data;
+	uci->mc = p->data;
+
+	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+	/* need to apply patch? */
+	if (rev >= mc_amd->hdr.patch_id) {
+		c->microcode = rev;
+		uci->cpu_sig.rev = rev;
+		return 0;
+	}
+
+	if (__apply_microcode_amd(mc_amd)) {
+		pr_err("CPU%d: update failed for patch_level=0x%08x\n",
+			cpu, mc_amd->hdr.patch_id);
+		return -1;
+	}
+	pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
+		mc_amd->hdr.patch_id);
+
+	uci->cpu_sig.rev = mc_amd->hdr.patch_id;
+	c->microcode = mc_amd->hdr.patch_id;
+
+	return 0;
+}
+
+static int install_equiv_cpu_table(const u8 *buf)
+{
+	unsigned int *ibuf = (unsigned int *)buf;
+	unsigned int type = ibuf[1];
+	unsigned int size = ibuf[2];
+
+	if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
+		pr_err("empty section/"
+		       "invalid type field in container file section header\n");
+		return -EINVAL;
+	}
+
+	equiv_cpu_table = vmalloc(size);
+	if (!equiv_cpu_table) {
+		pr_err("failed to allocate equivalent CPU table\n");
+		return -ENOMEM;
+	}
+
+	memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
+
+	/* add header length */
+	return size + CONTAINER_HDR_SZ;
+}
+
+static void free_equiv_cpu_table(void)
+{
+	vfree(equiv_cpu_table);
+	equiv_cpu_table = NULL;
+}
+
+static void cleanup(void)
+{
+	free_equiv_cpu_table();
+	free_cache();
+}
+
+/*
+ * We return the current size even if some of the checks failed so that
+ * we can skip over the next patch. If we return a negative value, we
+ * signal a grave error like a memory allocation has failed and the
+ * driver cannot continue functioning normally. In such cases, we tear
+ * down everything we've used up so far and exit.
+ */
+static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
+{
+	struct microcode_header_amd *mc_hdr;
+	struct ucode_patch *patch;
+	unsigned int patch_size, crnt_size, ret;
+	u32 proc_fam;
+	u16 proc_id;
+
+	patch_size  = *(u32 *)(fw + 4);
+	crnt_size   = patch_size + SECTION_HDR_SIZE;
+	mc_hdr	    = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
+	proc_id	    = mc_hdr->processor_rev_id;
+
+	proc_fam = find_cpu_family_by_equiv_cpu(proc_id);
+	if (!proc_fam) {
+		pr_err("No patch family for equiv ID: 0x%04x\n", proc_id);
+		return crnt_size;
+	}
+
+	/* check if patch is for the current family */
+	proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
+	if (proc_fam != family)
+		return crnt_size;
+
+	if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
+		pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n",
+			mc_hdr->patch_id);
+		return crnt_size;
+	}
+
+	ret = verify_patch_size(family, patch_size, leftover);
+	if (!ret) {
+		pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
+		return crnt_size;
+	}
+
+	patch = kzalloc(sizeof(*patch), GFP_KERNEL);
+	if (!patch) {
+		pr_err("Patch allocation failure.\n");
+		return -EINVAL;
+	}
+
+	patch->data = kzalloc(patch_size, GFP_KERNEL);
+	if (!patch->data) {
+		pr_err("Patch data allocation failure.\n");
+		kfree(patch);
+		return -EINVAL;
+	}
+
+	/* All looks ok, copy patch... */
+	memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size);
+	INIT_LIST_HEAD(&patch->plist);
+	patch->patch_id  = mc_hdr->patch_id;
+	patch->equiv_cpu = proc_id;
+
+	pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+		 __func__, patch->patch_id, proc_id);
+
+	/* ... and add to cache. */
+	update_cache(patch);
+
+	return crnt_size;
+}
+
+static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
+					     size_t size)
+{
+	enum ucode_state ret = UCODE_ERROR;
+	unsigned int leftover;
+	u8 *fw = (u8 *)data;
+	int crnt_size = 0;
+	int offset;
+
+	offset = install_equiv_cpu_table(data);
+	if (offset < 0) {
+		pr_err("failed to create equivalent cpu table\n");
+		return ret;
+	}
+	fw += offset;
+	leftover = size - offset;
+
+	if (*(u32 *)fw != UCODE_UCODE_TYPE) {
+		pr_err("invalid type field in container file section header\n");
+		free_equiv_cpu_table();
+		return ret;
+	}
+
+	while (leftover) {
+		crnt_size = verify_and_add_patch(family, fw, leftover);
+		if (crnt_size < 0)
+			return ret;
+
+		fw	 += crnt_size;
+		leftover -= crnt_size;
+	}
+
+	return UCODE_OK;
+}
+
+enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+	enum ucode_state ret;
+
+	/* free old equiv table */
+	free_equiv_cpu_table();
+
+	ret = __load_microcode_amd(family, data, size);
+
+	if (ret != UCODE_OK)
+		cleanup();
+
+#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32)
+	/* save BSP's matching patch for early load */
+	if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) {
+		struct ucode_patch *p = find_patch(smp_processor_id());
+		if (p) {
+			memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
+			memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
+							       PATCH_MAX_SIZE));
+		}
+	}
+#endif
+	return ret;
+}
+
+/*
+ * AMD microcode firmware naming convention, up to family 15h they are in
+ * the legacy file:
+ *
+ *    amd-ucode/microcode_amd.bin
+ *
+ * This legacy file is always smaller than 2K in size.
+ *
+ * Beginning with family 15h, they are in family-specific firmware files:
+ *
+ *    amd-ucode/microcode_amd_fam15h.bin
+ *    amd-ucode/microcode_amd_fam16h.bin
+ *    ...
+ *
+ * These might be larger than 2K.
+ */
+static enum ucode_state request_microcode_amd(int cpu, struct device *device,
+					      bool refresh_fw)
+{
+	char fw_name[36] = "amd-ucode/microcode_amd.bin";
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	enum ucode_state ret = UCODE_NFOUND;
+	const struct firmware *fw;
+
+	/* reload ucode container only on the boot cpu */
+	if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
+		return UCODE_OK;
+
+	if (c->x86 >= 0x15)
+		snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
+
+	if (request_firmware(&fw, (const char *)fw_name, device)) {
+		pr_debug("failed to load file %s\n", fw_name);
+		goto out;
+	}
+
+	ret = UCODE_ERROR;
+	if (*(u32 *)fw->data != UCODE_MAGIC) {
+		pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
+		goto fw_release;
+	}
+
+	ret = load_microcode_amd(c->x86, fw->data, fw->size);
+
+ fw_release:
+	release_firmware(fw);
+
+ out:
+	return ret;
+}
+
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
+{
+	return UCODE_ERROR;
+}
+
+static void microcode_fini_cpu_amd(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	uci->mc = NULL;
+}
+
+static struct microcode_ops microcode_amd_ops = {
+	.request_microcode_user           = request_microcode_user,
+	.request_microcode_fw             = request_microcode_amd,
+	.collect_cpu_info                 = collect_cpu_info_amd,
+	.apply_microcode                  = apply_microcode_amd,
+	.microcode_fini_cpu               = microcode_fini_cpu_amd,
+};
+
+struct microcode_ops * __init init_amd_microcode(void)
+{
+	struct cpuinfo_x86 *c = &cpu_data(0);
+
+	if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
+		pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
+		return NULL;
+	}
+
+	return &microcode_amd_ops;
+}
+
+void __exit exit_amd_microcode(void)
+{
+	cleanup();
+}
diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
new file mode 100644
index 0000000..8384c0f
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+
+#include <asm/cpu.h>
+#include <asm/setup.h>
+#include <asm/microcode_amd.h>
+
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd before jettisoning its contents.
+ */
+static u8 *container;
+static size_t container_size;
+
+static u32 ucode_new_rev;
+u8 amd_ucode_patch[PATCH_MAX_SIZE];
+static u16 this_equiv_id;
+
+struct cpio_data ucode_cpio;
+
+/*
+ * Microcode patch container file is prepended to the initrd in cpio format.
+ * See Documentation/x86/early-microcode.txt
+ */
+static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
+
+static struct cpio_data __init find_ucode_in_initrd(void)
+{
+	long offset = 0;
+	char *path;
+	void *start;
+	size_t size;
+
+#ifdef CONFIG_X86_32
+	struct boot_params *p;
+
+	/*
+	 * On 32-bit, early load occurs before paging is turned on so we need
+	 * to use physical addresses.
+	 */
+	p       = (struct boot_params *)__pa_nodebug(&boot_params);
+	path    = (char *)__pa_nodebug(ucode_path);
+	start   = (void *)p->hdr.ramdisk_image;
+	size    = p->hdr.ramdisk_size;
+#else
+	path    = ucode_path;
+	start   = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
+	size    = boot_params.hdr.ramdisk_size;
+#endif
+
+	return find_cpio_data(path, start, size, &offset);
+}
+
+static size_t compute_container_size(u8 *data, u32 total_size)
+{
+	size_t size = 0;
+	u32 *header = (u32 *)data;
+
+	if (header[0] != UCODE_MAGIC ||
+	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+	    header[2] == 0)                            /* size */
+		return size;
+
+	size = header[2] + CONTAINER_HDR_SZ;
+	total_size -= size;
+	data += size;
+
+	while (total_size) {
+		u16 patch_size;
+
+		header = (u32 *)data;
+
+		if (header[0] != UCODE_UCODE_TYPE)
+			break;
+
+		/*
+		 * Sanity-check patch size.
+		 */
+		patch_size = header[1];
+		if (patch_size > PATCH_MAX_SIZE)
+			break;
+
+		size	   += patch_size + SECTION_HDR_SIZE;
+		data	   += patch_size + SECTION_HDR_SIZE;
+		total_size -= patch_size + SECTION_HDR_SIZE;
+	}
+
+	return size;
+}
+
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ */
+static void apply_ucode_in_initrd(void *ucode, size_t size)
+{
+	struct equiv_cpu_entry *eq;
+	size_t *cont_sz;
+	u32 *header;
+	u8  *data, **cont;
+	u16 eq_id = 0;
+	int offset, left;
+	u32 rev, eax, ebx, ecx, edx;
+	u32 *new_rev;
+
+#ifdef CONFIG_X86_32
+	new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
+	cont_sz = (size_t *)__pa_nodebug(&container_size);
+	cont	= (u8 **)__pa_nodebug(&container);
+#else
+	new_rev = &ucode_new_rev;
+	cont_sz = &container_size;
+	cont	= &container;
+#endif
+
+	data   = ucode;
+	left   = size;
+	header = (u32 *)data;
+
+	/* find equiv cpu table */
+	if (header[0] != UCODE_MAGIC ||
+	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+	    header[2] == 0)                            /* size */
+		return;
+
+	eax = 0x00000001;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+
+	while (left > 0) {
+		eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
+
+		*cont = data;
+
+		/* Advance past the container header */
+		offset = header[2] + CONTAINER_HDR_SZ;
+		data  += offset;
+		left  -= offset;
+
+		eq_id = find_equiv_id(eq, eax);
+		if (eq_id) {
+			this_equiv_id = eq_id;
+			*cont_sz = compute_container_size(*cont, left + offset);
+
+			/*
+			 * truncate how much we need to iterate over in the
+			 * ucode update loop below
+			 */
+			left = *cont_sz - offset;
+			break;
+		}
+
+		/*
+		 * support multiple container files appended together. if this
+		 * one does not have a matching equivalent cpu entry, we fast
+		 * forward to the next container file.
+		 */
+		while (left > 0) {
+			header = (u32 *)data;
+			if (header[0] == UCODE_MAGIC &&
+			    header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
+				break;
+
+			offset = header[1] + SECTION_HDR_SIZE;
+			data  += offset;
+			left  -= offset;
+		}
+
+		/* mark where the next microcode container file starts */
+		offset    = data - (u8 *)ucode;
+		ucode     = data;
+	}
+
+	if (!eq_id) {
+		*cont = NULL;
+		*cont_sz = 0;
+		return;
+	}
+
+	/* find ucode and update if needed */
+
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
+
+	while (left > 0) {
+		struct microcode_amd *mc;
+
+		header = (u32 *)data;
+		if (header[0] != UCODE_UCODE_TYPE || /* type */
+		    header[1] == 0)                  /* size */
+			break;
+
+		mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
+
+		if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
+
+			if (!__apply_microcode_amd(mc)) {
+				rev = mc->hdr.patch_id;
+				*new_rev = rev;
+
+				/* save ucode patch */
+				memcpy(amd_ucode_patch, mc,
+				       min_t(u32, header[1], PATCH_MAX_SIZE));
+			}
+		}
+
+		offset  = header[1] + SECTION_HDR_SIZE;
+		data   += offset;
+		left   -= offset;
+	}
+}
+
+void __init load_ucode_amd_bsp(void)
+{
+	struct cpio_data cp;
+	void **data;
+	size_t *size;
+
+#ifdef CONFIG_X86_32
+	data =  (void **)__pa_nodebug(&ucode_cpio.data);
+	size = (size_t *)__pa_nodebug(&ucode_cpio.size);
+#else
+	data = &ucode_cpio.data;
+	size = &ucode_cpio.size;
+#endif
+
+	cp = find_ucode_in_initrd();
+	if (!cp.data)
+		return;
+
+	*data = cp.data;
+	*size = cp.size;
+
+	apply_ucode_in_initrd(cp.data, cp.size);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * On 32-bit, since AP's early load occurs before paging is turned on, we
+ * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
+ * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
+ * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
+ * which is used upon resume from suspend.
+ */
+void load_ucode_amd_ap(void)
+{
+	struct microcode_amd *mc;
+	size_t *usize;
+	void **ucode;
+
+	mc = (struct microcode_amd *)__pa(amd_ucode_patch);
+	if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
+		__apply_microcode_amd(mc);
+		return;
+	}
+
+	ucode = (void *)__pa_nodebug(&container);
+	usize = (size_t *)__pa_nodebug(&container_size);
+
+	if (!*ucode || !*usize)
+		return;
+
+	apply_ucode_in_initrd(*ucode, *usize);
+}
+
+static void __init collect_cpu_sig_on_bsp(void *arg)
+{
+	unsigned int cpu = smp_processor_id();
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	uci->cpu_sig.sig = cpuid_eax(0x00000001);
+}
+#else
+void load_ucode_amd_ap(void)
+{
+	unsigned int cpu = smp_processor_id();
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct equiv_cpu_entry *eq;
+	struct microcode_amd *mc;
+	u32 rev, eax;
+	u16 eq_id;
+
+	/* Exit if called on the BSP. */
+	if (!cpu)
+		return;
+
+	if (!container)
+		return;
+
+	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
+
+	uci->cpu_sig.rev = rev;
+	uci->cpu_sig.sig = eax;
+
+	eax = cpuid_eax(0x00000001);
+	eq  = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
+
+	eq_id = find_equiv_id(eq, eax);
+	if (!eq_id)
+		return;
+
+	if (eq_id == this_equiv_id) {
+		mc = (struct microcode_amd *)amd_ucode_patch;
+
+		if (mc && rev < mc->hdr.patch_id) {
+			if (!__apply_microcode_amd(mc))
+				ucode_new_rev = mc->hdr.patch_id;
+		}
+
+	} else {
+		if (!ucode_cpio.data)
+			return;
+
+		/*
+		 * AP has a different equivalence ID than BSP, looks like
+		 * mixed-steppings silicon so go through the ucode blob anew.
+		 */
+		apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size);
+	}
+}
+#endif
+
+int __init save_microcode_in_initrd_amd(void)
+{
+	enum ucode_state ret;
+	u32 eax;
+
+#ifdef CONFIG_X86_32
+	unsigned int bsp = boot_cpu_data.cpu_index;
+	struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
+
+	if (!uci->cpu_sig.sig)
+		smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
+
+	/*
+	 * Take into account the fact that the ramdisk might get relocated
+	 * and therefore we need to recompute the container's position in
+	 * virtual memory space.
+	 */
+	container = (u8 *)(__va((u32)relocated_ramdisk) +
+			   ((u32)container - boot_params.hdr.ramdisk_image));
+#endif
+	if (ucode_new_rev)
+		pr_info("microcode: updated early to new patch_level=0x%08x\n",
+			ucode_new_rev);
+
+	if (!container)
+		return -EINVAL;
+
+	eax   = cpuid_eax(0x00000001);
+	eax   = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+
+	ret = load_microcode_amd(eax, container, container_size);
+	if (ret != UCODE_OK)
+		return -EINVAL;
+
+	/*
+	 * This will be freed any msec now, stash patches for the current
+	 * family and switch to patch cache for cpu hotplug, etc later.
+	 */
+	container = NULL;
+	container_size = 0;
+
+	return 0;
+}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
new file mode 100644
index 0000000..15c9876
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -0,0 +1,645 @@
+/*
+ *	Intel CPU Microcode Update Driver for Linux
+ *
+ *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *		      2006	Shaohua Li <shaohua.li@intel.com>
+ *
+ *	This driver allows to upgrade microcode on Intel processors
+ *	belonging to IA-32 family - PentiumPro, Pentium II,
+ *	Pentium III, Xeon, Pentium 4, etc.
+ *
+ *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
+ *	Software Developer's Manual
+ *	Order Number 253668 or free download from:
+ *
+ *	http://developer.intel.com/Assets/PDF/manual/253668.pdf	
+ *
+ *	For more information, go to http://www.urbanmyth.org/microcode
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	1.0	16 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Initial release.
+ *	1.01	18 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Added read() support + cleanups.
+ *	1.02	21 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Added 'device trimming' support. open(O_WRONLY) zeroes
+ *		and frees the saved copy of applied microcode.
+ *	1.03	29 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Made to use devfs (/dev/cpu/microcode) + cleanups.
+ *	1.04	06 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *		Added misc device support (now uses both devfs and misc).
+ *		Added MICROCODE_IOCFREE ioctl to clear memory.
+ *	1.05	09 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *		Messages for error cases (non Intel & no suitable microcode).
+ *	1.06	03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
+ *		Removed ->release(). Removed exclusive open and status bitmap.
+ *		Added microcode_rwsem to serialize read()/write()/ioctl().
+ *		Removed global kernel lock usage.
+ *	1.07	07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
+ *		Write 0 to 0x8B msr and then cpuid before reading revision,
+ *		so that it works even if there were no update done by the
+ *		BIOS. Otherwise, reading from 0x8B gives junk (which happened
+ *		to be 0 on my machine which is why it worked even when I
+ *		disabled update by the BIOS)
+ *		Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
+ *	1.08	11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
+ *			     Tigran Aivazian <tigran@veritas.com>
+ *		Intel Pentium 4 processor support and bugfixes.
+ *	1.09	30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
+ *		Bugfix for HT (Hyper-Threading) enabled processors
+ *		whereby processor resources are shared by all logical processors
+ *		in a single CPU package.
+ *	1.10	28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
+ *		Tigran Aivazian <tigran@veritas.com>,
+ *		Serialize updates as required on HT processors due to
+ *		speculative nature of implementation.
+ *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
+ *		Fix the panic when writing zero-length microcode chunk.
+ *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
+ *		Jun Nakajima <jun.nakajima@intel.com>
+ *		Support for the microcode updates in the new format.
+ *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
+ *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
+ *		because we no longer hold a copy of applied microcode
+ *		in kernel memory.
+ *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
+ *		Fix sigmatch() macro to handle old CPUs with pf == 0.
+ *		Thanks to Stuart Swales for pointing out this bug.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/platform_device.h>
+#include <linux/miscdevice.h>
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/cpu_device_id.h>
+#include <asm/perf_event.h>
+
+MODULE_DESCRIPTION("Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_LICENSE("GPL");
+
+#define MICROCODE_VERSION	"2.00"
+
+static struct microcode_ops	*microcode_ops;
+
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ *   the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
+static DEFINE_MUTEX(microcode_mutex);
+
+struct ucode_cpu_info		ucode_cpu_info[NR_CPUS];
+EXPORT_SYMBOL_GPL(ucode_cpu_info);
+
+/*
+ * Operations that are run on a target cpu:
+ */
+
+struct cpu_info_ctx {
+	struct cpu_signature	*cpu_sig;
+	int			err;
+};
+
+static void collect_cpu_info_local(void *arg)
+{
+	struct cpu_info_ctx *ctx = arg;
+
+	ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
+						   ctx->cpu_sig);
+}
+
+static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
+{
+	struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
+	int ret;
+
+	ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
+	if (!ret)
+		ret = ctx.err;
+
+	return ret;
+}
+
+static int collect_cpu_info(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	int ret;
+
+	memset(uci, 0, sizeof(*uci));
+
+	ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
+	if (!ret)
+		uci->valid = 1;
+
+	return ret;
+}
+
+struct apply_microcode_ctx {
+	int err;
+};
+
+static void apply_microcode_local(void *arg)
+{
+	struct apply_microcode_ctx *ctx = arg;
+
+	ctx->err = microcode_ops->apply_microcode(smp_processor_id());
+}
+
+static int apply_microcode_on_target(int cpu)
+{
+	struct apply_microcode_ctx ctx = { .err = 0 };
+	int ret;
+
+	ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
+	if (!ret)
+		ret = ctx.err;
+
+	return ret;
+}
+
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+static int do_microcode_update(const void __user *buf, size_t size)
+{
+	int error = 0;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+		enum ucode_state ustate;
+
+		if (!uci->valid)
+			continue;
+
+		ustate = microcode_ops->request_microcode_user(cpu, buf, size);
+		if (ustate == UCODE_ERROR) {
+			error = -1;
+			break;
+		} else if (ustate == UCODE_OK)
+			apply_microcode_on_target(cpu);
+	}
+
+	return error;
+}
+
+static int microcode_open(struct inode *inode, struct file *file)
+{
+	return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
+}
+
+static ssize_t microcode_write(struct file *file, const char __user *buf,
+			       size_t len, loff_t *ppos)
+{
+	ssize_t ret = -EINVAL;
+
+	if ((len >> PAGE_SHIFT) > totalram_pages) {
+		pr_err("too much data (max %ld pages)\n", totalram_pages);
+		return ret;
+	}
+
+	get_online_cpus();
+	mutex_lock(&microcode_mutex);
+
+	if (do_microcode_update(buf, len) == 0)
+		ret = (ssize_t)len;
+
+	if (ret > 0)
+		perf_check_microcode();
+
+	mutex_unlock(&microcode_mutex);
+	put_online_cpus();
+
+	return ret;
+}
+
+static const struct file_operations microcode_fops = {
+	.owner			= THIS_MODULE,
+	.write			= microcode_write,
+	.open			= microcode_open,
+	.llseek		= no_llseek,
+};
+
+static struct miscdevice microcode_dev = {
+	.minor			= MICROCODE_MINOR,
+	.name			= "microcode",
+	.nodename		= "cpu/microcode",
+	.fops			= &microcode_fops,
+};
+
+static int __init microcode_dev_init(void)
+{
+	int error;
+
+	error = misc_register(&microcode_dev);
+	if (error) {
+		pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
+		return error;
+	}
+
+	return 0;
+}
+
+static void __exit microcode_dev_exit(void)
+{
+	misc_deregister(&microcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+MODULE_ALIAS("devname:cpu/microcode");
+#else
+#define microcode_dev_init()	0
+#define microcode_dev_exit()	do { } while (0)
+#endif
+
+/* fake device for request_firmware */
+static struct platform_device	*microcode_pdev;
+
+static int reload_for_cpu(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	enum ucode_state ustate;
+	int err = 0;
+
+	if (!uci->valid)
+		return err;
+
+	ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true);
+	if (ustate == UCODE_OK)
+		apply_microcode_on_target(cpu);
+	else
+		if (ustate == UCODE_ERROR)
+			err = -EINVAL;
+	return err;
+}
+
+static ssize_t reload_store(struct device *dev,
+			    struct device_attribute *attr,
+			    const char *buf, size_t size)
+{
+	unsigned long val;
+	int cpu;
+	ssize_t ret = 0, tmp_ret;
+
+	ret = kstrtoul(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val != 1)
+		return size;
+
+	get_online_cpus();
+	mutex_lock(&microcode_mutex);
+	for_each_online_cpu(cpu) {
+		tmp_ret = reload_for_cpu(cpu);
+		if (tmp_ret != 0)
+			pr_warn("Error reloading microcode on CPU %d\n", cpu);
+
+		/* save retval of the first encountered reload error */
+		if (!ret)
+			ret = tmp_ret;
+	}
+	if (!ret)
+		perf_check_microcode();
+	mutex_unlock(&microcode_mutex);
+	put_online_cpus();
+
+	if (!ret)
+		ret = size;
+
+	return ret;
+}
+
+static ssize_t version_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+	return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
+}
+
+static ssize_t pf_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+	return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
+}
+
+static DEVICE_ATTR(reload, 0200, NULL, reload_store);
+static DEVICE_ATTR(version, 0400, version_show, NULL);
+static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
+
+static struct attribute *mc_default_attrs[] = {
+	&dev_attr_version.attr,
+	&dev_attr_processor_flags.attr,
+	NULL
+};
+
+static struct attribute_group mc_attr_group = {
+	.attrs			= mc_default_attrs,
+	.name			= "microcode",
+};
+
+static void microcode_fini_cpu(int cpu)
+{
+	microcode_ops->microcode_fini_cpu(cpu);
+}
+
+static enum ucode_state microcode_resume_cpu(int cpu)
+{
+	pr_debug("CPU%d updated upon resume\n", cpu);
+
+	if (apply_microcode_on_target(cpu))
+		return UCODE_ERROR;
+
+	return UCODE_OK;
+}
+
+static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
+{
+	enum ucode_state ustate;
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	if (uci && uci->valid)
+		return UCODE_OK;
+
+	if (collect_cpu_info(cpu))
+		return UCODE_ERROR;
+
+	/* --dimm. Trigger a delayed update? */
+	if (system_state != SYSTEM_RUNNING)
+		return UCODE_NFOUND;
+
+	ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev,
+						     refresh_fw);
+
+	if (ustate == UCODE_OK) {
+		pr_debug("CPU%d updated upon init\n", cpu);
+		apply_microcode_on_target(cpu);
+	}
+
+	return ustate;
+}
+
+static enum ucode_state microcode_update_cpu(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	if (uci->valid)
+		return microcode_resume_cpu(cpu);
+
+	return microcode_init_cpu(cpu, false);
+}
+
+static int mc_device_add(struct device *dev, struct subsys_interface *sif)
+{
+	int err, cpu = dev->id;
+
+	if (!cpu_online(cpu))
+		return 0;
+
+	pr_debug("CPU%d added\n", cpu);
+
+	err = sysfs_create_group(&dev->kobj, &mc_attr_group);
+	if (err)
+		return err;
+
+	if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
+		return -EINVAL;
+
+	return err;
+}
+
+static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
+{
+	int cpu = dev->id;
+
+	if (!cpu_online(cpu))
+		return 0;
+
+	pr_debug("CPU%d removed\n", cpu);
+	microcode_fini_cpu(cpu);
+	sysfs_remove_group(&dev->kobj, &mc_attr_group);
+	return 0;
+}
+
+static struct subsys_interface mc_cpu_interface = {
+	.name			= "microcode",
+	.subsys			= &cpu_subsys,
+	.add_dev		= mc_device_add,
+	.remove_dev		= mc_device_remove,
+};
+
+/**
+ * mc_bp_resume - Update boot CPU microcode during resume.
+ */
+static void mc_bp_resume(void)
+{
+	int cpu = smp_processor_id();
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	if (uci->valid && uci->mc)
+		microcode_ops->apply_microcode(cpu);
+}
+
+static struct syscore_ops mc_syscore_ops = {
+	.resume			= mc_bp_resume,
+};
+
+static int
+mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	struct device *dev;
+
+	dev = get_cpu_device(cpu);
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+		microcode_update_cpu(cpu);
+		pr_debug("CPU%d added\n", cpu);
+		/*
+		 * "break" is missing on purpose here because we want to fall
+		 * through in order to create the sysfs group.
+		 */
+
+	case CPU_DOWN_FAILED:
+		if (sysfs_create_group(&dev->kobj, &mc_attr_group))
+			pr_err("Failed to create group for CPU%d\n", cpu);
+		break;
+
+	case CPU_DOWN_PREPARE:
+		/* Suspend is in progress, only remove the interface */
+		sysfs_remove_group(&dev->kobj, &mc_attr_group);
+		pr_debug("CPU%d removed\n", cpu);
+		break;
+
+	/*
+	 * case CPU_DEAD:
+	 *
+	 * When a CPU goes offline, don't free up or invalidate the copy of
+	 * the microcode in kernel memory, so that we can reuse it when the
+	 * CPU comes back online without unnecessarily requesting the userspace
+	 * for it again.
+	 */
+	}
+
+	/* The CPU refused to come up during a system resume */
+	if (action == CPU_UP_CANCELED_FROZEN)
+		microcode_fini_cpu(cpu);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __refdata mc_cpu_notifier = {
+	.notifier_call	= mc_cpu_callback,
+};
+
+#ifdef MODULE
+/* Autoload on Intel and AMD systems */
+static const struct x86_cpu_id __initconst microcode_id[] = {
+#ifdef CONFIG_MICROCODE_INTEL
+	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
+#endif
+#ifdef CONFIG_MICROCODE_AMD
+	{ X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, },
+#endif
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, microcode_id);
+#endif
+
+static struct attribute *cpu_root_microcode_attrs[] = {
+	&dev_attr_reload.attr,
+	NULL
+};
+
+static struct attribute_group cpu_root_microcode_group = {
+	.name  = "microcode",
+	.attrs = cpu_root_microcode_attrs,
+};
+
+static int __init microcode_init(void)
+{
+	struct cpuinfo_x86 *c = &cpu_data(0);
+	int error;
+
+	if (c->x86_vendor == X86_VENDOR_INTEL)
+		microcode_ops = init_intel_microcode();
+	else if (c->x86_vendor == X86_VENDOR_AMD)
+		microcode_ops = init_amd_microcode();
+	else
+		pr_err("no support for this CPU vendor\n");
+
+	if (!microcode_ops)
+		return -ENODEV;
+
+	microcode_pdev = platform_device_register_simple("microcode", -1,
+							 NULL, 0);
+	if (IS_ERR(microcode_pdev))
+		return PTR_ERR(microcode_pdev);
+
+	get_online_cpus();
+	mutex_lock(&microcode_mutex);
+
+	error = subsys_interface_register(&mc_cpu_interface);
+	if (!error)
+		perf_check_microcode();
+	mutex_unlock(&microcode_mutex);
+	put_online_cpus();
+
+	if (error)
+		goto out_pdev;
+
+	error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
+				   &cpu_root_microcode_group);
+
+	if (error) {
+		pr_err("Error creating microcode group!\n");
+		goto out_driver;
+	}
+
+	error = microcode_dev_init();
+	if (error)
+		goto out_ucode_group;
+
+	register_syscore_ops(&mc_syscore_ops);
+	register_hotcpu_notifier(&mc_cpu_notifier);
+
+	pr_info("Microcode Update Driver: v" MICROCODE_VERSION
+		" <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
+
+	return 0;
+
+ out_ucode_group:
+	sysfs_remove_group(&cpu_subsys.dev_root->kobj,
+			   &cpu_root_microcode_group);
+
+ out_driver:
+	get_online_cpus();
+	mutex_lock(&microcode_mutex);
+
+	subsys_interface_unregister(&mc_cpu_interface);
+
+	mutex_unlock(&microcode_mutex);
+	put_online_cpus();
+
+ out_pdev:
+	platform_device_unregister(microcode_pdev);
+	return error;
+
+}
+module_init(microcode_init);
+
+static void __exit microcode_exit(void)
+{
+	struct cpuinfo_x86 *c = &cpu_data(0);
+
+	microcode_dev_exit();
+
+	unregister_hotcpu_notifier(&mc_cpu_notifier);
+	unregister_syscore_ops(&mc_syscore_ops);
+
+	sysfs_remove_group(&cpu_subsys.dev_root->kobj,
+			   &cpu_root_microcode_group);
+
+	get_online_cpus();
+	mutex_lock(&microcode_mutex);
+
+	subsys_interface_unregister(&mc_cpu_interface);
+
+	mutex_unlock(&microcode_mutex);
+	put_online_cpus();
+
+	platform_device_unregister(microcode_pdev);
+
+	microcode_ops = NULL;
+
+	if (c->x86_vendor == X86_VENDOR_AMD)
+		exit_amd_microcode();
+
+	pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
+}
+module_exit(microcode_exit);
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
new file mode 100644
index 0000000..be7f851
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -0,0 +1,141 @@
+/*
+ *	X86 CPU microcode early update for Linux
+ *
+ *	Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ *			   H Peter Anvin" <hpa@zytor.com>
+ *
+ *	This driver allows to early upgrade microcode on Intel processors
+ *	belonging to IA-32 family - PentiumPro, Pentium II,
+ *	Pentium III, Xeon, Pentium 4, etc.
+ *
+ *	Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
+ *	Software Developer's Manual.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <asm/microcode_intel.h>
+#include <asm/microcode_amd.h>
+#include <asm/processor.h>
+
+#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
+#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
+#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
+#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
+#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
+#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
+#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+
+#define CPUID_IS(a, b, c, ebx, ecx, edx)	\
+		(!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
+
+/*
+ * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
+ * x86_vendor() gets vendor id for BSP.
+ *
+ * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
+ * coding, we still use x86_vendor() to get vendor id for AP.
+ *
+ * x86_vendor() gets vendor information directly through cpuid.
+ */
+static int x86_vendor(void)
+{
+	u32 eax = 0x00000000;
+	u32 ebx, ecx = 0, edx;
+
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+
+	if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
+		return X86_VENDOR_INTEL;
+
+	if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
+		return X86_VENDOR_AMD;
+
+	return X86_VENDOR_UNKNOWN;
+}
+
+static int x86_family(void)
+{
+	u32 eax = 0x00000001;
+	u32 ebx, ecx = 0, edx;
+	int x86;
+
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+
+	x86 = (eax >> 8) & 0xf;
+	if (x86 == 15)
+		x86 += (eax >> 20) & 0xff;
+
+	return x86;
+}
+
+void __init load_ucode_bsp(void)
+{
+	int vendor, x86;
+
+	if (!have_cpuid_p())
+		return;
+
+	vendor = x86_vendor();
+	x86 = x86_family();
+
+	switch (vendor) {
+	case X86_VENDOR_INTEL:
+		if (x86 >= 6)
+			load_ucode_intel_bsp();
+		break;
+	case X86_VENDOR_AMD:
+		if (x86 >= 0x10)
+			load_ucode_amd_bsp();
+		break;
+	default:
+		break;
+	}
+}
+
+void load_ucode_ap(void)
+{
+	int vendor, x86;
+
+	if (!have_cpuid_p())
+		return;
+
+	vendor = x86_vendor();
+	x86 = x86_family();
+
+	switch (vendor) {
+	case X86_VENDOR_INTEL:
+		if (x86 >= 6)
+			load_ucode_intel_ap();
+		break;
+	case X86_VENDOR_AMD:
+		if (x86 >= 0x10)
+			load_ucode_amd_ap();
+		break;
+	default:
+		break;
+	}
+}
+
+int __init save_microcode_in_initrd(void)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	switch (c->x86_vendor) {
+	case X86_VENDOR_INTEL:
+		if (c->x86 >= 6)
+			save_microcode_in_initrd_intel();
+		break;
+	case X86_VENDOR_AMD:
+		if (c->x86 >= 0x10)
+			save_microcode_in_initrd_amd();
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
new file mode 100644
index 0000000..5fb2ceb
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -0,0 +1,333 @@
+/*
+ *	Intel CPU Microcode Update Driver for Linux
+ *
+ *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *		      2006	Shaohua Li <shaohua.li@intel.com>
+ *
+ *	This driver allows to upgrade microcode on Intel processors
+ *	belonging to IA-32 family - PentiumPro, Pentium II,
+ *	Pentium III, Xeon, Pentium 4, etc.
+ *
+ *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
+ *	Software Developer's Manual
+ *	Order Number 253668 or free download from:
+ *
+ *	http://developer.intel.com/Assets/PDF/manual/253668.pdf	
+ *
+ *	For more information, go to http://www.urbanmyth.org/microcode
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	1.0	16 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Initial release.
+ *	1.01	18 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Added read() support + cleanups.
+ *	1.02	21 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Added 'device trimming' support. open(O_WRONLY) zeroes
+ *		and frees the saved copy of applied microcode.
+ *	1.03	29 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Made to use devfs (/dev/cpu/microcode) + cleanups.
+ *	1.04	06 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *		Added misc device support (now uses both devfs and misc).
+ *		Added MICROCODE_IOCFREE ioctl to clear memory.
+ *	1.05	09 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *		Messages for error cases (non Intel & no suitable microcode).
+ *	1.06	03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
+ *		Removed ->release(). Removed exclusive open and status bitmap.
+ *		Added microcode_rwsem to serialize read()/write()/ioctl().
+ *		Removed global kernel lock usage.
+ *	1.07	07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
+ *		Write 0 to 0x8B msr and then cpuid before reading revision,
+ *		so that it works even if there were no update done by the
+ *		BIOS. Otherwise, reading from 0x8B gives junk (which happened
+ *		to be 0 on my machine which is why it worked even when I
+ *		disabled update by the BIOS)
+ *		Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
+ *	1.08	11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
+ *			     Tigran Aivazian <tigran@veritas.com>
+ *		Intel Pentium 4 processor support and bugfixes.
+ *	1.09	30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
+ *		Bugfix for HT (Hyper-Threading) enabled processors
+ *		whereby processor resources are shared by all logical processors
+ *		in a single CPU package.
+ *	1.10	28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
+ *		Tigran Aivazian <tigran@veritas.com>,
+ *		Serialize updates as required on HT processors due to
+ *		speculative nature of implementation.
+ *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
+ *		Fix the panic when writing zero-length microcode chunk.
+ *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
+ *		Jun Nakajima <jun.nakajima@intel.com>
+ *		Support for the microcode updates in the new format.
+ *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
+ *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
+ *		because we no longer hold a copy of applied microcode
+ *		in kernel memory.
+ *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
+ *		Fix sigmatch() macro to handle old CPUs with pf == 0.
+ *		Thanks to Stuart Swales for pointing out this bug.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/firmware.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+
+#include <asm/microcode_intel.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+
+MODULE_DESCRIPTION("Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_LICENSE("GPL");
+
+static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu_num);
+	unsigned int val[2];
+
+	memset(csig, 0, sizeof(*csig));
+
+	csig->sig = cpuid_eax(0x00000001);
+
+	if ((c->x86_model >= 5) || (c->x86 > 6)) {
+		/* get processor flags from MSR 0x17 */
+		rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+		csig->pf = 1 << ((val[1] >> 18) & 7);
+	}
+
+	csig->rev = c->microcode;
+	pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
+		cpu_num, csig->sig, csig->pf, csig->rev);
+
+	return 0;
+}
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ */
+static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
+{
+	struct cpu_signature cpu_sig;
+	unsigned int csig, cpf, crev;
+
+	collect_cpu_info(cpu, &cpu_sig);
+
+	csig = cpu_sig.sig;
+	cpf = cpu_sig.pf;
+	crev = cpu_sig.rev;
+
+	return get_matching_microcode(csig, cpf, mc_intel, crev);
+}
+
+int apply_microcode(int cpu)
+{
+	struct microcode_intel *mc_intel;
+	struct ucode_cpu_info *uci;
+	unsigned int val[2];
+	int cpu_num = raw_smp_processor_id();
+	struct cpuinfo_x86 *c = &cpu_data(cpu_num);
+
+	uci = ucode_cpu_info + cpu;
+	mc_intel = uci->mc;
+
+	/* We should bind the task to the CPU */
+	BUG_ON(cpu_num != cpu);
+
+	if (mc_intel == NULL)
+		return 0;
+
+	/*
+	 * Microcode on this CPU could be updated earlier. Only apply the
+	 * microcode patch in mc_intel when it is newer than the one on this
+	 * CPU.
+	 */
+	if (get_matching_mc(mc_intel, cpu) == 0)
+		return 0;
+
+	/* write microcode via MSR 0x79 */
+	wrmsr(MSR_IA32_UCODE_WRITE,
+	      (unsigned long) mc_intel->bits,
+	      (unsigned long) mc_intel->bits >> 16 >> 16);
+	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+	/* As documented in the SDM: Do a CPUID 1 here */
+	sync_core();
+
+	/* get the current revision from MSR 0x8B */
+	rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+
+	if (val[1] != mc_intel->hdr.rev) {
+		pr_err("CPU%d update to revision 0x%x failed\n",
+		       cpu_num, mc_intel->hdr.rev);
+		return -1;
+	}
+	pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
+		cpu_num, val[1],
+		mc_intel->hdr.date & 0xffff,
+		mc_intel->hdr.date >> 24,
+		(mc_intel->hdr.date >> 16) & 0xff);
+
+	uci->cpu_sig.rev = val[1];
+	c->microcode = val[1];
+
+	return 0;
+}
+
+static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
+				int (*get_ucode_data)(void *, const void *, size_t))
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
+	int new_rev = uci->cpu_sig.rev;
+	unsigned int leftover = size;
+	enum ucode_state state = UCODE_OK;
+	unsigned int curr_mc_size = 0;
+	unsigned int csig, cpf;
+
+	while (leftover) {
+		struct microcode_header_intel mc_header;
+		unsigned int mc_size;
+
+		if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
+			break;
+
+		mc_size = get_totalsize(&mc_header);
+		if (!mc_size || mc_size > leftover) {
+			pr_err("error! Bad data in microcode data file\n");
+			break;
+		}
+
+		/* For performance reasons, reuse mc area when possible */
+		if (!mc || mc_size > curr_mc_size) {
+			vfree(mc);
+			mc = vmalloc(mc_size);
+			if (!mc)
+				break;
+			curr_mc_size = mc_size;
+		}
+
+		if (get_ucode_data(mc, ucode_ptr, mc_size) ||
+		    microcode_sanity_check(mc, 1) < 0) {
+			break;
+		}
+
+		csig = uci->cpu_sig.sig;
+		cpf = uci->cpu_sig.pf;
+		if (get_matching_microcode(csig, cpf, mc, new_rev)) {
+			vfree(new_mc);
+			new_rev = mc_header.rev;
+			new_mc  = mc;
+			mc = NULL;	/* trigger new vmalloc */
+		}
+
+		ucode_ptr += mc_size;
+		leftover  -= mc_size;
+	}
+
+	vfree(mc);
+
+	if (leftover) {
+		vfree(new_mc);
+		state = UCODE_ERROR;
+		goto out;
+	}
+
+	if (!new_mc) {
+		state = UCODE_NFOUND;
+		goto out;
+	}
+
+	vfree(uci->mc);
+	uci->mc = (struct microcode_intel *)new_mc;
+
+	/*
+	 * If early loading microcode is supported, save this mc into
+	 * permanent memory. So it will be loaded early when a CPU is hot added
+	 * or resumes.
+	 */
+	save_mc_for_early(new_mc);
+
+	pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
+		 cpu, new_rev, uci->cpu_sig.rev);
+out:
+	return state;
+}
+
+static int get_ucode_fw(void *to, const void *from, size_t n)
+{
+	memcpy(to, from, n);
+	return 0;
+}
+
+static enum ucode_state request_microcode_fw(int cpu, struct device *device,
+					     bool refresh_fw)
+{
+	char name[30];
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	const struct firmware *firmware;
+	enum ucode_state ret;
+
+	sprintf(name, "intel-ucode/%02x-%02x-%02x",
+		c->x86, c->x86_model, c->x86_mask);
+
+	if (request_firmware(&firmware, name, device)) {
+		pr_debug("data file %s load failed\n", name);
+		return UCODE_NFOUND;
+	}
+
+	ret = generic_load_microcode(cpu, (void *)firmware->data,
+				     firmware->size, &get_ucode_fw);
+
+	release_firmware(firmware);
+
+	return ret;
+}
+
+static int get_ucode_user(void *to, const void *from, size_t n)
+{
+	return copy_from_user(to, from, n);
+}
+
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
+{
+	return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
+}
+
+static void microcode_fini_cpu(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	vfree(uci->mc);
+	uci->mc = NULL;
+}
+
+static struct microcode_ops microcode_intel_ops = {
+	.request_microcode_user		  = request_microcode_user,
+	.request_microcode_fw             = request_microcode_fw,
+	.collect_cpu_info                 = collect_cpu_info,
+	.apply_microcode                  = apply_microcode,
+	.microcode_fini_cpu               = microcode_fini_cpu,
+};
+
+struct microcode_ops * __init init_intel_microcode(void)
+{
+	struct cpuinfo_x86 *c = &cpu_data(0);
+
+	if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+	    cpu_has(c, X86_FEATURE_IA64)) {
+		pr_err("Intel CPU family 0x%x not supported\n", c->x86);
+		return NULL;
+	}
+
+	return &microcode_intel_ops;
+}
+
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
new file mode 100644
index 0000000..18f7391
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -0,0 +1,787 @@
+/*
+ *	Intel CPU microcode early update for Linux
+ *
+ *	Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ *			   H Peter Anvin" <hpa@zytor.com>
+ *
+ *	This allows to early upgrade microcode on Intel processors
+ *	belonging to IA-32 family - PentiumPro, Pentium II,
+ *	Pentium III, Xeon, Pentium 4, etc.
+ *
+ *	Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
+ *	Software Developer's Manual.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+#include <linux/cpu.h>
+#include <asm/msr.h>
+#include <asm/microcode_intel.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+
+unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
+struct mc_saved_data {
+	unsigned int mc_saved_count;
+	struct microcode_intel **mc_saved;
+} mc_saved_data;
+
+static enum ucode_state
+generic_load_microcode_early(struct microcode_intel **mc_saved_p,
+			     unsigned int mc_saved_count,
+			     struct ucode_cpu_info *uci)
+{
+	struct microcode_intel *ucode_ptr, *new_mc = NULL;
+	int new_rev = uci->cpu_sig.rev;
+	enum ucode_state state = UCODE_OK;
+	unsigned int mc_size;
+	struct microcode_header_intel *mc_header;
+	unsigned int csig = uci->cpu_sig.sig;
+	unsigned int cpf = uci->cpu_sig.pf;
+	int i;
+
+	for (i = 0; i < mc_saved_count; i++) {
+		ucode_ptr = mc_saved_p[i];
+
+		mc_header = (struct microcode_header_intel *)ucode_ptr;
+		mc_size = get_totalsize(mc_header);
+		if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
+			new_rev = mc_header->rev;
+			new_mc  = ucode_ptr;
+		}
+	}
+
+	if (!new_mc) {
+		state = UCODE_NFOUND;
+		goto out;
+	}
+
+	uci->mc = (struct microcode_intel *)new_mc;
+out:
+	return state;
+}
+
+static void
+microcode_pointer(struct microcode_intel **mc_saved,
+		  unsigned long *mc_saved_in_initrd,
+		  unsigned long initrd_start, int mc_saved_count)
+{
+	int i;
+
+	for (i = 0; i < mc_saved_count; i++)
+		mc_saved[i] = (struct microcode_intel *)
+			      (mc_saved_in_initrd[i] + initrd_start);
+}
+
+#ifdef CONFIG_X86_32
+static void
+microcode_phys(struct microcode_intel **mc_saved_tmp,
+	       struct mc_saved_data *mc_saved_data)
+{
+	int i;
+	struct microcode_intel ***mc_saved;
+
+	mc_saved = (struct microcode_intel ***)
+		   __pa_nodebug(&mc_saved_data->mc_saved);
+	for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
+		struct microcode_intel *p;
+
+		p = *(struct microcode_intel **)
+			__pa_nodebug(mc_saved_data->mc_saved + i);
+		mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
+	}
+}
+#endif
+
+static enum ucode_state
+load_microcode(struct mc_saved_data *mc_saved_data,
+	       unsigned long *mc_saved_in_initrd,
+	       unsigned long initrd_start,
+	       struct ucode_cpu_info *uci)
+{
+	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+	unsigned int count = mc_saved_data->mc_saved_count;
+
+	if (!mc_saved_data->mc_saved) {
+		microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
+				  initrd_start, count);
+
+		return generic_load_microcode_early(mc_saved_tmp, count, uci);
+	} else {
+#ifdef CONFIG_X86_32
+		microcode_phys(mc_saved_tmp, mc_saved_data);
+		return generic_load_microcode_early(mc_saved_tmp, count, uci);
+#else
+		return generic_load_microcode_early(mc_saved_data->mc_saved,
+						    count, uci);
+#endif
+	}
+}
+
+static u8 get_x86_family(unsigned long sig)
+{
+	u8 x86;
+
+	x86 = (sig >> 8) & 0xf;
+
+	if (x86 == 0xf)
+		x86 += (sig >> 20) & 0xff;
+
+	return x86;
+}
+
+static u8 get_x86_model(unsigned long sig)
+{
+	u8 x86, x86_model;
+
+	x86 = get_x86_family(sig);
+	x86_model = (sig >> 4) & 0xf;
+
+	if (x86 == 0x6 || x86 == 0xf)
+		x86_model += ((sig >> 16) & 0xf) << 4;
+
+	return x86_model;
+}
+
+/*
+ * Given CPU signature and a microcode patch, this function finds if the
+ * microcode patch has matching family and model with the CPU.
+ */
+static enum ucode_state
+matching_model_microcode(struct microcode_header_intel *mc_header,
+			unsigned long sig)
+{
+	u8 x86, x86_model;
+	u8 x86_ucode, x86_model_ucode;
+	struct extended_sigtable *ext_header;
+	unsigned long total_size = get_totalsize(mc_header);
+	unsigned long data_size = get_datasize(mc_header);
+	int ext_sigcount, i;
+	struct extended_signature *ext_sig;
+
+	x86 = get_x86_family(sig);
+	x86_model = get_x86_model(sig);
+
+	x86_ucode = get_x86_family(mc_header->sig);
+	x86_model_ucode = get_x86_model(mc_header->sig);
+
+	if (x86 == x86_ucode && x86_model == x86_model_ucode)
+		return UCODE_OK;
+
+	/* Look for ext. headers: */
+	if (total_size <= data_size + MC_HEADER_SIZE)
+		return UCODE_NFOUND;
+
+	ext_header = (struct extended_sigtable *)
+		     mc_header + data_size + MC_HEADER_SIZE;
+	ext_sigcount = ext_header->count;
+	ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+	for (i = 0; i < ext_sigcount; i++) {
+		x86_ucode = get_x86_family(ext_sig->sig);
+		x86_model_ucode = get_x86_model(ext_sig->sig);
+
+		if (x86 == x86_ucode && x86_model == x86_model_ucode)
+			return UCODE_OK;
+
+		ext_sig++;
+	}
+
+	return UCODE_NFOUND;
+}
+
+static int
+save_microcode(struct mc_saved_data *mc_saved_data,
+	       struct microcode_intel **mc_saved_src,
+	       unsigned int mc_saved_count)
+{
+	int i, j;
+	struct microcode_intel **mc_saved_p;
+	int ret;
+
+	if (!mc_saved_count)
+		return -EINVAL;
+
+	/*
+	 * Copy new microcode data.
+	 */
+	mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
+			     GFP_KERNEL);
+	if (!mc_saved_p)
+		return -ENOMEM;
+
+	for (i = 0; i < mc_saved_count; i++) {
+		struct microcode_intel *mc = mc_saved_src[i];
+		struct microcode_header_intel *mc_header = &mc->hdr;
+		unsigned long mc_size = get_totalsize(mc_header);
+		mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
+		if (!mc_saved_p[i]) {
+			ret = -ENOMEM;
+			goto err;
+		}
+		if (!mc_saved_src[i]) {
+			ret = -EINVAL;
+			goto err;
+		}
+		memcpy(mc_saved_p[i], mc, mc_size);
+	}
+
+	/*
+	 * Point to newly saved microcode.
+	 */
+	mc_saved_data->mc_saved = mc_saved_p;
+	mc_saved_data->mc_saved_count = mc_saved_count;
+
+	return 0;
+
+err:
+	for (j = 0; j <= i; j++)
+		kfree(mc_saved_p[j]);
+	kfree(mc_saved_p);
+
+	return ret;
+}
+
+/*
+ * A microcode patch in ucode_ptr is saved into mc_saved
+ * - if it has matching signature and newer revision compared to an existing
+ *   patch mc_saved.
+ * - or if it is a newly discovered microcode patch.
+ *
+ * The microcode patch should have matching model with CPU.
+ */
+static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
+		     unsigned int *mc_saved_count_p)
+{
+	int i;
+	int found = 0;
+	unsigned int mc_saved_count = *mc_saved_count_p;
+	struct microcode_header_intel *mc_header;
+
+	mc_header = (struct microcode_header_intel *)ucode_ptr;
+	for (i = 0; i < mc_saved_count; i++) {
+		unsigned int sig, pf;
+		unsigned int new_rev;
+		struct microcode_header_intel *mc_saved_header =
+			     (struct microcode_header_intel *)mc_saved[i];
+		sig = mc_saved_header->sig;
+		pf = mc_saved_header->pf;
+		new_rev = mc_header->rev;
+
+		if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
+			found = 1;
+			if (update_match_revision(mc_header, new_rev)) {
+				/*
+				 * Found an older ucode saved before.
+				 * Replace the older one with this newer
+				 * one.
+				 */
+				mc_saved[i] =
+					(struct microcode_intel *)ucode_ptr;
+				break;
+			}
+		}
+	}
+	if (i >= mc_saved_count && !found)
+		/*
+		 * This ucode is first time discovered in ucode file.
+		 * Save it to memory.
+		 */
+		mc_saved[mc_saved_count++] =
+				 (struct microcode_intel *)ucode_ptr;
+
+	*mc_saved_count_p = mc_saved_count;
+}
+
+/*
+ * Get microcode matching with BSP's model. Only CPUs with the same model as
+ * BSP can stay in the platform.
+ */
+static enum ucode_state __init
+get_matching_model_microcode(int cpu, unsigned long start,
+			     void *data, size_t size,
+			     struct mc_saved_data *mc_saved_data,
+			     unsigned long *mc_saved_in_initrd,
+			     struct ucode_cpu_info *uci)
+{
+	u8 *ucode_ptr = data;
+	unsigned int leftover = size;
+	enum ucode_state state = UCODE_OK;
+	unsigned int mc_size;
+	struct microcode_header_intel *mc_header;
+	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+	unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
+	int i;
+
+	while (leftover) {
+		mc_header = (struct microcode_header_intel *)ucode_ptr;
+
+		mc_size = get_totalsize(mc_header);
+		if (!mc_size || mc_size > leftover ||
+			microcode_sanity_check(ucode_ptr, 0) < 0)
+			break;
+
+		leftover -= mc_size;
+
+		/*
+		 * Since APs with same family and model as the BSP may boot in
+		 * the platform, we need to find and save microcode patches
+		 * with the same family and model as the BSP.
+		 */
+		if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
+			 UCODE_OK) {
+			ucode_ptr += mc_size;
+			continue;
+		}
+
+		_save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
+
+		ucode_ptr += mc_size;
+	}
+
+	if (leftover) {
+		state = UCODE_ERROR;
+		goto out;
+	}
+
+	if (mc_saved_count == 0) {
+		state = UCODE_NFOUND;
+		goto out;
+	}
+
+	for (i = 0; i < mc_saved_count; i++)
+		mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
+
+	mc_saved_data->mc_saved_count = mc_saved_count;
+out:
+	return state;
+}
+
+static int collect_cpu_info_early(struct ucode_cpu_info *uci)
+{
+	unsigned int val[2];
+	u8 x86, x86_model;
+	struct cpu_signature csig;
+	unsigned int eax, ebx, ecx, edx;
+
+	csig.sig = 0;
+	csig.pf = 0;
+	csig.rev = 0;
+
+	memset(uci, 0, sizeof(*uci));
+
+	eax = 0x00000001;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	csig.sig = eax;
+
+	x86 = get_x86_family(csig.sig);
+	x86_model = get_x86_model(csig.sig);
+
+	if ((x86_model >= 5) || (x86 > 6)) {
+		/* get processor flags from MSR 0x17 */
+		native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+		csig.pf = 1 << ((val[1] >> 18) & 7);
+	}
+	native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+	/* As documented in the SDM: Do a CPUID 1 here */
+	sync_core();
+
+	/* get the current revision from MSR 0x8B */
+	native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+
+	csig.rev = val[1];
+
+	uci->cpu_sig = csig;
+	uci->valid = 1;
+
+	return 0;
+}
+
+#ifdef DEBUG
+static void __ref show_saved_mc(void)
+{
+	int i, j;
+	unsigned int sig, pf, rev, total_size, data_size, date;
+	struct ucode_cpu_info uci;
+
+	if (mc_saved_data.mc_saved_count == 0) {
+		pr_debug("no micorcode data saved.\n");
+		return;
+	}
+	pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
+
+	collect_cpu_info_early(&uci);
+
+	sig = uci.cpu_sig.sig;
+	pf = uci.cpu_sig.pf;
+	rev = uci.cpu_sig.rev;
+	pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
+		 smp_processor_id(), sig, pf, rev);
+
+	for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
+		struct microcode_header_intel *mc_saved_header;
+		struct extended_sigtable *ext_header;
+		int ext_sigcount;
+		struct extended_signature *ext_sig;
+
+		mc_saved_header = (struct microcode_header_intel *)
+				  mc_saved_data.mc_saved[i];
+		sig = mc_saved_header->sig;
+		pf = mc_saved_header->pf;
+		rev = mc_saved_header->rev;
+		total_size = get_totalsize(mc_saved_header);
+		data_size = get_datasize(mc_saved_header);
+		date = mc_saved_header->date;
+
+		pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n",
+			 i, sig, pf, rev, total_size,
+			 date & 0xffff,
+			 date >> 24,
+			 (date >> 16) & 0xff);
+
+		/* Look for ext. headers: */
+		if (total_size <= data_size + MC_HEADER_SIZE)
+			continue;
+
+		ext_header = (struct extended_sigtable *)
+			     mc_saved_header + data_size + MC_HEADER_SIZE;
+		ext_sigcount = ext_header->count;
+		ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+		for (j = 0; j < ext_sigcount; j++) {
+			sig = ext_sig->sig;
+			pf = ext_sig->pf;
+
+			pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
+				 j, sig, pf);
+
+			ext_sig++;
+		}
+
+	}
+}
+#else
+static inline void show_saved_mc(void)
+{
+}
+#endif
+
+#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU)
+static DEFINE_MUTEX(x86_cpu_microcode_mutex);
+/*
+ * Save this mc into mc_saved_data. So it will be loaded early when a CPU is
+ * hot added or resumes.
+ *
+ * Please make sure this mc should be a valid microcode patch before calling
+ * this function.
+ */
+int save_mc_for_early(u8 *mc)
+{
+	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+	unsigned int mc_saved_count_init;
+	unsigned int mc_saved_count;
+	struct microcode_intel **mc_saved;
+	int ret = 0;
+	int i;
+
+	/*
+	 * Hold hotplug lock so mc_saved_data is not accessed by a CPU in
+	 * hotplug.
+	 */
+	mutex_lock(&x86_cpu_microcode_mutex);
+
+	mc_saved_count_init = mc_saved_data.mc_saved_count;
+	mc_saved_count = mc_saved_data.mc_saved_count;
+	mc_saved = mc_saved_data.mc_saved;
+
+	if (mc_saved && mc_saved_count)
+		memcpy(mc_saved_tmp, mc_saved,
+		       mc_saved_count * sizeof(struct mirocode_intel *));
+	/*
+	 * Save the microcode patch mc in mc_save_tmp structure if it's a newer
+	 * version.
+	 */
+
+	_save_mc(mc_saved_tmp, mc, &mc_saved_count);
+
+	/*
+	 * Save the mc_save_tmp in global mc_saved_data.
+	 */
+	ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
+	if (ret) {
+		pr_err("Cannot save microcode patch.\n");
+		goto out;
+	}
+
+	show_saved_mc();
+
+	/*
+	 * Free old saved microcod data.
+	 */
+	if (mc_saved) {
+		for (i = 0; i < mc_saved_count_init; i++)
+			kfree(mc_saved[i]);
+		kfree(mc_saved);
+	}
+
+out:
+	mutex_unlock(&x86_cpu_microcode_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(save_mc_for_early);
+#endif
+
+static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
+static __init enum ucode_state
+scan_microcode(unsigned long start, unsigned long end,
+		struct mc_saved_data *mc_saved_data,
+		unsigned long *mc_saved_in_initrd,
+		struct ucode_cpu_info *uci)
+{
+	unsigned int size = end - start + 1;
+	struct cpio_data cd;
+	long offset = 0;
+#ifdef CONFIG_X86_32
+	char *p = (char *)__pa_nodebug(ucode_name);
+#else
+	char *p = ucode_name;
+#endif
+
+	cd.data = NULL;
+	cd.size = 0;
+
+	cd = find_cpio_data(p, (void *)start, size, &offset);
+	if (!cd.data)
+		return UCODE_ERROR;
+
+
+	return get_matching_model_microcode(0, start, cd.data, cd.size,
+					    mc_saved_data, mc_saved_in_initrd,
+					    uci);
+}
+
+/*
+ * Print ucode update info.
+ */
+static void
+print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
+{
+	int cpu = smp_processor_id();
+
+	pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
+		cpu,
+		uci->cpu_sig.rev,
+		date & 0xffff,
+		date >> 24,
+		(date >> 16) & 0xff);
+}
+
+#ifdef CONFIG_X86_32
+
+static int delay_ucode_info;
+static int current_mc_date;
+
+/*
+ * Print early updated ucode info after printk works. This is delayed info dump.
+ */
+void show_ucode_info_early(void)
+{
+	struct ucode_cpu_info uci;
+
+	if (delay_ucode_info) {
+		collect_cpu_info_early(&uci);
+		print_ucode_info(&uci, current_mc_date);
+		delay_ucode_info = 0;
+	}
+}
+
+/*
+ * At this point, we can not call printk() yet. Keep microcode patch number in
+ * mc_saved_data.mc_saved and delay printing microcode info in
+ * show_ucode_info_early() until printk() works.
+ */
+static void print_ucode(struct ucode_cpu_info *uci)
+{
+	struct microcode_intel *mc_intel;
+	int *delay_ucode_info_p;
+	int *current_mc_date_p;
+
+	mc_intel = uci->mc;
+	if (mc_intel == NULL)
+		return;
+
+	delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
+	current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
+
+	*delay_ucode_info_p = 1;
+	*current_mc_date_p = mc_intel->hdr.date;
+}
+#else
+
+/*
+ * Flush global tlb. We only do this in x86_64 where paging has been enabled
+ * already and PGE should be enabled as well.
+ */
+static inline void flush_tlb_early(void)
+{
+	__native_flush_tlb_global_irq_disabled();
+}
+
+static inline void print_ucode(struct ucode_cpu_info *uci)
+{
+	struct microcode_intel *mc_intel;
+
+	mc_intel = uci->mc;
+	if (mc_intel == NULL)
+		return;
+
+	print_ucode_info(uci, mc_intel->hdr.date);
+}
+#endif
+
+static int apply_microcode_early(struct mc_saved_data *mc_saved_data,
+				 struct ucode_cpu_info *uci)
+{
+	struct microcode_intel *mc_intel;
+	unsigned int val[2];
+
+	mc_intel = uci->mc;
+	if (mc_intel == NULL)
+		return 0;
+
+	/* write microcode via MSR 0x79 */
+	native_wrmsr(MSR_IA32_UCODE_WRITE,
+	      (unsigned long) mc_intel->bits,
+	      (unsigned long) mc_intel->bits >> 16 >> 16);
+	native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+	/* As documented in the SDM: Do a CPUID 1 here */
+	sync_core();
+
+	/* get the current revision from MSR 0x8B */
+	native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+	if (val[1] != mc_intel->hdr.rev)
+		return -1;
+
+#ifdef CONFIG_X86_64
+	/* Flush global tlb. This is precaution. */
+	flush_tlb_early();
+#endif
+	uci->cpu_sig.rev = val[1];
+
+	print_ucode(uci);
+
+	return 0;
+}
+
+/*
+ * This function converts microcode patch offsets previously stored in
+ * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
+ */
+int __init save_microcode_in_initrd_intel(void)
+{
+	unsigned int count = mc_saved_data.mc_saved_count;
+	struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
+	int ret = 0;
+
+	if (count == 0)
+		return ret;
+
+	microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
+	ret = save_microcode(&mc_saved_data, mc_saved, count);
+	if (ret)
+		pr_err("Cannot save microcode patches from initrd.\n");
+
+	show_saved_mc();
+
+	return ret;
+}
+
+static void __init
+_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
+		      unsigned long *mc_saved_in_initrd,
+		      unsigned long initrd_start_early,
+		      unsigned long initrd_end_early,
+		      struct ucode_cpu_info *uci)
+{
+	collect_cpu_info_early(uci);
+	scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
+		       mc_saved_in_initrd, uci);
+	load_microcode(mc_saved_data, mc_saved_in_initrd,
+		       initrd_start_early, uci);
+	apply_microcode_early(mc_saved_data, uci);
+}
+
+void __init
+load_ucode_intel_bsp(void)
+{
+	u64 ramdisk_image, ramdisk_size;
+	unsigned long initrd_start_early, initrd_end_early;
+	struct ucode_cpu_info uci;
+#ifdef CONFIG_X86_32
+	struct boot_params *boot_params_p;
+
+	boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params);
+	ramdisk_image = boot_params_p->hdr.ramdisk_image;
+	ramdisk_size  = boot_params_p->hdr.ramdisk_size;
+	initrd_start_early = ramdisk_image;
+	initrd_end_early = initrd_start_early + ramdisk_size;
+
+	_load_ucode_intel_bsp(
+		(struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+		(unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
+		initrd_start_early, initrd_end_early, &uci);
+#else
+	ramdisk_image = boot_params.hdr.ramdisk_image;
+	ramdisk_size  = boot_params.hdr.ramdisk_size;
+	initrd_start_early = ramdisk_image + PAGE_OFFSET;
+	initrd_end_early = initrd_start_early + ramdisk_size;
+
+	_load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
+			      initrd_start_early, initrd_end_early, &uci);
+#endif
+}
+
+void load_ucode_intel_ap(void)
+{
+	struct mc_saved_data *mc_saved_data_p;
+	struct ucode_cpu_info uci;
+	unsigned long *mc_saved_in_initrd_p;
+	unsigned long initrd_start_addr;
+#ifdef CONFIG_X86_32
+	unsigned long *initrd_start_p;
+
+	mc_saved_in_initrd_p =
+		(unsigned long *)__pa_nodebug(mc_saved_in_initrd);
+	mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
+	initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
+	initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
+#else
+	mc_saved_data_p = &mc_saved_data;
+	mc_saved_in_initrd_p = mc_saved_in_initrd;
+	initrd_start_addr = initrd_start;
+#endif
+
+	/*
+	 * If there is no valid ucode previously saved in memory, no need to
+	 * update ucode on this AP.
+	 */
+	if (mc_saved_data_p->mc_saved_count == 0)
+		return;
+
+	collect_cpu_info_early(&uci);
+	load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
+		       initrd_start_addr, &uci);
+	apply_microcode_early(mc_saved_data_p, &uci);
+}
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
new file mode 100644
index 0000000..ce69320
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -0,0 +1,174 @@
+/*
+ *	Intel CPU Microcode Update Driver for Linux
+ *
+ *	Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ *			   H Peter Anvin" <hpa@zytor.com>
+ *
+ *	This driver allows to upgrade microcode on Intel processors
+ *	belonging to IA-32 family - PentiumPro, Pentium II,
+ *	Pentium III, Xeon, Pentium 4, etc.
+ *
+ *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
+ *	Software Developer's Manual
+ *	Order Number 253668 or free download from:
+ *
+ *	http://developer.intel.com/Assets/PDF/manual/253668.pdf
+ *
+ *	For more information, go to http://www.urbanmyth.org/microcode
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ */
+#include <linux/firmware.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/microcode_intel.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+
+static inline int
+update_match_cpu(unsigned int csig, unsigned int cpf,
+		 unsigned int sig, unsigned int pf)
+{
+	return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
+}
+
+int
+update_match_revision(struct microcode_header_intel *mc_header, int rev)
+{
+	return (mc_header->rev <= rev) ? 0 : 1;
+}
+
+int microcode_sanity_check(void *mc, int print_err)
+{
+	unsigned long total_size, data_size, ext_table_size;
+	struct microcode_header_intel *mc_header = mc;
+	struct extended_sigtable *ext_header = NULL;
+	int sum, orig_sum, ext_sigcount = 0, i;
+	struct extended_signature *ext_sig;
+
+	total_size = get_totalsize(mc_header);
+	data_size = get_datasize(mc_header);
+
+	if (data_size + MC_HEADER_SIZE > total_size) {
+		if (print_err)
+			pr_err("error! Bad data size in microcode data file\n");
+		return -EINVAL;
+	}
+
+	if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+		if (print_err)
+			pr_err("error! Unknown microcode update format\n");
+		return -EINVAL;
+	}
+	ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+	if (ext_table_size) {
+		if ((ext_table_size < EXT_HEADER_SIZE)
+		 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+			if (print_err)
+				pr_err("error! Small exttable size in microcode data file\n");
+			return -EINVAL;
+		}
+		ext_header = mc + MC_HEADER_SIZE + data_size;
+		if (ext_table_size != exttable_size(ext_header)) {
+			if (print_err)
+				pr_err("error! Bad exttable size in microcode data file\n");
+			return -EFAULT;
+		}
+		ext_sigcount = ext_header->count;
+	}
+
+	/* check extended table checksum */
+	if (ext_table_size) {
+		int ext_table_sum = 0;
+		int *ext_tablep = (int *)ext_header;
+
+		i = ext_table_size / DWSIZE;
+		while (i--)
+			ext_table_sum += ext_tablep[i];
+		if (ext_table_sum) {
+			if (print_err)
+				pr_warn("aborting, bad extended signature table checksum\n");
+			return -EINVAL;
+		}
+	}
+
+	/* calculate the checksum */
+	orig_sum = 0;
+	i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+	while (i--)
+		orig_sum += ((int *)mc)[i];
+	if (orig_sum) {
+		if (print_err)
+			pr_err("aborting, bad checksum\n");
+		return -EINVAL;
+	}
+	if (!ext_table_size)
+		return 0;
+	/* check extended signature checksum */
+	for (i = 0; i < ext_sigcount; i++) {
+		ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+			  EXT_SIGNATURE_SIZE * i;
+		sum = orig_sum
+			- (mc_header->sig + mc_header->pf + mc_header->cksum)
+			+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+		if (sum) {
+			if (print_err)
+				pr_err("aborting, bad checksum\n");
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(microcode_sanity_check);
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ */
+int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
+{
+	struct microcode_header_intel *mc_header = mc;
+	struct extended_sigtable *ext_header;
+	unsigned long total_size = get_totalsize(mc_header);
+	int ext_sigcount, i;
+	struct extended_signature *ext_sig;
+
+	if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf))
+		return 1;
+
+	/* Look for ext. headers: */
+	if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+		return 0;
+
+	ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
+	ext_sigcount = ext_header->count;
+	ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+	for (i = 0; i < ext_sigcount; i++) {
+		if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf))
+			return 1;
+		ext_sig++;
+	}
+	return 0;
+}
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ */
+int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
+{
+	struct microcode_header_intel *mc_header = mc;
+
+	if (!update_match_revision(mc_header, rev))
+		return 0;
+
+	return get_matching_sig(csig, cpf, mc, rev);
+}
+EXPORT_SYMBOL_GPL(get_matching_microcode);
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
deleted file mode 100644
index 4a6ff74..0000000
--- a/arch/x86/kernel/microcode_amd.c
+++ /dev/null
@@ -1,492 +0,0 @@
-/*
- *  AMD CPU Microcode Update Driver for Linux
- *  Copyright (C) 2008-2011 Advanced Micro Devices Inc.
- *
- *  Author: Peter Oruba <peter.oruba@amd.com>
- *
- *  Based on work by:
- *  Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *
- *  Maintainers:
- *  Andreas Herrmann <herrmann.der.user@googlemail.com>
- *  Borislav Petkov <bp@alien8.de>
- *
- *  This driver allows to upgrade microcode on F10h AMD
- *  CPUs and later.
- *
- *  Licensed under the terms of the GNU General Public
- *  License version 2. See file COPYING for details.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/firmware.h>
-#include <linux/pci_ids.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-
-#include <asm/microcode.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/microcode_amd.h>
-
-MODULE_DESCRIPTION("AMD Microcode Update Driver");
-MODULE_AUTHOR("Peter Oruba");
-MODULE_LICENSE("GPL v2");
-
-static struct equiv_cpu_entry *equiv_cpu_table;
-
-struct ucode_patch {
-	struct list_head plist;
-	void *data;
-	u32 patch_id;
-	u16 equiv_cpu;
-};
-
-static LIST_HEAD(pcache);
-
-static u16 __find_equiv_id(unsigned int cpu)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	return find_equiv_id(equiv_cpu_table, uci->cpu_sig.sig);
-}
-
-static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu)
-{
-	int i = 0;
-
-	BUG_ON(!equiv_cpu_table);
-
-	while (equiv_cpu_table[i].equiv_cpu != 0) {
-		if (equiv_cpu == equiv_cpu_table[i].equiv_cpu)
-			return equiv_cpu_table[i].installed_cpu;
-		i++;
-	}
-	return 0;
-}
-
-/*
- * a small, trivial cache of per-family ucode patches
- */
-static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
-{
-	struct ucode_patch *p;
-
-	list_for_each_entry(p, &pcache, plist)
-		if (p->equiv_cpu == equiv_cpu)
-			return p;
-	return NULL;
-}
-
-static void update_cache(struct ucode_patch *new_patch)
-{
-	struct ucode_patch *p;
-
-	list_for_each_entry(p, &pcache, plist) {
-		if (p->equiv_cpu == new_patch->equiv_cpu) {
-			if (p->patch_id >= new_patch->patch_id)
-				/* we already have the latest patch */
-				return;
-
-			list_replace(&p->plist, &new_patch->plist);
-			kfree(p->data);
-			kfree(p);
-			return;
-		}
-	}
-	/* no patch found, add it */
-	list_add_tail(&new_patch->plist, &pcache);
-}
-
-static void free_cache(void)
-{
-	struct ucode_patch *p, *tmp;
-
-	list_for_each_entry_safe(p, tmp, &pcache, plist) {
-		__list_del(p->plist.prev, p->plist.next);
-		kfree(p->data);
-		kfree(p);
-	}
-}
-
-static struct ucode_patch *find_patch(unsigned int cpu)
-{
-	u16 equiv_id;
-
-	equiv_id = __find_equiv_id(cpu);
-	if (!equiv_id)
-		return NULL;
-
-	return cache_find_patch(equiv_id);
-}
-
-static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
-{
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	struct ucode_patch *p;
-
-	csig->sig = cpuid_eax(0x00000001);
-	csig->rev = c->microcode;
-
-	/*
-	 * a patch could have been loaded early, set uci->mc so that
-	 * mc_bp_resume() can call apply_microcode()
-	 */
-	p = find_patch(cpu);
-	if (p && (p->patch_id == csig->rev))
-		uci->mc = p->data;
-
-	pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
-
-	return 0;
-}
-
-static unsigned int verify_patch_size(u8 family, u32 patch_size,
-				      unsigned int size)
-{
-	u32 max_size;
-
-#define F1XH_MPB_MAX_SIZE 2048
-#define F14H_MPB_MAX_SIZE 1824
-#define F15H_MPB_MAX_SIZE 4096
-#define F16H_MPB_MAX_SIZE 3458
-
-	switch (family) {
-	case 0x14:
-		max_size = F14H_MPB_MAX_SIZE;
-		break;
-	case 0x15:
-		max_size = F15H_MPB_MAX_SIZE;
-		break;
-	case 0x16:
-		max_size = F16H_MPB_MAX_SIZE;
-		break;
-	default:
-		max_size = F1XH_MPB_MAX_SIZE;
-		break;
-	}
-
-	if (patch_size > min_t(u32, size, max_size)) {
-		pr_err("patch size mismatch\n");
-		return 0;
-	}
-
-	return patch_size;
-}
-
-int __apply_microcode_amd(struct microcode_amd *mc_amd)
-{
-	u32 rev, dummy;
-
-	native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
-
-	/* verify patch application was successful */
-	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-	if (rev != mc_amd->hdr.patch_id)
-		return -1;
-
-	return 0;
-}
-
-int apply_microcode_amd(int cpu)
-{
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-	struct microcode_amd *mc_amd;
-	struct ucode_cpu_info *uci;
-	struct ucode_patch *p;
-	u32 rev, dummy;
-
-	BUG_ON(raw_smp_processor_id() != cpu);
-
-	uci = ucode_cpu_info + cpu;
-
-	p = find_patch(cpu);
-	if (!p)
-		return 0;
-
-	mc_amd  = p->data;
-	uci->mc = p->data;
-
-	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
-	/* need to apply patch? */
-	if (rev >= mc_amd->hdr.patch_id) {
-		c->microcode = rev;
-		uci->cpu_sig.rev = rev;
-		return 0;
-	}
-
-	if (__apply_microcode_amd(mc_amd)) {
-		pr_err("CPU%d: update failed for patch_level=0x%08x\n",
-			cpu, mc_amd->hdr.patch_id);
-		return -1;
-	}
-	pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
-		mc_amd->hdr.patch_id);
-
-	uci->cpu_sig.rev = mc_amd->hdr.patch_id;
-	c->microcode = mc_amd->hdr.patch_id;
-
-	return 0;
-}
-
-static int install_equiv_cpu_table(const u8 *buf)
-{
-	unsigned int *ibuf = (unsigned int *)buf;
-	unsigned int type = ibuf[1];
-	unsigned int size = ibuf[2];
-
-	if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
-		pr_err("empty section/"
-		       "invalid type field in container file section header\n");
-		return -EINVAL;
-	}
-
-	equiv_cpu_table = vmalloc(size);
-	if (!equiv_cpu_table) {
-		pr_err("failed to allocate equivalent CPU table\n");
-		return -ENOMEM;
-	}
-
-	memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
-
-	/* add header length */
-	return size + CONTAINER_HDR_SZ;
-}
-
-static void free_equiv_cpu_table(void)
-{
-	vfree(equiv_cpu_table);
-	equiv_cpu_table = NULL;
-}
-
-static void cleanup(void)
-{
-	free_equiv_cpu_table();
-	free_cache();
-}
-
-/*
- * We return the current size even if some of the checks failed so that
- * we can skip over the next patch. If we return a negative value, we
- * signal a grave error like a memory allocation has failed and the
- * driver cannot continue functioning normally. In such cases, we tear
- * down everything we've used up so far and exit.
- */
-static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
-{
-	struct microcode_header_amd *mc_hdr;
-	struct ucode_patch *patch;
-	unsigned int patch_size, crnt_size, ret;
-	u32 proc_fam;
-	u16 proc_id;
-
-	patch_size  = *(u32 *)(fw + 4);
-	crnt_size   = patch_size + SECTION_HDR_SIZE;
-	mc_hdr	    = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
-	proc_id	    = mc_hdr->processor_rev_id;
-
-	proc_fam = find_cpu_family_by_equiv_cpu(proc_id);
-	if (!proc_fam) {
-		pr_err("No patch family for equiv ID: 0x%04x\n", proc_id);
-		return crnt_size;
-	}
-
-	/* check if patch is for the current family */
-	proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
-	if (proc_fam != family)
-		return crnt_size;
-
-	if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
-		pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n",
-			mc_hdr->patch_id);
-		return crnt_size;
-	}
-
-	ret = verify_patch_size(family, patch_size, leftover);
-	if (!ret) {
-		pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
-		return crnt_size;
-	}
-
-	patch = kzalloc(sizeof(*patch), GFP_KERNEL);
-	if (!patch) {
-		pr_err("Patch allocation failure.\n");
-		return -EINVAL;
-	}
-
-	patch->data = kzalloc(patch_size, GFP_KERNEL);
-	if (!patch->data) {
-		pr_err("Patch data allocation failure.\n");
-		kfree(patch);
-		return -EINVAL;
-	}
-
-	/* All looks ok, copy patch... */
-	memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size);
-	INIT_LIST_HEAD(&patch->plist);
-	patch->patch_id  = mc_hdr->patch_id;
-	patch->equiv_cpu = proc_id;
-
-	pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
-		 __func__, patch->patch_id, proc_id);
-
-	/* ... and add to cache. */
-	update_cache(patch);
-
-	return crnt_size;
-}
-
-static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
-					     size_t size)
-{
-	enum ucode_state ret = UCODE_ERROR;
-	unsigned int leftover;
-	u8 *fw = (u8 *)data;
-	int crnt_size = 0;
-	int offset;
-
-	offset = install_equiv_cpu_table(data);
-	if (offset < 0) {
-		pr_err("failed to create equivalent cpu table\n");
-		return ret;
-	}
-	fw += offset;
-	leftover = size - offset;
-
-	if (*(u32 *)fw != UCODE_UCODE_TYPE) {
-		pr_err("invalid type field in container file section header\n");
-		free_equiv_cpu_table();
-		return ret;
-	}
-
-	while (leftover) {
-		crnt_size = verify_and_add_patch(family, fw, leftover);
-		if (crnt_size < 0)
-			return ret;
-
-		fw	 += crnt_size;
-		leftover -= crnt_size;
-	}
-
-	return UCODE_OK;
-}
-
-enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
-{
-	enum ucode_state ret;
-
-	/* free old equiv table */
-	free_equiv_cpu_table();
-
-	ret = __load_microcode_amd(family, data, size);
-
-	if (ret != UCODE_OK)
-		cleanup();
-
-#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32)
-	/* save BSP's matching patch for early load */
-	if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) {
-		struct ucode_patch *p = find_patch(smp_processor_id());
-		if (p) {
-			memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
-			memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
-							       PATCH_MAX_SIZE));
-		}
-	}
-#endif
-	return ret;
-}
-
-/*
- * AMD microcode firmware naming convention, up to family 15h they are in
- * the legacy file:
- *
- *    amd-ucode/microcode_amd.bin
- *
- * This legacy file is always smaller than 2K in size.
- *
- * Beginning with family 15h, they are in family-specific firmware files:
- *
- *    amd-ucode/microcode_amd_fam15h.bin
- *    amd-ucode/microcode_amd_fam16h.bin
- *    ...
- *
- * These might be larger than 2K.
- */
-static enum ucode_state request_microcode_amd(int cpu, struct device *device,
-					      bool refresh_fw)
-{
-	char fw_name[36] = "amd-ucode/microcode_amd.bin";
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-	enum ucode_state ret = UCODE_NFOUND;
-	const struct firmware *fw;
-
-	/* reload ucode container only on the boot cpu */
-	if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
-		return UCODE_OK;
-
-	if (c->x86 >= 0x15)
-		snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
-
-	if (request_firmware(&fw, (const char *)fw_name, device)) {
-		pr_debug("failed to load file %s\n", fw_name);
-		goto out;
-	}
-
-	ret = UCODE_ERROR;
-	if (*(u32 *)fw->data != UCODE_MAGIC) {
-		pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
-		goto fw_release;
-	}
-
-	ret = load_microcode_amd(c->x86, fw->data, fw->size);
-
- fw_release:
-	release_firmware(fw);
-
- out:
-	return ret;
-}
-
-static enum ucode_state
-request_microcode_user(int cpu, const void __user *buf, size_t size)
-{
-	return UCODE_ERROR;
-}
-
-static void microcode_fini_cpu_amd(int cpu)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-	uci->mc = NULL;
-}
-
-static struct microcode_ops microcode_amd_ops = {
-	.request_microcode_user           = request_microcode_user,
-	.request_microcode_fw             = request_microcode_amd,
-	.collect_cpu_info                 = collect_cpu_info_amd,
-	.apply_microcode                  = apply_microcode_amd,
-	.microcode_fini_cpu               = microcode_fini_cpu_amd,
-};
-
-struct microcode_ops * __init init_amd_microcode(void)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-
-	if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
-		pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
-		return NULL;
-	}
-
-	return &microcode_amd_ops;
-}
-
-void __exit exit_amd_microcode(void)
-{
-	cleanup();
-}
diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/microcode_amd_early.c
deleted file mode 100644
index 8384c0f..0000000
--- a/arch/x86/kernel/microcode_amd_early.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Jacob Shin <jacob.shin@amd.com>
- * Fixes: Borislav Petkov <bp@suse.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/earlycpio.h>
-#include <linux/initrd.h>
-
-#include <asm/cpu.h>
-#include <asm/setup.h>
-#include <asm/microcode_amd.h>
-
-/*
- * This points to the current valid container of microcode patches which we will
- * save from the initrd before jettisoning its contents.
- */
-static u8 *container;
-static size_t container_size;
-
-static u32 ucode_new_rev;
-u8 amd_ucode_patch[PATCH_MAX_SIZE];
-static u16 this_equiv_id;
-
-struct cpio_data ucode_cpio;
-
-/*
- * Microcode patch container file is prepended to the initrd in cpio format.
- * See Documentation/x86/early-microcode.txt
- */
-static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
-
-static struct cpio_data __init find_ucode_in_initrd(void)
-{
-	long offset = 0;
-	char *path;
-	void *start;
-	size_t size;
-
-#ifdef CONFIG_X86_32
-	struct boot_params *p;
-
-	/*
-	 * On 32-bit, early load occurs before paging is turned on so we need
-	 * to use physical addresses.
-	 */
-	p       = (struct boot_params *)__pa_nodebug(&boot_params);
-	path    = (char *)__pa_nodebug(ucode_path);
-	start   = (void *)p->hdr.ramdisk_image;
-	size    = p->hdr.ramdisk_size;
-#else
-	path    = ucode_path;
-	start   = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
-	size    = boot_params.hdr.ramdisk_size;
-#endif
-
-	return find_cpio_data(path, start, size, &offset);
-}
-
-static size_t compute_container_size(u8 *data, u32 total_size)
-{
-	size_t size = 0;
-	u32 *header = (u32 *)data;
-
-	if (header[0] != UCODE_MAGIC ||
-	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
-	    header[2] == 0)                            /* size */
-		return size;
-
-	size = header[2] + CONTAINER_HDR_SZ;
-	total_size -= size;
-	data += size;
-
-	while (total_size) {
-		u16 patch_size;
-
-		header = (u32 *)data;
-
-		if (header[0] != UCODE_UCODE_TYPE)
-			break;
-
-		/*
-		 * Sanity-check patch size.
-		 */
-		patch_size = header[1];
-		if (patch_size > PATCH_MAX_SIZE)
-			break;
-
-		size	   += patch_size + SECTION_HDR_SIZE;
-		data	   += patch_size + SECTION_HDR_SIZE;
-		total_size -= patch_size + SECTION_HDR_SIZE;
-	}
-
-	return size;
-}
-
-/*
- * Early load occurs before we can vmalloc(). So we look for the microcode
- * patch container file in initrd, traverse equivalent cpu table, look for a
- * matching microcode patch, and update, all in initrd memory in place.
- * When vmalloc() is available for use later -- on 64-bit during first AP load,
- * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
- * load_microcode_amd() to save equivalent cpu table and microcode patches in
- * kernel heap memory.
- */
-static void apply_ucode_in_initrd(void *ucode, size_t size)
-{
-	struct equiv_cpu_entry *eq;
-	size_t *cont_sz;
-	u32 *header;
-	u8  *data, **cont;
-	u16 eq_id = 0;
-	int offset, left;
-	u32 rev, eax, ebx, ecx, edx;
-	u32 *new_rev;
-
-#ifdef CONFIG_X86_32
-	new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
-	cont_sz = (size_t *)__pa_nodebug(&container_size);
-	cont	= (u8 **)__pa_nodebug(&container);
-#else
-	new_rev = &ucode_new_rev;
-	cont_sz = &container_size;
-	cont	= &container;
-#endif
-
-	data   = ucode;
-	left   = size;
-	header = (u32 *)data;
-
-	/* find equiv cpu table */
-	if (header[0] != UCODE_MAGIC ||
-	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
-	    header[2] == 0)                            /* size */
-		return;
-
-	eax = 0x00000001;
-	ecx = 0;
-	native_cpuid(&eax, &ebx, &ecx, &edx);
-
-	while (left > 0) {
-		eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
-
-		*cont = data;
-
-		/* Advance past the container header */
-		offset = header[2] + CONTAINER_HDR_SZ;
-		data  += offset;
-		left  -= offset;
-
-		eq_id = find_equiv_id(eq, eax);
-		if (eq_id) {
-			this_equiv_id = eq_id;
-			*cont_sz = compute_container_size(*cont, left + offset);
-
-			/*
-			 * truncate how much we need to iterate over in the
-			 * ucode update loop below
-			 */
-			left = *cont_sz - offset;
-			break;
-		}
-
-		/*
-		 * support multiple container files appended together. if this
-		 * one does not have a matching equivalent cpu entry, we fast
-		 * forward to the next container file.
-		 */
-		while (left > 0) {
-			header = (u32 *)data;
-			if (header[0] == UCODE_MAGIC &&
-			    header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
-				break;
-
-			offset = header[1] + SECTION_HDR_SIZE;
-			data  += offset;
-			left  -= offset;
-		}
-
-		/* mark where the next microcode container file starts */
-		offset    = data - (u8 *)ucode;
-		ucode     = data;
-	}
-
-	if (!eq_id) {
-		*cont = NULL;
-		*cont_sz = 0;
-		return;
-	}
-
-	/* find ucode and update if needed */
-
-	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
-
-	while (left > 0) {
-		struct microcode_amd *mc;
-
-		header = (u32 *)data;
-		if (header[0] != UCODE_UCODE_TYPE || /* type */
-		    header[1] == 0)                  /* size */
-			break;
-
-		mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
-
-		if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
-
-			if (!__apply_microcode_amd(mc)) {
-				rev = mc->hdr.patch_id;
-				*new_rev = rev;
-
-				/* save ucode patch */
-				memcpy(amd_ucode_patch, mc,
-				       min_t(u32, header[1], PATCH_MAX_SIZE));
-			}
-		}
-
-		offset  = header[1] + SECTION_HDR_SIZE;
-		data   += offset;
-		left   -= offset;
-	}
-}
-
-void __init load_ucode_amd_bsp(void)
-{
-	struct cpio_data cp;
-	void **data;
-	size_t *size;
-
-#ifdef CONFIG_X86_32
-	data =  (void **)__pa_nodebug(&ucode_cpio.data);
-	size = (size_t *)__pa_nodebug(&ucode_cpio.size);
-#else
-	data = &ucode_cpio.data;
-	size = &ucode_cpio.size;
-#endif
-
-	cp = find_ucode_in_initrd();
-	if (!cp.data)
-		return;
-
-	*data = cp.data;
-	*size = cp.size;
-
-	apply_ucode_in_initrd(cp.data, cp.size);
-}
-
-#ifdef CONFIG_X86_32
-/*
- * On 32-bit, since AP's early load occurs before paging is turned on, we
- * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
- * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
- * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
- * which is used upon resume from suspend.
- */
-void load_ucode_amd_ap(void)
-{
-	struct microcode_amd *mc;
-	size_t *usize;
-	void **ucode;
-
-	mc = (struct microcode_amd *)__pa(amd_ucode_patch);
-	if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
-		__apply_microcode_amd(mc);
-		return;
-	}
-
-	ucode = (void *)__pa_nodebug(&container);
-	usize = (size_t *)__pa_nodebug(&container_size);
-
-	if (!*ucode || !*usize)
-		return;
-
-	apply_ucode_in_initrd(*ucode, *usize);
-}
-
-static void __init collect_cpu_sig_on_bsp(void *arg)
-{
-	unsigned int cpu = smp_processor_id();
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-	uci->cpu_sig.sig = cpuid_eax(0x00000001);
-}
-#else
-void load_ucode_amd_ap(void)
-{
-	unsigned int cpu = smp_processor_id();
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	struct equiv_cpu_entry *eq;
-	struct microcode_amd *mc;
-	u32 rev, eax;
-	u16 eq_id;
-
-	/* Exit if called on the BSP. */
-	if (!cpu)
-		return;
-
-	if (!container)
-		return;
-
-	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
-
-	uci->cpu_sig.rev = rev;
-	uci->cpu_sig.sig = eax;
-
-	eax = cpuid_eax(0x00000001);
-	eq  = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
-
-	eq_id = find_equiv_id(eq, eax);
-	if (!eq_id)
-		return;
-
-	if (eq_id == this_equiv_id) {
-		mc = (struct microcode_amd *)amd_ucode_patch;
-
-		if (mc && rev < mc->hdr.patch_id) {
-			if (!__apply_microcode_amd(mc))
-				ucode_new_rev = mc->hdr.patch_id;
-		}
-
-	} else {
-		if (!ucode_cpio.data)
-			return;
-
-		/*
-		 * AP has a different equivalence ID than BSP, looks like
-		 * mixed-steppings silicon so go through the ucode blob anew.
-		 */
-		apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size);
-	}
-}
-#endif
-
-int __init save_microcode_in_initrd_amd(void)
-{
-	enum ucode_state ret;
-	u32 eax;
-
-#ifdef CONFIG_X86_32
-	unsigned int bsp = boot_cpu_data.cpu_index;
-	struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
-
-	if (!uci->cpu_sig.sig)
-		smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
-
-	/*
-	 * Take into account the fact that the ramdisk might get relocated
-	 * and therefore we need to recompute the container's position in
-	 * virtual memory space.
-	 */
-	container = (u8 *)(__va((u32)relocated_ramdisk) +
-			   ((u32)container - boot_params.hdr.ramdisk_image));
-#endif
-	if (ucode_new_rev)
-		pr_info("microcode: updated early to new patch_level=0x%08x\n",
-			ucode_new_rev);
-
-	if (!container)
-		return -EINVAL;
-
-	eax   = cpuid_eax(0x00000001);
-	eax   = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
-
-	ret = load_microcode_amd(eax, container, container_size);
-	if (ret != UCODE_OK)
-		return -EINVAL;
-
-	/*
-	 * This will be freed any msec now, stash patches for the current
-	 * family and switch to patch cache for cpu hotplug, etc later.
-	 */
-	container = NULL;
-	container_size = 0;
-
-	return 0;
-}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
deleted file mode 100644
index 15c9876..0000000
--- a/arch/x86/kernel/microcode_core.c
+++ /dev/null
@@ -1,645 +0,0 @@
-/*
- *	Intel CPU Microcode Update Driver for Linux
- *
- *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *		      2006	Shaohua Li <shaohua.li@intel.com>
- *
- *	This driver allows to upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II,
- *	Pentium III, Xeon, Pentium 4, etc.
- *
- *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- *	Software Developer's Manual
- *	Order Number 253668 or free download from:
- *
- *	http://developer.intel.com/Assets/PDF/manual/253668.pdf	
- *
- *	For more information, go to http://www.urbanmyth.org/microcode
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- *	1.0	16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Initial release.
- *	1.01	18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added read() support + cleanups.
- *	1.02	21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added 'device trimming' support. open(O_WRONLY) zeroes
- *		and frees the saved copy of applied microcode.
- *	1.03	29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Made to use devfs (/dev/cpu/microcode) + cleanups.
- *	1.04	06 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Added misc device support (now uses both devfs and misc).
- *		Added MICROCODE_IOCFREE ioctl to clear memory.
- *	1.05	09 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Messages for error cases (non Intel & no suitable microcode).
- *	1.06	03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- *		Removed ->release(). Removed exclusive open and status bitmap.
- *		Added microcode_rwsem to serialize read()/write()/ioctl().
- *		Removed global kernel lock usage.
- *	1.07	07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- *		Write 0 to 0x8B msr and then cpuid before reading revision,
- *		so that it works even if there were no update done by the
- *		BIOS. Otherwise, reading from 0x8B gives junk (which happened
- *		to be 0 on my machine which is why it worked even when I
- *		disabled update by the BIOS)
- *		Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- *	1.08	11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- *			     Tigran Aivazian <tigran@veritas.com>
- *		Intel Pentium 4 processor support and bugfixes.
- *	1.09	30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- *		Bugfix for HT (Hyper-Threading) enabled processors
- *		whereby processor resources are shared by all logical processors
- *		in a single CPU package.
- *	1.10	28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- *		Tigran Aivazian <tigran@veritas.com>,
- *		Serialize updates as required on HT processors due to
- *		speculative nature of implementation.
- *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- *		Fix the panic when writing zero-length microcode chunk.
- *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- *		Jun Nakajima <jun.nakajima@intel.com>
- *		Support for the microcode updates in the new format.
- *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- *		because we no longer hold a copy of applied microcode
- *		in kernel memory.
- *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- *		Fix sigmatch() macro to handle old CPUs with pf == 0.
- *		Thanks to Stuart Swales for pointing out this bug.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/platform_device.h>
-#include <linux/miscdevice.h>
-#include <linux/capability.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/cpu.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/syscore_ops.h>
-
-#include <asm/microcode.h>
-#include <asm/processor.h>
-#include <asm/cpu_device_id.h>
-#include <asm/perf_event.h>
-
-MODULE_DESCRIPTION("Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
-
-#define MICROCODE_VERSION	"2.00"
-
-static struct microcode_ops	*microcode_ops;
-
-/*
- * Synchronization.
- *
- * All non cpu-hotplug-callback call sites use:
- *
- * - microcode_mutex to synchronize with each other;
- * - get/put_online_cpus() to synchronize with
- *   the cpu-hotplug-callback call sites.
- *
- * We guarantee that only a single cpu is being
- * updated at any particular moment of time.
- */
-static DEFINE_MUTEX(microcode_mutex);
-
-struct ucode_cpu_info		ucode_cpu_info[NR_CPUS];
-EXPORT_SYMBOL_GPL(ucode_cpu_info);
-
-/*
- * Operations that are run on a target cpu:
- */
-
-struct cpu_info_ctx {
-	struct cpu_signature	*cpu_sig;
-	int			err;
-};
-
-static void collect_cpu_info_local(void *arg)
-{
-	struct cpu_info_ctx *ctx = arg;
-
-	ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
-						   ctx->cpu_sig);
-}
-
-static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
-{
-	struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
-	int ret;
-
-	ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
-	if (!ret)
-		ret = ctx.err;
-
-	return ret;
-}
-
-static int collect_cpu_info(int cpu)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	int ret;
-
-	memset(uci, 0, sizeof(*uci));
-
-	ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
-	if (!ret)
-		uci->valid = 1;
-
-	return ret;
-}
-
-struct apply_microcode_ctx {
-	int err;
-};
-
-static void apply_microcode_local(void *arg)
-{
-	struct apply_microcode_ctx *ctx = arg;
-
-	ctx->err = microcode_ops->apply_microcode(smp_processor_id());
-}
-
-static int apply_microcode_on_target(int cpu)
-{
-	struct apply_microcode_ctx ctx = { .err = 0 };
-	int ret;
-
-	ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
-	if (!ret)
-		ret = ctx.err;
-
-	return ret;
-}
-
-#ifdef CONFIG_MICROCODE_OLD_INTERFACE
-static int do_microcode_update(const void __user *buf, size_t size)
-{
-	int error = 0;
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-		enum ucode_state ustate;
-
-		if (!uci->valid)
-			continue;
-
-		ustate = microcode_ops->request_microcode_user(cpu, buf, size);
-		if (ustate == UCODE_ERROR) {
-			error = -1;
-			break;
-		} else if (ustate == UCODE_OK)
-			apply_microcode_on_target(cpu);
-	}
-
-	return error;
-}
-
-static int microcode_open(struct inode *inode, struct file *file)
-{
-	return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
-}
-
-static ssize_t microcode_write(struct file *file, const char __user *buf,
-			       size_t len, loff_t *ppos)
-{
-	ssize_t ret = -EINVAL;
-
-	if ((len >> PAGE_SHIFT) > totalram_pages) {
-		pr_err("too much data (max %ld pages)\n", totalram_pages);
-		return ret;
-	}
-
-	get_online_cpus();
-	mutex_lock(&microcode_mutex);
-
-	if (do_microcode_update(buf, len) == 0)
-		ret = (ssize_t)len;
-
-	if (ret > 0)
-		perf_check_microcode();
-
-	mutex_unlock(&microcode_mutex);
-	put_online_cpus();
-
-	return ret;
-}
-
-static const struct file_operations microcode_fops = {
-	.owner			= THIS_MODULE,
-	.write			= microcode_write,
-	.open			= microcode_open,
-	.llseek		= no_llseek,
-};
-
-static struct miscdevice microcode_dev = {
-	.minor			= MICROCODE_MINOR,
-	.name			= "microcode",
-	.nodename		= "cpu/microcode",
-	.fops			= &microcode_fops,
-};
-
-static int __init microcode_dev_init(void)
-{
-	int error;
-
-	error = misc_register(&microcode_dev);
-	if (error) {
-		pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
-		return error;
-	}
-
-	return 0;
-}
-
-static void __exit microcode_dev_exit(void)
-{
-	misc_deregister(&microcode_dev);
-}
-
-MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
-MODULE_ALIAS("devname:cpu/microcode");
-#else
-#define microcode_dev_init()	0
-#define microcode_dev_exit()	do { } while (0)
-#endif
-
-/* fake device for request_firmware */
-static struct platform_device	*microcode_pdev;
-
-static int reload_for_cpu(int cpu)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	enum ucode_state ustate;
-	int err = 0;
-
-	if (!uci->valid)
-		return err;
-
-	ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true);
-	if (ustate == UCODE_OK)
-		apply_microcode_on_target(cpu);
-	else
-		if (ustate == UCODE_ERROR)
-			err = -EINVAL;
-	return err;
-}
-
-static ssize_t reload_store(struct device *dev,
-			    struct device_attribute *attr,
-			    const char *buf, size_t size)
-{
-	unsigned long val;
-	int cpu;
-	ssize_t ret = 0, tmp_ret;
-
-	ret = kstrtoul(buf, 0, &val);
-	if (ret)
-		return ret;
-
-	if (val != 1)
-		return size;
-
-	get_online_cpus();
-	mutex_lock(&microcode_mutex);
-	for_each_online_cpu(cpu) {
-		tmp_ret = reload_for_cpu(cpu);
-		if (tmp_ret != 0)
-			pr_warn("Error reloading microcode on CPU %d\n", cpu);
-
-		/* save retval of the first encountered reload error */
-		if (!ret)
-			ret = tmp_ret;
-	}
-	if (!ret)
-		perf_check_microcode();
-	mutex_unlock(&microcode_mutex);
-	put_online_cpus();
-
-	if (!ret)
-		ret = size;
-
-	return ret;
-}
-
-static ssize_t version_show(struct device *dev,
-			struct device_attribute *attr, char *buf)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
-	return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
-}
-
-static ssize_t pf_show(struct device *dev,
-			struct device_attribute *attr, char *buf)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
-	return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
-}
-
-static DEVICE_ATTR(reload, 0200, NULL, reload_store);
-static DEVICE_ATTR(version, 0400, version_show, NULL);
-static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
-
-static struct attribute *mc_default_attrs[] = {
-	&dev_attr_version.attr,
-	&dev_attr_processor_flags.attr,
-	NULL
-};
-
-static struct attribute_group mc_attr_group = {
-	.attrs			= mc_default_attrs,
-	.name			= "microcode",
-};
-
-static void microcode_fini_cpu(int cpu)
-{
-	microcode_ops->microcode_fini_cpu(cpu);
-}
-
-static enum ucode_state microcode_resume_cpu(int cpu)
-{
-	pr_debug("CPU%d updated upon resume\n", cpu);
-
-	if (apply_microcode_on_target(cpu))
-		return UCODE_ERROR;
-
-	return UCODE_OK;
-}
-
-static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
-{
-	enum ucode_state ustate;
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-	if (uci && uci->valid)
-		return UCODE_OK;
-
-	if (collect_cpu_info(cpu))
-		return UCODE_ERROR;
-
-	/* --dimm. Trigger a delayed update? */
-	if (system_state != SYSTEM_RUNNING)
-		return UCODE_NFOUND;
-
-	ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev,
-						     refresh_fw);
-
-	if (ustate == UCODE_OK) {
-		pr_debug("CPU%d updated upon init\n", cpu);
-		apply_microcode_on_target(cpu);
-	}
-
-	return ustate;
-}
-
-static enum ucode_state microcode_update_cpu(int cpu)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-	if (uci->valid)
-		return microcode_resume_cpu(cpu);
-
-	return microcode_init_cpu(cpu, false);
-}
-
-static int mc_device_add(struct device *dev, struct subsys_interface *sif)
-{
-	int err, cpu = dev->id;
-
-	if (!cpu_online(cpu))
-		return 0;
-
-	pr_debug("CPU%d added\n", cpu);
-
-	err = sysfs_create_group(&dev->kobj, &mc_attr_group);
-	if (err)
-		return err;
-
-	if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
-		return -EINVAL;
-
-	return err;
-}
-
-static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
-{
-	int cpu = dev->id;
-
-	if (!cpu_online(cpu))
-		return 0;
-
-	pr_debug("CPU%d removed\n", cpu);
-	microcode_fini_cpu(cpu);
-	sysfs_remove_group(&dev->kobj, &mc_attr_group);
-	return 0;
-}
-
-static struct subsys_interface mc_cpu_interface = {
-	.name			= "microcode",
-	.subsys			= &cpu_subsys,
-	.add_dev		= mc_device_add,
-	.remove_dev		= mc_device_remove,
-};
-
-/**
- * mc_bp_resume - Update boot CPU microcode during resume.
- */
-static void mc_bp_resume(void)
-{
-	int cpu = smp_processor_id();
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-	if (uci->valid && uci->mc)
-		microcode_ops->apply_microcode(cpu);
-}
-
-static struct syscore_ops mc_syscore_ops = {
-	.resume			= mc_bp_resume,
-};
-
-static int
-mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
-{
-	unsigned int cpu = (unsigned long)hcpu;
-	struct device *dev;
-
-	dev = get_cpu_device(cpu);
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_ONLINE:
-		microcode_update_cpu(cpu);
-		pr_debug("CPU%d added\n", cpu);
-		/*
-		 * "break" is missing on purpose here because we want to fall
-		 * through in order to create the sysfs group.
-		 */
-
-	case CPU_DOWN_FAILED:
-		if (sysfs_create_group(&dev->kobj, &mc_attr_group))
-			pr_err("Failed to create group for CPU%d\n", cpu);
-		break;
-
-	case CPU_DOWN_PREPARE:
-		/* Suspend is in progress, only remove the interface */
-		sysfs_remove_group(&dev->kobj, &mc_attr_group);
-		pr_debug("CPU%d removed\n", cpu);
-		break;
-
-	/*
-	 * case CPU_DEAD:
-	 *
-	 * When a CPU goes offline, don't free up or invalidate the copy of
-	 * the microcode in kernel memory, so that we can reuse it when the
-	 * CPU comes back online without unnecessarily requesting the userspace
-	 * for it again.
-	 */
-	}
-
-	/* The CPU refused to come up during a system resume */
-	if (action == CPU_UP_CANCELED_FROZEN)
-		microcode_fini_cpu(cpu);
-
-	return NOTIFY_OK;
-}
-
-static struct notifier_block __refdata mc_cpu_notifier = {
-	.notifier_call	= mc_cpu_callback,
-};
-
-#ifdef MODULE
-/* Autoload on Intel and AMD systems */
-static const struct x86_cpu_id __initconst microcode_id[] = {
-#ifdef CONFIG_MICROCODE_INTEL
-	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
-#endif
-#ifdef CONFIG_MICROCODE_AMD
-	{ X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, },
-#endif
-	{}
-};
-MODULE_DEVICE_TABLE(x86cpu, microcode_id);
-#endif
-
-static struct attribute *cpu_root_microcode_attrs[] = {
-	&dev_attr_reload.attr,
-	NULL
-};
-
-static struct attribute_group cpu_root_microcode_group = {
-	.name  = "microcode",
-	.attrs = cpu_root_microcode_attrs,
-};
-
-static int __init microcode_init(void)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-	int error;
-
-	if (c->x86_vendor == X86_VENDOR_INTEL)
-		microcode_ops = init_intel_microcode();
-	else if (c->x86_vendor == X86_VENDOR_AMD)
-		microcode_ops = init_amd_microcode();
-	else
-		pr_err("no support for this CPU vendor\n");
-
-	if (!microcode_ops)
-		return -ENODEV;
-
-	microcode_pdev = platform_device_register_simple("microcode", -1,
-							 NULL, 0);
-	if (IS_ERR(microcode_pdev))
-		return PTR_ERR(microcode_pdev);
-
-	get_online_cpus();
-	mutex_lock(&microcode_mutex);
-
-	error = subsys_interface_register(&mc_cpu_interface);
-	if (!error)
-		perf_check_microcode();
-	mutex_unlock(&microcode_mutex);
-	put_online_cpus();
-
-	if (error)
-		goto out_pdev;
-
-	error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
-				   &cpu_root_microcode_group);
-
-	if (error) {
-		pr_err("Error creating microcode group!\n");
-		goto out_driver;
-	}
-
-	error = microcode_dev_init();
-	if (error)
-		goto out_ucode_group;
-
-	register_syscore_ops(&mc_syscore_ops);
-	register_hotcpu_notifier(&mc_cpu_notifier);
-
-	pr_info("Microcode Update Driver: v" MICROCODE_VERSION
-		" <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
-
-	return 0;
-
- out_ucode_group:
-	sysfs_remove_group(&cpu_subsys.dev_root->kobj,
-			   &cpu_root_microcode_group);
-
- out_driver:
-	get_online_cpus();
-	mutex_lock(&microcode_mutex);
-
-	subsys_interface_unregister(&mc_cpu_interface);
-
-	mutex_unlock(&microcode_mutex);
-	put_online_cpus();
-
- out_pdev:
-	platform_device_unregister(microcode_pdev);
-	return error;
-
-}
-module_init(microcode_init);
-
-static void __exit microcode_exit(void)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-
-	microcode_dev_exit();
-
-	unregister_hotcpu_notifier(&mc_cpu_notifier);
-	unregister_syscore_ops(&mc_syscore_ops);
-
-	sysfs_remove_group(&cpu_subsys.dev_root->kobj,
-			   &cpu_root_microcode_group);
-
-	get_online_cpus();
-	mutex_lock(&microcode_mutex);
-
-	subsys_interface_unregister(&mc_cpu_interface);
-
-	mutex_unlock(&microcode_mutex);
-	put_online_cpus();
-
-	platform_device_unregister(microcode_pdev);
-
-	microcode_ops = NULL;
-
-	if (c->x86_vendor == X86_VENDOR_AMD)
-		exit_amd_microcode();
-
-	pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
-}
-module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/microcode_core_early.c
deleted file mode 100644
index be7f851..0000000
--- a/arch/x86/kernel/microcode_core_early.c
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- *	X86 CPU microcode early update for Linux
- *
- *	Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
- *			   H Peter Anvin" <hpa@zytor.com>
- *
- *	This driver allows to early upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II,
- *	Pentium III, Xeon, Pentium 4, etc.
- *
- *	Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
- *	Software Developer's Manual.
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- */
-#include <linux/module.h>
-#include <asm/microcode_intel.h>
-#include <asm/microcode_amd.h>
-#include <asm/processor.h>
-
-#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
-#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
-#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
-#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
-#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
-#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
-#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
-
-#define CPUID_IS(a, b, c, ebx, ecx, edx)	\
-		(!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
-
-/*
- * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
- * x86_vendor() gets vendor id for BSP.
- *
- * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
- * coding, we still use x86_vendor() to get vendor id for AP.
- *
- * x86_vendor() gets vendor information directly through cpuid.
- */
-static int x86_vendor(void)
-{
-	u32 eax = 0x00000000;
-	u32 ebx, ecx = 0, edx;
-
-	native_cpuid(&eax, &ebx, &ecx, &edx);
-
-	if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
-		return X86_VENDOR_INTEL;
-
-	if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
-		return X86_VENDOR_AMD;
-
-	return X86_VENDOR_UNKNOWN;
-}
-
-static int x86_family(void)
-{
-	u32 eax = 0x00000001;
-	u32 ebx, ecx = 0, edx;
-	int x86;
-
-	native_cpuid(&eax, &ebx, &ecx, &edx);
-
-	x86 = (eax >> 8) & 0xf;
-	if (x86 == 15)
-		x86 += (eax >> 20) & 0xff;
-
-	return x86;
-}
-
-void __init load_ucode_bsp(void)
-{
-	int vendor, x86;
-
-	if (!have_cpuid_p())
-		return;
-
-	vendor = x86_vendor();
-	x86 = x86_family();
-
-	switch (vendor) {
-	case X86_VENDOR_INTEL:
-		if (x86 >= 6)
-			load_ucode_intel_bsp();
-		break;
-	case X86_VENDOR_AMD:
-		if (x86 >= 0x10)
-			load_ucode_amd_bsp();
-		break;
-	default:
-		break;
-	}
-}
-
-void load_ucode_ap(void)
-{
-	int vendor, x86;
-
-	if (!have_cpuid_p())
-		return;
-
-	vendor = x86_vendor();
-	x86 = x86_family();
-
-	switch (vendor) {
-	case X86_VENDOR_INTEL:
-		if (x86 >= 6)
-			load_ucode_intel_ap();
-		break;
-	case X86_VENDOR_AMD:
-		if (x86 >= 0x10)
-			load_ucode_amd_ap();
-		break;
-	default:
-		break;
-	}
-}
-
-int __init save_microcode_in_initrd(void)
-{
-	struct cpuinfo_x86 *c = &boot_cpu_data;
-
-	switch (c->x86_vendor) {
-	case X86_VENDOR_INTEL:
-		if (c->x86 >= 6)
-			save_microcode_in_initrd_intel();
-		break;
-	case X86_VENDOR_AMD:
-		if (c->x86 >= 0x10)
-			save_microcode_in_initrd_amd();
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
deleted file mode 100644
index 5fb2ceb..0000000
--- a/arch/x86/kernel/microcode_intel.c
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- *	Intel CPU Microcode Update Driver for Linux
- *
- *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *		      2006	Shaohua Li <shaohua.li@intel.com>
- *
- *	This driver allows to upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II,
- *	Pentium III, Xeon, Pentium 4, etc.
- *
- *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- *	Software Developer's Manual
- *	Order Number 253668 or free download from:
- *
- *	http://developer.intel.com/Assets/PDF/manual/253668.pdf	
- *
- *	For more information, go to http://www.urbanmyth.org/microcode
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- *	1.0	16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Initial release.
- *	1.01	18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added read() support + cleanups.
- *	1.02	21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added 'device trimming' support. open(O_WRONLY) zeroes
- *		and frees the saved copy of applied microcode.
- *	1.03	29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Made to use devfs (/dev/cpu/microcode) + cleanups.
- *	1.04	06 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Added misc device support (now uses both devfs and misc).
- *		Added MICROCODE_IOCFREE ioctl to clear memory.
- *	1.05	09 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Messages for error cases (non Intel & no suitable microcode).
- *	1.06	03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- *		Removed ->release(). Removed exclusive open and status bitmap.
- *		Added microcode_rwsem to serialize read()/write()/ioctl().
- *		Removed global kernel lock usage.
- *	1.07	07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- *		Write 0 to 0x8B msr and then cpuid before reading revision,
- *		so that it works even if there were no update done by the
- *		BIOS. Otherwise, reading from 0x8B gives junk (which happened
- *		to be 0 on my machine which is why it worked even when I
- *		disabled update by the BIOS)
- *		Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- *	1.08	11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- *			     Tigran Aivazian <tigran@veritas.com>
- *		Intel Pentium 4 processor support and bugfixes.
- *	1.09	30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- *		Bugfix for HT (Hyper-Threading) enabled processors
- *		whereby processor resources are shared by all logical processors
- *		in a single CPU package.
- *	1.10	28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- *		Tigran Aivazian <tigran@veritas.com>,
- *		Serialize updates as required on HT processors due to
- *		speculative nature of implementation.
- *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- *		Fix the panic when writing zero-length microcode chunk.
- *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- *		Jun Nakajima <jun.nakajima@intel.com>
- *		Support for the microcode updates in the new format.
- *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- *		because we no longer hold a copy of applied microcode
- *		in kernel memory.
- *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- *		Fix sigmatch() macro to handle old CPUs with pf == 0.
- *		Thanks to Stuart Swales for pointing out this bug.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/firmware.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-
-#include <asm/microcode_intel.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-
-MODULE_DESCRIPTION("Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
-
-static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
-{
-	struct cpuinfo_x86 *c = &cpu_data(cpu_num);
-	unsigned int val[2];
-
-	memset(csig, 0, sizeof(*csig));
-
-	csig->sig = cpuid_eax(0x00000001);
-
-	if ((c->x86_model >= 5) || (c->x86 > 6)) {
-		/* get processor flags from MSR 0x17 */
-		rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
-		csig->pf = 1 << ((val[1] >> 18) & 7);
-	}
-
-	csig->rev = c->microcode;
-	pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
-		cpu_num, csig->sig, csig->pf, csig->rev);
-
-	return 0;
-}
-
-/*
- * return 0 - no update found
- * return 1 - found update
- */
-static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
-{
-	struct cpu_signature cpu_sig;
-	unsigned int csig, cpf, crev;
-
-	collect_cpu_info(cpu, &cpu_sig);
-
-	csig = cpu_sig.sig;
-	cpf = cpu_sig.pf;
-	crev = cpu_sig.rev;
-
-	return get_matching_microcode(csig, cpf, mc_intel, crev);
-}
-
-int apply_microcode(int cpu)
-{
-	struct microcode_intel *mc_intel;
-	struct ucode_cpu_info *uci;
-	unsigned int val[2];
-	int cpu_num = raw_smp_processor_id();
-	struct cpuinfo_x86 *c = &cpu_data(cpu_num);
-
-	uci = ucode_cpu_info + cpu;
-	mc_intel = uci->mc;
-
-	/* We should bind the task to the CPU */
-	BUG_ON(cpu_num != cpu);
-
-	if (mc_intel == NULL)
-		return 0;
-
-	/*
-	 * Microcode on this CPU could be updated earlier. Only apply the
-	 * microcode patch in mc_intel when it is newer than the one on this
-	 * CPU.
-	 */
-	if (get_matching_mc(mc_intel, cpu) == 0)
-		return 0;
-
-	/* write microcode via MSR 0x79 */
-	wrmsr(MSR_IA32_UCODE_WRITE,
-	      (unsigned long) mc_intel->bits,
-	      (unsigned long) mc_intel->bits >> 16 >> 16);
-	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
-	/* As documented in the SDM: Do a CPUID 1 here */
-	sync_core();
-
-	/* get the current revision from MSR 0x8B */
-	rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
-	if (val[1] != mc_intel->hdr.rev) {
-		pr_err("CPU%d update to revision 0x%x failed\n",
-		       cpu_num, mc_intel->hdr.rev);
-		return -1;
-	}
-	pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
-		cpu_num, val[1],
-		mc_intel->hdr.date & 0xffff,
-		mc_intel->hdr.date >> 24,
-		(mc_intel->hdr.date >> 16) & 0xff);
-
-	uci->cpu_sig.rev = val[1];
-	c->microcode = val[1];
-
-	return 0;
-}
-
-static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
-				int (*get_ucode_data)(void *, const void *, size_t))
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
-	int new_rev = uci->cpu_sig.rev;
-	unsigned int leftover = size;
-	enum ucode_state state = UCODE_OK;
-	unsigned int curr_mc_size = 0;
-	unsigned int csig, cpf;
-
-	while (leftover) {
-		struct microcode_header_intel mc_header;
-		unsigned int mc_size;
-
-		if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
-			break;
-
-		mc_size = get_totalsize(&mc_header);
-		if (!mc_size || mc_size > leftover) {
-			pr_err("error! Bad data in microcode data file\n");
-			break;
-		}
-
-		/* For performance reasons, reuse mc area when possible */
-		if (!mc || mc_size > curr_mc_size) {
-			vfree(mc);
-			mc = vmalloc(mc_size);
-			if (!mc)
-				break;
-			curr_mc_size = mc_size;
-		}
-
-		if (get_ucode_data(mc, ucode_ptr, mc_size) ||
-		    microcode_sanity_check(mc, 1) < 0) {
-			break;
-		}
-
-		csig = uci->cpu_sig.sig;
-		cpf = uci->cpu_sig.pf;
-		if (get_matching_microcode(csig, cpf, mc, new_rev)) {
-			vfree(new_mc);
-			new_rev = mc_header.rev;
-			new_mc  = mc;
-			mc = NULL;	/* trigger new vmalloc */
-		}
-
-		ucode_ptr += mc_size;
-		leftover  -= mc_size;
-	}
-
-	vfree(mc);
-
-	if (leftover) {
-		vfree(new_mc);
-		state = UCODE_ERROR;
-		goto out;
-	}
-
-	if (!new_mc) {
-		state = UCODE_NFOUND;
-		goto out;
-	}
-
-	vfree(uci->mc);
-	uci->mc = (struct microcode_intel *)new_mc;
-
-	/*
-	 * If early loading microcode is supported, save this mc into
-	 * permanent memory. So it will be loaded early when a CPU is hot added
-	 * or resumes.
-	 */
-	save_mc_for_early(new_mc);
-
-	pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
-		 cpu, new_rev, uci->cpu_sig.rev);
-out:
-	return state;
-}
-
-static int get_ucode_fw(void *to, const void *from, size_t n)
-{
-	memcpy(to, from, n);
-	return 0;
-}
-
-static enum ucode_state request_microcode_fw(int cpu, struct device *device,
-					     bool refresh_fw)
-{
-	char name[30];
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-	const struct firmware *firmware;
-	enum ucode_state ret;
-
-	sprintf(name, "intel-ucode/%02x-%02x-%02x",
-		c->x86, c->x86_model, c->x86_mask);
-
-	if (request_firmware(&firmware, name, device)) {
-		pr_debug("data file %s load failed\n", name);
-		return UCODE_NFOUND;
-	}
-
-	ret = generic_load_microcode(cpu, (void *)firmware->data,
-				     firmware->size, &get_ucode_fw);
-
-	release_firmware(firmware);
-
-	return ret;
-}
-
-static int get_ucode_user(void *to, const void *from, size_t n)
-{
-	return copy_from_user(to, from, n);
-}
-
-static enum ucode_state
-request_microcode_user(int cpu, const void __user *buf, size_t size)
-{
-	return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
-}
-
-static void microcode_fini_cpu(int cpu)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-	vfree(uci->mc);
-	uci->mc = NULL;
-}
-
-static struct microcode_ops microcode_intel_ops = {
-	.request_microcode_user		  = request_microcode_user,
-	.request_microcode_fw             = request_microcode_fw,
-	.collect_cpu_info                 = collect_cpu_info,
-	.apply_microcode                  = apply_microcode,
-	.microcode_fini_cpu               = microcode_fini_cpu,
-};
-
-struct microcode_ops * __init init_intel_microcode(void)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-
-	if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
-	    cpu_has(c, X86_FEATURE_IA64)) {
-		pr_err("Intel CPU family 0x%x not supported\n", c->x86);
-		return NULL;
-	}
-
-	return &microcode_intel_ops;
-}
-
diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c
deleted file mode 100644
index 18f7391..0000000
--- a/arch/x86/kernel/microcode_intel_early.c
+++ /dev/null
@@ -1,787 +0,0 @@
-/*
- *	Intel CPU microcode early update for Linux
- *
- *	Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
- *			   H Peter Anvin" <hpa@zytor.com>
- *
- *	This allows to early upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II,
- *	Pentium III, Xeon, Pentium 4, etc.
- *
- *	Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
- *	Software Developer's Manual.
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- */
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/earlycpio.h>
-#include <linux/initrd.h>
-#include <linux/cpu.h>
-#include <asm/msr.h>
-#include <asm/microcode_intel.h>
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-#include <asm/setup.h>
-
-unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
-struct mc_saved_data {
-	unsigned int mc_saved_count;
-	struct microcode_intel **mc_saved;
-} mc_saved_data;
-
-static enum ucode_state
-generic_load_microcode_early(struct microcode_intel **mc_saved_p,
-			     unsigned int mc_saved_count,
-			     struct ucode_cpu_info *uci)
-{
-	struct microcode_intel *ucode_ptr, *new_mc = NULL;
-	int new_rev = uci->cpu_sig.rev;
-	enum ucode_state state = UCODE_OK;
-	unsigned int mc_size;
-	struct microcode_header_intel *mc_header;
-	unsigned int csig = uci->cpu_sig.sig;
-	unsigned int cpf = uci->cpu_sig.pf;
-	int i;
-
-	for (i = 0; i < mc_saved_count; i++) {
-		ucode_ptr = mc_saved_p[i];
-
-		mc_header = (struct microcode_header_intel *)ucode_ptr;
-		mc_size = get_totalsize(mc_header);
-		if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
-			new_rev = mc_header->rev;
-			new_mc  = ucode_ptr;
-		}
-	}
-
-	if (!new_mc) {
-		state = UCODE_NFOUND;
-		goto out;
-	}
-
-	uci->mc = (struct microcode_intel *)new_mc;
-out:
-	return state;
-}
-
-static void
-microcode_pointer(struct microcode_intel **mc_saved,
-		  unsigned long *mc_saved_in_initrd,
-		  unsigned long initrd_start, int mc_saved_count)
-{
-	int i;
-
-	for (i = 0; i < mc_saved_count; i++)
-		mc_saved[i] = (struct microcode_intel *)
-			      (mc_saved_in_initrd[i] + initrd_start);
-}
-
-#ifdef CONFIG_X86_32
-static void
-microcode_phys(struct microcode_intel **mc_saved_tmp,
-	       struct mc_saved_data *mc_saved_data)
-{
-	int i;
-	struct microcode_intel ***mc_saved;
-
-	mc_saved = (struct microcode_intel ***)
-		   __pa_nodebug(&mc_saved_data->mc_saved);
-	for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
-		struct microcode_intel *p;
-
-		p = *(struct microcode_intel **)
-			__pa_nodebug(mc_saved_data->mc_saved + i);
-		mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
-	}
-}
-#endif
-
-static enum ucode_state
-load_microcode(struct mc_saved_data *mc_saved_data,
-	       unsigned long *mc_saved_in_initrd,
-	       unsigned long initrd_start,
-	       struct ucode_cpu_info *uci)
-{
-	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
-	unsigned int count = mc_saved_data->mc_saved_count;
-
-	if (!mc_saved_data->mc_saved) {
-		microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
-				  initrd_start, count);
-
-		return generic_load_microcode_early(mc_saved_tmp, count, uci);
-	} else {
-#ifdef CONFIG_X86_32
-		microcode_phys(mc_saved_tmp, mc_saved_data);
-		return generic_load_microcode_early(mc_saved_tmp, count, uci);
-#else
-		return generic_load_microcode_early(mc_saved_data->mc_saved,
-						    count, uci);
-#endif
-	}
-}
-
-static u8 get_x86_family(unsigned long sig)
-{
-	u8 x86;
-
-	x86 = (sig >> 8) & 0xf;
-
-	if (x86 == 0xf)
-		x86 += (sig >> 20) & 0xff;
-
-	return x86;
-}
-
-static u8 get_x86_model(unsigned long sig)
-{
-	u8 x86, x86_model;
-
-	x86 = get_x86_family(sig);
-	x86_model = (sig >> 4) & 0xf;
-
-	if (x86 == 0x6 || x86 == 0xf)
-		x86_model += ((sig >> 16) & 0xf) << 4;
-
-	return x86_model;
-}
-
-/*
- * Given CPU signature and a microcode patch, this function finds if the
- * microcode patch has matching family and model with the CPU.
- */
-static enum ucode_state
-matching_model_microcode(struct microcode_header_intel *mc_header,
-			unsigned long sig)
-{
-	u8 x86, x86_model;
-	u8 x86_ucode, x86_model_ucode;
-	struct extended_sigtable *ext_header;
-	unsigned long total_size = get_totalsize(mc_header);
-	unsigned long data_size = get_datasize(mc_header);
-	int ext_sigcount, i;
-	struct extended_signature *ext_sig;
-
-	x86 = get_x86_family(sig);
-	x86_model = get_x86_model(sig);
-
-	x86_ucode = get_x86_family(mc_header->sig);
-	x86_model_ucode = get_x86_model(mc_header->sig);
-
-	if (x86 == x86_ucode && x86_model == x86_model_ucode)
-		return UCODE_OK;
-
-	/* Look for ext. headers: */
-	if (total_size <= data_size + MC_HEADER_SIZE)
-		return UCODE_NFOUND;
-
-	ext_header = (struct extended_sigtable *)
-		     mc_header + data_size + MC_HEADER_SIZE;
-	ext_sigcount = ext_header->count;
-	ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
-
-	for (i = 0; i < ext_sigcount; i++) {
-		x86_ucode = get_x86_family(ext_sig->sig);
-		x86_model_ucode = get_x86_model(ext_sig->sig);
-
-		if (x86 == x86_ucode && x86_model == x86_model_ucode)
-			return UCODE_OK;
-
-		ext_sig++;
-	}
-
-	return UCODE_NFOUND;
-}
-
-static int
-save_microcode(struct mc_saved_data *mc_saved_data,
-	       struct microcode_intel **mc_saved_src,
-	       unsigned int mc_saved_count)
-{
-	int i, j;
-	struct microcode_intel **mc_saved_p;
-	int ret;
-
-	if (!mc_saved_count)
-		return -EINVAL;
-
-	/*
-	 * Copy new microcode data.
-	 */
-	mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
-			     GFP_KERNEL);
-	if (!mc_saved_p)
-		return -ENOMEM;
-
-	for (i = 0; i < mc_saved_count; i++) {
-		struct microcode_intel *mc = mc_saved_src[i];
-		struct microcode_header_intel *mc_header = &mc->hdr;
-		unsigned long mc_size = get_totalsize(mc_header);
-		mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
-		if (!mc_saved_p[i]) {
-			ret = -ENOMEM;
-			goto err;
-		}
-		if (!mc_saved_src[i]) {
-			ret = -EINVAL;
-			goto err;
-		}
-		memcpy(mc_saved_p[i], mc, mc_size);
-	}
-
-	/*
-	 * Point to newly saved microcode.
-	 */
-	mc_saved_data->mc_saved = mc_saved_p;
-	mc_saved_data->mc_saved_count = mc_saved_count;
-
-	return 0;
-
-err:
-	for (j = 0; j <= i; j++)
-		kfree(mc_saved_p[j]);
-	kfree(mc_saved_p);
-
-	return ret;
-}
-
-/*
- * A microcode patch in ucode_ptr is saved into mc_saved
- * - if it has matching signature and newer revision compared to an existing
- *   patch mc_saved.
- * - or if it is a newly discovered microcode patch.
- *
- * The microcode patch should have matching model with CPU.
- */
-static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
-		     unsigned int *mc_saved_count_p)
-{
-	int i;
-	int found = 0;
-	unsigned int mc_saved_count = *mc_saved_count_p;
-	struct microcode_header_intel *mc_header;
-
-	mc_header = (struct microcode_header_intel *)ucode_ptr;
-	for (i = 0; i < mc_saved_count; i++) {
-		unsigned int sig, pf;
-		unsigned int new_rev;
-		struct microcode_header_intel *mc_saved_header =
-			     (struct microcode_header_intel *)mc_saved[i];
-		sig = mc_saved_header->sig;
-		pf = mc_saved_header->pf;
-		new_rev = mc_header->rev;
-
-		if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
-			found = 1;
-			if (update_match_revision(mc_header, new_rev)) {
-				/*
-				 * Found an older ucode saved before.
-				 * Replace the older one with this newer
-				 * one.
-				 */
-				mc_saved[i] =
-					(struct microcode_intel *)ucode_ptr;
-				break;
-			}
-		}
-	}
-	if (i >= mc_saved_count && !found)
-		/*
-		 * This ucode is first time discovered in ucode file.
-		 * Save it to memory.
-		 */
-		mc_saved[mc_saved_count++] =
-				 (struct microcode_intel *)ucode_ptr;
-
-	*mc_saved_count_p = mc_saved_count;
-}
-
-/*
- * Get microcode matching with BSP's model. Only CPUs with the same model as
- * BSP can stay in the platform.
- */
-static enum ucode_state __init
-get_matching_model_microcode(int cpu, unsigned long start,
-			     void *data, size_t size,
-			     struct mc_saved_data *mc_saved_data,
-			     unsigned long *mc_saved_in_initrd,
-			     struct ucode_cpu_info *uci)
-{
-	u8 *ucode_ptr = data;
-	unsigned int leftover = size;
-	enum ucode_state state = UCODE_OK;
-	unsigned int mc_size;
-	struct microcode_header_intel *mc_header;
-	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
-	unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
-	int i;
-
-	while (leftover) {
-		mc_header = (struct microcode_header_intel *)ucode_ptr;
-
-		mc_size = get_totalsize(mc_header);
-		if (!mc_size || mc_size > leftover ||
-			microcode_sanity_check(ucode_ptr, 0) < 0)
-			break;
-
-		leftover -= mc_size;
-
-		/*
-		 * Since APs with same family and model as the BSP may boot in
-		 * the platform, we need to find and save microcode patches
-		 * with the same family and model as the BSP.
-		 */
-		if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
-			 UCODE_OK) {
-			ucode_ptr += mc_size;
-			continue;
-		}
-
-		_save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
-
-		ucode_ptr += mc_size;
-	}
-
-	if (leftover) {
-		state = UCODE_ERROR;
-		goto out;
-	}
-
-	if (mc_saved_count == 0) {
-		state = UCODE_NFOUND;
-		goto out;
-	}
-
-	for (i = 0; i < mc_saved_count; i++)
-		mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
-
-	mc_saved_data->mc_saved_count = mc_saved_count;
-out:
-	return state;
-}
-
-static int collect_cpu_info_early(struct ucode_cpu_info *uci)
-{
-	unsigned int val[2];
-	u8 x86, x86_model;
-	struct cpu_signature csig;
-	unsigned int eax, ebx, ecx, edx;
-
-	csig.sig = 0;
-	csig.pf = 0;
-	csig.rev = 0;
-
-	memset(uci, 0, sizeof(*uci));
-
-	eax = 0x00000001;
-	ecx = 0;
-	native_cpuid(&eax, &ebx, &ecx, &edx);
-	csig.sig = eax;
-
-	x86 = get_x86_family(csig.sig);
-	x86_model = get_x86_model(csig.sig);
-
-	if ((x86_model >= 5) || (x86 > 6)) {
-		/* get processor flags from MSR 0x17 */
-		native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
-		csig.pf = 1 << ((val[1] >> 18) & 7);
-	}
-	native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
-	/* As documented in the SDM: Do a CPUID 1 here */
-	sync_core();
-
-	/* get the current revision from MSR 0x8B */
-	native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
-	csig.rev = val[1];
-
-	uci->cpu_sig = csig;
-	uci->valid = 1;
-
-	return 0;
-}
-
-#ifdef DEBUG
-static void __ref show_saved_mc(void)
-{
-	int i, j;
-	unsigned int sig, pf, rev, total_size, data_size, date;
-	struct ucode_cpu_info uci;
-
-	if (mc_saved_data.mc_saved_count == 0) {
-		pr_debug("no micorcode data saved.\n");
-		return;
-	}
-	pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
-
-	collect_cpu_info_early(&uci);
-
-	sig = uci.cpu_sig.sig;
-	pf = uci.cpu_sig.pf;
-	rev = uci.cpu_sig.rev;
-	pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
-		 smp_processor_id(), sig, pf, rev);
-
-	for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
-		struct microcode_header_intel *mc_saved_header;
-		struct extended_sigtable *ext_header;
-		int ext_sigcount;
-		struct extended_signature *ext_sig;
-
-		mc_saved_header = (struct microcode_header_intel *)
-				  mc_saved_data.mc_saved[i];
-		sig = mc_saved_header->sig;
-		pf = mc_saved_header->pf;
-		rev = mc_saved_header->rev;
-		total_size = get_totalsize(mc_saved_header);
-		data_size = get_datasize(mc_saved_header);
-		date = mc_saved_header->date;
-
-		pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n",
-			 i, sig, pf, rev, total_size,
-			 date & 0xffff,
-			 date >> 24,
-			 (date >> 16) & 0xff);
-
-		/* Look for ext. headers: */
-		if (total_size <= data_size + MC_HEADER_SIZE)
-			continue;
-
-		ext_header = (struct extended_sigtable *)
-			     mc_saved_header + data_size + MC_HEADER_SIZE;
-		ext_sigcount = ext_header->count;
-		ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
-
-		for (j = 0; j < ext_sigcount; j++) {
-			sig = ext_sig->sig;
-			pf = ext_sig->pf;
-
-			pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
-				 j, sig, pf);
-
-			ext_sig++;
-		}
-
-	}
-}
-#else
-static inline void show_saved_mc(void)
-{
-}
-#endif
-
-#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU)
-static DEFINE_MUTEX(x86_cpu_microcode_mutex);
-/*
- * Save this mc into mc_saved_data. So it will be loaded early when a CPU is
- * hot added or resumes.
- *
- * Please make sure this mc should be a valid microcode patch before calling
- * this function.
- */
-int save_mc_for_early(u8 *mc)
-{
-	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
-	unsigned int mc_saved_count_init;
-	unsigned int mc_saved_count;
-	struct microcode_intel **mc_saved;
-	int ret = 0;
-	int i;
-
-	/*
-	 * Hold hotplug lock so mc_saved_data is not accessed by a CPU in
-	 * hotplug.
-	 */
-	mutex_lock(&x86_cpu_microcode_mutex);
-
-	mc_saved_count_init = mc_saved_data.mc_saved_count;
-	mc_saved_count = mc_saved_data.mc_saved_count;
-	mc_saved = mc_saved_data.mc_saved;
-
-	if (mc_saved && mc_saved_count)
-		memcpy(mc_saved_tmp, mc_saved,
-		       mc_saved_count * sizeof(struct mirocode_intel *));
-	/*
-	 * Save the microcode patch mc in mc_save_tmp structure if it's a newer
-	 * version.
-	 */
-
-	_save_mc(mc_saved_tmp, mc, &mc_saved_count);
-
-	/*
-	 * Save the mc_save_tmp in global mc_saved_data.
-	 */
-	ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
-	if (ret) {
-		pr_err("Cannot save microcode patch.\n");
-		goto out;
-	}
-
-	show_saved_mc();
-
-	/*
-	 * Free old saved microcod data.
-	 */
-	if (mc_saved) {
-		for (i = 0; i < mc_saved_count_init; i++)
-			kfree(mc_saved[i]);
-		kfree(mc_saved);
-	}
-
-out:
-	mutex_unlock(&x86_cpu_microcode_mutex);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(save_mc_for_early);
-#endif
-
-static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
-static __init enum ucode_state
-scan_microcode(unsigned long start, unsigned long end,
-		struct mc_saved_data *mc_saved_data,
-		unsigned long *mc_saved_in_initrd,
-		struct ucode_cpu_info *uci)
-{
-	unsigned int size = end - start + 1;
-	struct cpio_data cd;
-	long offset = 0;
-#ifdef CONFIG_X86_32
-	char *p = (char *)__pa_nodebug(ucode_name);
-#else
-	char *p = ucode_name;
-#endif
-
-	cd.data = NULL;
-	cd.size = 0;
-
-	cd = find_cpio_data(p, (void *)start, size, &offset);
-	if (!cd.data)
-		return UCODE_ERROR;
-
-
-	return get_matching_model_microcode(0, start, cd.data, cd.size,
-					    mc_saved_data, mc_saved_in_initrd,
-					    uci);
-}
-
-/*
- * Print ucode update info.
- */
-static void
-print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
-{
-	int cpu = smp_processor_id();
-
-	pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
-		cpu,
-		uci->cpu_sig.rev,
-		date & 0xffff,
-		date >> 24,
-		(date >> 16) & 0xff);
-}
-
-#ifdef CONFIG_X86_32
-
-static int delay_ucode_info;
-static int current_mc_date;
-
-/*
- * Print early updated ucode info after printk works. This is delayed info dump.
- */
-void show_ucode_info_early(void)
-{
-	struct ucode_cpu_info uci;
-
-	if (delay_ucode_info) {
-		collect_cpu_info_early(&uci);
-		print_ucode_info(&uci, current_mc_date);
-		delay_ucode_info = 0;
-	}
-}
-
-/*
- * At this point, we can not call printk() yet. Keep microcode patch number in
- * mc_saved_data.mc_saved and delay printing microcode info in
- * show_ucode_info_early() until printk() works.
- */
-static void print_ucode(struct ucode_cpu_info *uci)
-{
-	struct microcode_intel *mc_intel;
-	int *delay_ucode_info_p;
-	int *current_mc_date_p;
-
-	mc_intel = uci->mc;
-	if (mc_intel == NULL)
-		return;
-
-	delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
-	current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
-
-	*delay_ucode_info_p = 1;
-	*current_mc_date_p = mc_intel->hdr.date;
-}
-#else
-
-/*
- * Flush global tlb. We only do this in x86_64 where paging has been enabled
- * already and PGE should be enabled as well.
- */
-static inline void flush_tlb_early(void)
-{
-	__native_flush_tlb_global_irq_disabled();
-}
-
-static inline void print_ucode(struct ucode_cpu_info *uci)
-{
-	struct microcode_intel *mc_intel;
-
-	mc_intel = uci->mc;
-	if (mc_intel == NULL)
-		return;
-
-	print_ucode_info(uci, mc_intel->hdr.date);
-}
-#endif
-
-static int apply_microcode_early(struct mc_saved_data *mc_saved_data,
-				 struct ucode_cpu_info *uci)
-{
-	struct microcode_intel *mc_intel;
-	unsigned int val[2];
-
-	mc_intel = uci->mc;
-	if (mc_intel == NULL)
-		return 0;
-
-	/* write microcode via MSR 0x79 */
-	native_wrmsr(MSR_IA32_UCODE_WRITE,
-	      (unsigned long) mc_intel->bits,
-	      (unsigned long) mc_intel->bits >> 16 >> 16);
-	native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
-	/* As documented in the SDM: Do a CPUID 1 here */
-	sync_core();
-
-	/* get the current revision from MSR 0x8B */
-	native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-	if (val[1] != mc_intel->hdr.rev)
-		return -1;
-
-#ifdef CONFIG_X86_64
-	/* Flush global tlb. This is precaution. */
-	flush_tlb_early();
-#endif
-	uci->cpu_sig.rev = val[1];
-
-	print_ucode(uci);
-
-	return 0;
-}
-
-/*
- * This function converts microcode patch offsets previously stored in
- * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
- */
-int __init save_microcode_in_initrd_intel(void)
-{
-	unsigned int count = mc_saved_data.mc_saved_count;
-	struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
-	int ret = 0;
-
-	if (count == 0)
-		return ret;
-
-	microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
-	ret = save_microcode(&mc_saved_data, mc_saved, count);
-	if (ret)
-		pr_err("Cannot save microcode patches from initrd.\n");
-
-	show_saved_mc();
-
-	return ret;
-}
-
-static void __init
-_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
-		      unsigned long *mc_saved_in_initrd,
-		      unsigned long initrd_start_early,
-		      unsigned long initrd_end_early,
-		      struct ucode_cpu_info *uci)
-{
-	collect_cpu_info_early(uci);
-	scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
-		       mc_saved_in_initrd, uci);
-	load_microcode(mc_saved_data, mc_saved_in_initrd,
-		       initrd_start_early, uci);
-	apply_microcode_early(mc_saved_data, uci);
-}
-
-void __init
-load_ucode_intel_bsp(void)
-{
-	u64 ramdisk_image, ramdisk_size;
-	unsigned long initrd_start_early, initrd_end_early;
-	struct ucode_cpu_info uci;
-#ifdef CONFIG_X86_32
-	struct boot_params *boot_params_p;
-
-	boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params);
-	ramdisk_image = boot_params_p->hdr.ramdisk_image;
-	ramdisk_size  = boot_params_p->hdr.ramdisk_size;
-	initrd_start_early = ramdisk_image;
-	initrd_end_early = initrd_start_early + ramdisk_size;
-
-	_load_ucode_intel_bsp(
-		(struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
-		(unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
-		initrd_start_early, initrd_end_early, &uci);
-#else
-	ramdisk_image = boot_params.hdr.ramdisk_image;
-	ramdisk_size  = boot_params.hdr.ramdisk_size;
-	initrd_start_early = ramdisk_image + PAGE_OFFSET;
-	initrd_end_early = initrd_start_early + ramdisk_size;
-
-	_load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
-			      initrd_start_early, initrd_end_early, &uci);
-#endif
-}
-
-void load_ucode_intel_ap(void)
-{
-	struct mc_saved_data *mc_saved_data_p;
-	struct ucode_cpu_info uci;
-	unsigned long *mc_saved_in_initrd_p;
-	unsigned long initrd_start_addr;
-#ifdef CONFIG_X86_32
-	unsigned long *initrd_start_p;
-
-	mc_saved_in_initrd_p =
-		(unsigned long *)__pa_nodebug(mc_saved_in_initrd);
-	mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
-	initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
-	initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
-#else
-	mc_saved_data_p = &mc_saved_data;
-	mc_saved_in_initrd_p = mc_saved_in_initrd;
-	initrd_start_addr = initrd_start;
-#endif
-
-	/*
-	 * If there is no valid ucode previously saved in memory, no need to
-	 * update ucode on this AP.
-	 */
-	if (mc_saved_data_p->mc_saved_count == 0)
-		return;
-
-	collect_cpu_info_early(&uci);
-	load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
-		       initrd_start_addr, &uci);
-	apply_microcode_early(mc_saved_data_p, &uci);
-}
diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/microcode_intel_lib.c
deleted file mode 100644
index ce69320..0000000
--- a/arch/x86/kernel/microcode_intel_lib.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- *	Intel CPU Microcode Update Driver for Linux
- *
- *	Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
- *			   H Peter Anvin" <hpa@zytor.com>
- *
- *	This driver allows to upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II,
- *	Pentium III, Xeon, Pentium 4, etc.
- *
- *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- *	Software Developer's Manual
- *	Order Number 253668 or free download from:
- *
- *	http://developer.intel.com/Assets/PDF/manual/253668.pdf
- *
- *	For more information, go to http://www.urbanmyth.org/microcode
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- */
-#include <linux/firmware.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/microcode_intel.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-
-static inline int
-update_match_cpu(unsigned int csig, unsigned int cpf,
-		 unsigned int sig, unsigned int pf)
-{
-	return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
-}
-
-int
-update_match_revision(struct microcode_header_intel *mc_header, int rev)
-{
-	return (mc_header->rev <= rev) ? 0 : 1;
-}
-
-int microcode_sanity_check(void *mc, int print_err)
-{
-	unsigned long total_size, data_size, ext_table_size;
-	struct microcode_header_intel *mc_header = mc;
-	struct extended_sigtable *ext_header = NULL;
-	int sum, orig_sum, ext_sigcount = 0, i;
-	struct extended_signature *ext_sig;
-
-	total_size = get_totalsize(mc_header);
-	data_size = get_datasize(mc_header);
-
-	if (data_size + MC_HEADER_SIZE > total_size) {
-		if (print_err)
-			pr_err("error! Bad data size in microcode data file\n");
-		return -EINVAL;
-	}
-
-	if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
-		if (print_err)
-			pr_err("error! Unknown microcode update format\n");
-		return -EINVAL;
-	}
-	ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
-	if (ext_table_size) {
-		if ((ext_table_size < EXT_HEADER_SIZE)
-		 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
-			if (print_err)
-				pr_err("error! Small exttable size in microcode data file\n");
-			return -EINVAL;
-		}
-		ext_header = mc + MC_HEADER_SIZE + data_size;
-		if (ext_table_size != exttable_size(ext_header)) {
-			if (print_err)
-				pr_err("error! Bad exttable size in microcode data file\n");
-			return -EFAULT;
-		}
-		ext_sigcount = ext_header->count;
-	}
-
-	/* check extended table checksum */
-	if (ext_table_size) {
-		int ext_table_sum = 0;
-		int *ext_tablep = (int *)ext_header;
-
-		i = ext_table_size / DWSIZE;
-		while (i--)
-			ext_table_sum += ext_tablep[i];
-		if (ext_table_sum) {
-			if (print_err)
-				pr_warn("aborting, bad extended signature table checksum\n");
-			return -EINVAL;
-		}
-	}
-
-	/* calculate the checksum */
-	orig_sum = 0;
-	i = (MC_HEADER_SIZE + data_size) / DWSIZE;
-	while (i--)
-		orig_sum += ((int *)mc)[i];
-	if (orig_sum) {
-		if (print_err)
-			pr_err("aborting, bad checksum\n");
-		return -EINVAL;
-	}
-	if (!ext_table_size)
-		return 0;
-	/* check extended signature checksum */
-	for (i = 0; i < ext_sigcount; i++) {
-		ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
-			  EXT_SIGNATURE_SIZE * i;
-		sum = orig_sum
-			- (mc_header->sig + mc_header->pf + mc_header->cksum)
-			+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
-		if (sum) {
-			if (print_err)
-				pr_err("aborting, bad checksum\n");
-			return -EINVAL;
-		}
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(microcode_sanity_check);
-
-/*
- * return 0 - no update found
- * return 1 - found update
- */
-int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
-{
-	struct microcode_header_intel *mc_header = mc;
-	struct extended_sigtable *ext_header;
-	unsigned long total_size = get_totalsize(mc_header);
-	int ext_sigcount, i;
-	struct extended_signature *ext_sig;
-
-	if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf))
-		return 1;
-
-	/* Look for ext. headers: */
-	if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
-		return 0;
-
-	ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
-	ext_sigcount = ext_header->count;
-	ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
-
-	for (i = 0; i < ext_sigcount; i++) {
-		if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf))
-			return 1;
-		ext_sig++;
-	}
-	return 0;
-}
-
-/*
- * return 0 - no update found
- * return 1 - found update
- */
-int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
-{
-	struct microcode_header_intel *mc_header = mc;
-
-	if (!update_match_revision(mc_header, rev))
-		return 0;
-
-	return get_matching_sig(csig, cpf, mc, rev);
-}
-EXPORT_SYMBOL_GPL(get_matching_microcode);
-- 
cgit v1.1


From 60283df7ac26a4fe2d56631ca2946e04725e7eaf Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Tue, 14 Jan 2014 08:44:47 +0100
Subject: x86/apic: Read Error Status Register correctly

Currently we do a read, a dummy write and a final read to fetch
the error code. The value from the final read is taken.
This is not the recommended way and leads to corrupted/lost ESR
values.

Intel(c) 64 and IA-32 Architectures Software Developer's Manual,
Combined Volumes 1, 2ABC, 3ABC, Section 10.5.3 states:

  Before attempt to read from the ESR, software should first
  write to it. (The value written does not affect the values read
  subsequently; only zero may be written in x2APIC mode.) This
  write clears any previously logged errors and updates the ESR
  with any errors detected since the last write to the ESR.
  This write also rearms the APIC error interrupt triggering
  mechanism.

This patch removes the first read such that we are conform with
the manual.

On my (very old) Pentium MMX SMP system this patch fixes the
issue that APIC errors:

  a) are not always reported and
  b) are reported with false error numbers.

Signed-off-by: Richard Weinberger <richard@nod.at>
Cc: seiji.aguchi@hds.com
Cc: rientjes@google.com
Cc: konrad.wilk@oracle.com
Cc: bp@alien8.de
Cc: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1389685487-20872-1-git-send-email-richard@nod.at
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/apic.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d278736..4ec1dd6 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1968,7 +1968,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
  */
 static inline void __smp_error_interrupt(struct pt_regs *regs)
 {
-	u32 v0, v1;
+	u32 v;
 	u32 i = 0;
 	static const char * const error_interrupt_reason[] = {
 		"Send CS error",		/* APIC Error Bit 0 */
@@ -1982,21 +1982,20 @@ static inline void __smp_error_interrupt(struct pt_regs *regs)
 	};
 
 	/* First tickle the hardware, only then report what went on. -- REW */
-	v0 = apic_read(APIC_ESR);
 	apic_write(APIC_ESR, 0);
-	v1 = apic_read(APIC_ESR);
+	v = apic_read(APIC_ESR);
 	ack_APIC_irq();
 	atomic_inc(&irq_err_count);
 
-	apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
-		    smp_processor_id(), v0 , v1);
+	apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
+		    smp_processor_id(), v);
 
-	v1 = v1 & 0xff;
-	while (v1) {
-		if (v1 & 0x1)
+	v &= 0xff;
+	while (v) {
+		if (v & 0x1)
 			apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
 		i++;
-		v1 >>= 1;
+		v >>= 1;
 	}
 
 	apic_printk(APIC_DEBUG, KERN_CONT "\n");
-- 
cgit v1.1


From 19259943f0954dcd1817f94776376bf51c6a46d5 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Sat, 7 Dec 2013 21:02:36 +0800
Subject: x86, kaslr: Remove unused including <linux/version.h>

Remove including <linux/version.h> that don't need it.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Link: http://lkml.kernel.org/r/CAPgLHd-Fjx1RybjWFAu1vHRfTvhWwMLL3x46BouC5uNxHPjy1A@mail.gmail.com
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/compressed/aslr.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 84be175..90a21f4 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -10,7 +10,6 @@
 #include <linux/uts.h>
 #include <linux/utsname.h>
 #include <generated/utsrelease.h>
-#include <linux/version.h>
 
 /* Simplified build-specific string for starting entropy. */
 static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
-- 
cgit v1.1


From da2b6fb990cf782b18952f534ec7323453bc4fc9 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 10 Dec 2013 12:27:45 -0800
Subject: x86, kaslr: Clarify RANDOMIZE_BASE_MAX_OFFSET

The help text for RANDOMIZE_BASE_MAX_OFFSET was confusing. This has been
clarified, and updated to be an export-only tunable.

Signed-off-by: Kees Cook <keescook@chromium.org>
Link: http://lkml.kernel.org/r/20131210202745.GA2961@www.outflux.net
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/Kconfig | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 596cd9e..5c9e19d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1747,26 +1747,33 @@ config RANDOMIZE_BASE
 	   possible. At best, due to page table layouts, 64-bit can use
 	   9 bits of entropy and 32-bit uses 8 bits.
 
+	   If unsure, say N.
+
 config RANDOMIZE_BASE_MAX_OFFSET
-	hex "Maximum ASLR offset allowed"
+	hex "Maximum kASLR offset allowed" if EXPERT
 	depends on RANDOMIZE_BASE
 	range 0x0 0x20000000 if X86_32
 	default "0x20000000" if X86_32
 	range 0x0 0x40000000 if X86_64
 	default "0x40000000" if X86_64
 	---help---
-	 Determines the maximal offset in bytes that will be applied to the
-	 kernel when Address Space Layout Randomization (ASLR) is active.
-	 Must be less than or equal to the actual physical memory on the
-	 system. This must be a multiple of CONFIG_PHYSICAL_ALIGN.
+	  The lesser of RANDOMIZE_BASE_MAX_OFFSET and available physical
+	  memory is used to determine the maximal offset in bytes that will
+	  be applied to the kernel when kernel Address Space Layout
+	  Randomization (kASLR) is active. This must be a multiple of
+	  PHYSICAL_ALIGN.
+
+	  On 32-bit this is limited to 512MiB by page table layouts. The
+	  default is 512MiB.
 
-	 On 32-bit this is limited to 512MiB.
+	  On 64-bit this is limited by how the kernel fixmap page table is
+	  positioned, so this cannot be larger than 1GiB currently. Without
+	  RANDOMIZE_BASE, there is a 512MiB to 1.5GiB split between kernel
+	  and modules. When RANDOMIZE_BASE_MAX_OFFSET is above 512MiB, the
+	  modules area will shrink to compensate, up to the current maximum
+	  1GiB to 1GiB split. The default is 1GiB.
 
-	 On 64-bit this is limited by how the kernel fixmap page table is
-	 positioned, so this cannot be larger that 1GiB currently. Normally
-	 there is a 512MiB to 1.5GiB split between kernel and modules. When
-	 this is raised above the 512MiB default, the modules area will
-	 shrink to compensate, up to the current maximum 1GiB to 1GiB split.
+	  If unsure, leave at the default value.
 
 # Relocation on x86 needs some additional build support
 config X86_NEED_RELOCS
-- 
cgit v1.1


From d139336700a5f3a560da235e4dfcd286773025d4 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 15 Jan 2014 12:52:15 +0100
Subject: x86, cpu, amd: Fix a shadowed variable situation

Having u32 and struct cpuinfo_x86 * by the same name is not very smart,
although it was ok in this case due to the limited scope of u32 c and it
being used only once in there.

Fix this.

Signed-off-by: Borislav Petkov <bp@suse.de>
Link: http://lkml.kernel.org/r/1389786735-16751-1-git-send-email-bp@alien8.de
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/amd.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 39bc78d..e5647ab 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -789,14 +789,10 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
 	}
 
 	/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
-	if (!((eax >> 16) & mask)) {
-		u32 a, b, c, d;
-
-		cpuid(0x80000005, &a, &b, &c, &d);
-		tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff;
-	} else {
+	if (!((eax >> 16) & mask))
+		tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+	else
 		tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
-	}
 
 	/* a 4M entry uses two 2M entries */
 	tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
-- 
cgit v1.1


From 0dce7cd67fd9055c4a2ff278f8af1431e646d346 Mon Sep 17 00:00:00 2001
From: Andrew Jones <drjones@redhat.com>
Date: Wed, 15 Jan 2014 13:39:59 +0100
Subject: kvm: x86: fix apic_base enable check

Commit e66d2ae7c67bd moved the assignment
vcpu->arch.apic_base = value above a condition with
(vcpu->arch.apic_base ^ value), causing that check
to always fail. Use old_value, vcpu->arch.apic_base's
old value, in the condition instead.

Signed-off-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1673940..775702f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1355,7 +1355,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 	vcpu->arch.apic_base = value;
 
 	/* update jump label if enable bit changes */
-	if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) {
+	if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
 		if (value & MSR_IA32_APICBASE_ENABLE)
 			static_key_slow_dec_deferred(&apic_hw_disabled);
 		else
-- 
cgit v1.1


From 151e0c7de616310f95393d9306903900fcd8b277 Mon Sep 17 00:00:00 2001
From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Date: Wed, 15 Jan 2014 15:44:58 +0900
Subject: x86, apic, kexec: Add disable_cpu_apicid kernel parameter

Add disable_cpu_apicid kernel parameter. To use this kernel parameter,
specify an initial APIC ID of the corresponding CPU you want to
disable.

This is mostly used for the kdump 2nd kernel to disable BSP to wake up
multiple CPUs without causing system reset or hang due to sending INIT
from AP to BSP.

Kdump users first figure out initial APIC ID of the BSP, CPU0 in the
1st kernel, for example from /proc/cpuinfo and then set up this kernel
parameter for the 2nd kernel using the obtained APIC ID.

However, doing this procedure at each boot time manually is awkward,
which should be automatically done by user-land service scripts, for
example, kexec-tools on fedora/RHEL distributions.

This design is more flexible than disabling BSP in kernel boot time
automatically in that in kernel boot time we have no choice but
referring to ACPI/MP table to obtain initial APIC ID for BSP, meaning
that the method is not applicable to the systems without such BIOS
tables.

One assumption behind this design is that users get initial APIC ID of
the BSP in still healthy state and so BSP is uniquely kept in
CPU0. Thus, through the kernel parameter, only one initial APIC ID can
be specified.

In a comparison with disabled_cpu_apicid, we use read_apic_id(), not
boot_cpu_physical_apicid, because on some platforms, the variable is
modified to the apicid reported as BSP through MP table and this
function is executed with the temporarily modified
boot_cpu_physical_apicid. As a result, disabled_cpu_apicid kernel
parameter doesn't work well for apicids of APs.

Fixing the wrong handling of boot_cpu_physical_apicid requires some
reviews and tests beyond some platforms and it could take some
time. The fix here is a kind of workaround to focus on the main topic
of this patch.

Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Link: http://lkml.kernel.org/r/20140115064458.1545.38775.stgit@localhost6.localdomain6
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/apic.c | 49 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 4ec1dd6..e78ab8c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -75,6 +75,13 @@ unsigned int max_physical_apicid;
 physid_mask_t phys_cpu_present_map;
 
 /*
+ * Processor to be disabled specified by kernel parameter
+ * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
+ * avoid undefined behaviour caused by sending INIT from AP to BSP.
+ */
+unsigned int disabled_cpu_apicid = BAD_APICID;
+
+/*
  * Map cpu index to physical APIC ID
  */
 DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
@@ -2114,6 +2121,39 @@ int generic_processor_info(int apicid, int version)
 				phys_cpu_present_map);
 
 	/*
+	 * boot_cpu_physical_apicid is designed to have the apicid
+	 * returned by read_apic_id(), i.e, the apicid of the
+	 * currently booting-up processor. However, on some platforms,
+	 * it is temporarilly modified by the apicid reported as BSP
+	 * through MP table. Concretely:
+	 *
+	 * - arch/x86/kernel/mpparse.c: MP_processor_info()
+	 * - arch/x86/mm/amdtopology.c: amd_numa_init()
+	 * - arch/x86/platform/visws/visws_quirks.c: MP_processor_info()
+	 *
+	 * This function is executed with the modified
+	 * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel
+	 * parameter doesn't work to disable APs on kdump 2nd kernel.
+	 *
+	 * Since fixing handling of boot_cpu_physical_apicid requires
+	 * another discussion and tests on each platform, we leave it
+	 * for now and here we use read_apic_id() directly in this
+	 * function, generic_processor_info().
+	 */
+	if (disabled_cpu_apicid != BAD_APICID &&
+	    disabled_cpu_apicid != read_apic_id() &&
+	    disabled_cpu_apicid == apicid) {
+		int thiscpu = num_processors + disabled_cpus;
+
+		pr_warning("ACPI: Disabling requested cpu."
+			   " Processor %d/0x%x ignored.\n",
+			   thiscpu, apicid);
+
+		disabled_cpus++;
+		return -ENODEV;
+	}
+
+	/*
 	 * If boot cpu has not been detected yet, then only allow upto
 	 * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
 	 */
@@ -2591,3 +2631,12 @@ static int __init lapic_insert_resource(void)
  * that is using request_resource
  */
 late_initcall(lapic_insert_resource);
+
+static int __init apic_set_disabled_cpu_apicid(char *arg)
+{
+	if (!arg || !get_option(&arg, &disabled_cpu_apicid))
+		return -EINVAL;
+
+	return 0;
+}
+early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
-- 
cgit v1.1


From 5b4d1dbc24bb6fd7179ada0f47be34e27e64decb Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 15 Jan 2014 13:02:08 -0800
Subject: x86, apic: Make disabled_cpu_apicid static read_mostly, fix typos

Make disabled_cpu_apicid static and read_mostly, and fix a couple of
typos.

Reported-by: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/20140115182511.GA22737@gmail.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
---
 arch/x86/kernel/apic/apic.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e78ab8c..7f26c9a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -79,7 +79,7 @@ physid_mask_t phys_cpu_present_map;
  * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
  * avoid undefined behaviour caused by sending INIT from AP to BSP.
  */
-unsigned int disabled_cpu_apicid = BAD_APICID;
+static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
 
 /*
  * Map cpu index to physical APIC ID
@@ -2124,7 +2124,7 @@ int generic_processor_info(int apicid, int version)
 	 * boot_cpu_physical_apicid is designed to have the apicid
 	 * returned by read_apic_id(), i.e, the apicid of the
 	 * currently booting-up processor. However, on some platforms,
-	 * it is temporarilly modified by the apicid reported as BSP
+	 * it is temporarily modified by the apicid reported as BSP
 	 * through MP table. Concretely:
 	 *
 	 * - arch/x86/kernel/mpparse.c: MP_processor_info()
@@ -2145,7 +2145,7 @@ int generic_processor_info(int apicid, int version)
 	    disabled_cpu_apicid == apicid) {
 		int thiscpu = num_processors + disabled_cpus;
 
-		pr_warning("ACPI: Disabling requested cpu."
+		pr_warning("APIC: Disabling requested cpu."
 			   " Processor %d/0x%x ignored.\n",
 			   thiscpu, apicid);
 
-- 
cgit v1.1


From ecd6910db979bc40ac19f0e71e027132fc906068 Mon Sep 17 00:00:00 2001
From: David Cohen <david.a.cohen@linux.intel.com>
Date: Mon, 16 Dec 2013 12:07:36 -0800
Subject: x86, intel-mid: Move Medfield code out of intel-mid.c core file

In order make the driver more portable and support other Intel MID
(Mobile Internet Device) platforms we need to move Medfield code from
intel-mid.c core to its own mfld.c file.

This patch contains no functional changes.

Signed-off-by: David Cohen <david.a.cohen@linux.intel.com>
Link: http://lkml.kernel.org/r/1387224459-25746-2-git-send-email-david.a.cohen@linux.intel.com
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/platform/intel-mid/Makefile               |  4 +-
 arch/x86/platform/intel-mid/intel-mid.c            | 37 +---------------
 arch/x86/platform/intel-mid/intel_mid_weak_decls.h | 15 +++++++
 arch/x86/platform/intel-mid/mfld.c                 | 51 ++++++++++++++++++++++
 4 files changed, 70 insertions(+), 37 deletions(-)
 create mode 100644 arch/x86/platform/intel-mid/intel_mid_weak_decls.h
 create mode 100644 arch/x86/platform/intel-mid/mfld.c

(limited to 'arch/x86')

diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile
index 01cc29e..78a14ba 100644
--- a/arch/x86/platform/intel-mid/Makefile
+++ b/arch/x86/platform/intel-mid/Makefile
@@ -1,6 +1,6 @@
-obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o
-obj-$(CONFIG_X86_INTEL_MID) += intel_mid_vrtc.o
+obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o
 obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o
+
 # SFI specific code
 ifdef CONFIG_X86_INTEL_MID
 obj-$(CONFIG_SFI) += sfi.o device_libs/
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index f90e290..527d6d5 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -35,6 +35,8 @@
 #include <asm/apb_timer.h>
 #include <asm/reboot.h>
 
+#include "intel_mid_weak_decls.h"
+
 /*
  * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
  * cmdline option x86_intel_mid_timer can be used to override the configuration
@@ -61,46 +63,11 @@ enum intel_mid_timer_options intel_mid_timer_options;
 enum intel_mid_cpu_type __intel_mid_cpu_chip;
 EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip);
 
-static void intel_mid_power_off(void)
-{
-}
-
 static void intel_mid_reboot(void)
 {
 	intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0);
 }
 
-static unsigned long __init intel_mid_calibrate_tsc(void)
-{
-	unsigned long fast_calibrate;
-	u32 lo, hi, ratio, fsb;
-
-	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
-	pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi);
-	ratio = (hi >> 8) & 0x1f;
-	pr_debug("ratio is %d\n", ratio);
-	if (!ratio) {
-		pr_err("read a zero ratio, should be incorrect!\n");
-		pr_err("force tsc ratio to 16 ...\n");
-		ratio = 16;
-	}
-	rdmsr(MSR_FSB_FREQ, lo, hi);
-	if ((lo & 0x7) == 0x7)
-		fsb = PENWELL_FSB_FREQ_83SKU;
-	else
-		fsb = PENWELL_FSB_FREQ_100SKU;
-	fast_calibrate = ratio * fsb;
-	pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
-	lapic_timer_frequency = fsb * 1000 / HZ;
-	/* mark tsc clocksource as reliable */
-	set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
-
-	if (fast_calibrate)
-		return fast_calibrate;
-
-	return 0;
-}
-
 static void __init intel_mid_time_init(void)
 {
 	sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
new file mode 100644
index 0000000..519beb7
--- /dev/null
+++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
@@ -0,0 +1,15 @@
+/*
+ * intel_mid_weak_decls.h: Weak declarations of intel-mid.c
+ *
+ * (C) Copyright 2013 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+
+/* __attribute__((weak)) makes these declarations overridable */
+extern void intel_mid_power_off(void) __attribute__((weak));
+extern unsigned long __init intel_mid_calibrate_tsc(void) __attribute__((weak));
diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c
new file mode 100644
index 0000000..c7ff83c
--- /dev/null
+++ b/arch/x86/platform/intel-mid/mfld.c
@@ -0,0 +1,51 @@
+/*
+ * mfld.c: Intel Medfield platform setup code
+ *
+ * (C) Copyright 2013 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+
+#include <asm/apic.h>
+#include <asm/intel-mid.h>
+#include <asm/intel_mid_vrtc.h>
+
+void intel_mid_power_off(void)
+{
+}
+
+unsigned long __init intel_mid_calibrate_tsc(void)
+{
+	unsigned long fast_calibrate;
+	u32 lo, hi, ratio, fsb;
+
+	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+	pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi);
+	ratio = (hi >> 8) & 0x1f;
+	pr_debug("ratio is %d\n", ratio);
+	if (!ratio) {
+		pr_err("read a zero ratio, should be incorrect!\n");
+		pr_err("force tsc ratio to 16 ...\n");
+		ratio = 16;
+	}
+	rdmsr(MSR_FSB_FREQ, lo, hi);
+	if ((lo & 0x7) == 0x7)
+		fsb = PENWELL_FSB_FREQ_83SKU;
+	else
+		fsb = PENWELL_FSB_FREQ_100SKU;
+	fast_calibrate = ratio * fsb;
+	pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
+	lapic_timer_frequency = fsb * 1000 / HZ;
+	/* mark tsc clocksource as reliable */
+	set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
+
+	if (fast_calibrate)
+		return fast_calibrate;
+
+	return 0;
+}
-- 
cgit v1.1


From 85611e3febe78955a519f5f9eb47b941525c8c76 Mon Sep 17 00:00:00 2001
From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Date: Mon, 16 Dec 2013 12:07:37 -0800
Subject: x86, intel-mid: Add Clovertrail platform support

This patch adds Clovertrail support on intel-mid and makes it more
flexible to support other SoCs.

Signed-off-by: David Cohen <david.a.cohen@linux.intel.com>
Link: http://lkml.kernel.org/r/1387224459-25746-3-git-send-email-david.a.cohen@linux.intel.com
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Fei Yang <fei.yang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/intel-mid.h                   | 46 +++++++++++++++++++++-
 arch/x86/platform/intel-mid/intel-mid.c            | 39 ++++++++++++++++--
 arch/x86/platform/intel-mid/intel_mid_weak_decls.h |  7 +++-
 arch/x86/platform/intel-mid/mfld.c                 | 32 +++++++++++++--
 4 files changed, 113 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index 459769d..f8a8314 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -51,10 +51,39 @@ struct devs_id {
 enum intel_mid_cpu_type {
 	/* 1 was Moorestown */
 	INTEL_MID_CPU_CHIP_PENWELL = 2,
+	INTEL_MID_CPU_CHIP_CLOVERVIEW,
 };
 
 extern enum intel_mid_cpu_type __intel_mid_cpu_chip;
 
+/**
+ * struct intel_mid_ops - Interface between intel-mid & sub archs
+ * @arch_setup: arch_setup function to re-initialize platform
+ *             structures (x86_init, x86_platform_init)
+ *
+ * This structure can be extended if any new interface is required
+ * between intel-mid & its sub arch files.
+ */
+struct intel_mid_ops {
+	void (*arch_setup)(void);
+};
+
+/* Helper API's for INTEL_MID_OPS_INIT */
+#define DECLARE_INTEL_MID_OPS_INIT(cpuname, cpuid)	\
+				[cpuid] = get_##cpuname##_ops
+
+/* Maximum number of CPU ops */
+#define MAX_CPU_OPS(a) (sizeof(a)/sizeof(void *))
+
+/*
+ * For every new cpu addition, a weak get_<cpuname>_ops() function needs be
+ * declared in arch/x86/platform/intel_mid/intel_mid_weak_decls.h.
+ */
+#define INTEL_MID_OPS_INIT {\
+	DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \
+	DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \
+};
+
 #ifdef CONFIG_X86_INTEL_MID
 
 static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void)
@@ -86,8 +115,21 @@ extern enum intel_mid_timer_options intel_mid_timer_options;
  * Penwell uses spread spectrum clock, so the freq number is not exactly
  * the same as reported by MSR based on SDM.
  */
-#define PENWELL_FSB_FREQ_83SKU         83200
-#define PENWELL_FSB_FREQ_100SKU        99840
+#define FSB_FREQ_83SKU	83200
+#define FSB_FREQ_100SKU	99840
+#define FSB_FREQ_133SKU	133000
+
+#define FSB_FREQ_167SKU	167000
+#define FSB_FREQ_200SKU	200000
+#define FSB_FREQ_267SKU	267000
+#define FSB_FREQ_333SKU	333000
+#define FSB_FREQ_400SKU	400000
+
+/* Bus Select SoC Fuse value */
+#define BSEL_SOC_FUSE_MASK	0x7
+#define BSEL_SOC_FUSE_001	0x1 /* FSB 133MHz */
+#define BSEL_SOC_FUSE_101	0x5 /* FSB 100MHz */
+#define BSEL_SOC_FUSE_111	0x7 /* FSB 83MHz */
 
 #define SFI_MTMR_MAX_NUM 8
 #define SFI_MRTC_MAX	8
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index 527d6d5..4095584 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -60,14 +60,27 @@
 
 enum intel_mid_timer_options intel_mid_timer_options;
 
+/* intel_mid_ops to store sub arch ops */
+struct intel_mid_ops *intel_mid_ops;
+/* getter function for sub arch ops*/
+static void *(*get_intel_mid_ops[])(void) = INTEL_MID_OPS_INIT;
 enum intel_mid_cpu_type __intel_mid_cpu_chip;
 EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip);
 
+static void intel_mid_power_off(void)
+{
+};
+
 static void intel_mid_reboot(void)
 {
 	intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0);
 }
 
+static unsigned long __init intel_mid_calibrate_tsc(void)
+{
+	return 0;
+}
+
 static void __init intel_mid_time_init(void)
 {
 	sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
@@ -92,13 +105,33 @@ static void __init intel_mid_time_init(void)
 
 static void intel_mid_arch_setup(void)
 {
-	if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
-		__intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
-	else {
+	if (boot_cpu_data.x86 != 6) {
 		pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n",
 			boot_cpu_data.x86, boot_cpu_data.x86_model);
 		__intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
+		goto out;
+	}
+
+	switch (boot_cpu_data.x86_model) {
+	case 0x35:
+		__intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_CLOVERVIEW;
+		break;
+	case 0x27:
+	default:
+		__intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
+		break;
 	}
+
+	if (__intel_mid_cpu_chip < MAX_CPU_OPS(get_intel_mid_ops))
+		intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip]();
+	else {
+		intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL]();
+		pr_info("ARCH: Uknown SoC, assuming PENWELL!\n");
+	}
+
+out:
+	if (intel_mid_ops->arch_setup)
+		intel_mid_ops->arch_setup();
 }
 
 /* MID systems don't have i8042 controller */
diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
index 519beb7..9ebce044 100644
--- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
+++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
@@ -11,5 +11,8 @@
 
 
 /* __attribute__((weak)) makes these declarations overridable */
-extern void intel_mid_power_off(void) __attribute__((weak));
-extern unsigned long __init intel_mid_calibrate_tsc(void) __attribute__((weak));
+/* For every CPU addition a new get_<cpuname>_ops interface needs
+ * to be added.
+ */
+extern void * __cpuinit get_penwell_ops(void) __attribute__((weak));
+extern void * __cpuinit get_cloverview_ops(void) __attribute__((weak));
diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c
index c7ff83c..4f7884e 100644
--- a/arch/x86/platform/intel-mid/mfld.c
+++ b/arch/x86/platform/intel-mid/mfld.c
@@ -15,11 +15,19 @@
 #include <asm/intel-mid.h>
 #include <asm/intel_mid_vrtc.h>
 
-void intel_mid_power_off(void)
+#include "intel_mid_weak_decls.h"
+
+static void penwell_arch_setup(void);
+/* penwell arch ops */
+static struct intel_mid_ops penwell_ops = {
+	.arch_setup = penwell_arch_setup,
+};
+
+static void mfld_power_off(void)
 {
 }
 
-unsigned long __init intel_mid_calibrate_tsc(void)
+static unsigned long __init mfld_calibrate_tsc(void)
 {
 	unsigned long fast_calibrate;
 	u32 lo, hi, ratio, fsb;
@@ -35,9 +43,9 @@ unsigned long __init intel_mid_calibrate_tsc(void)
 	}
 	rdmsr(MSR_FSB_FREQ, lo, hi);
 	if ((lo & 0x7) == 0x7)
-		fsb = PENWELL_FSB_FREQ_83SKU;
+		fsb = FSB_FREQ_83SKU;
 	else
-		fsb = PENWELL_FSB_FREQ_100SKU;
+		fsb = FSB_FREQ_100SKU;
 	fast_calibrate = ratio * fsb;
 	pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
 	lapic_timer_frequency = fsb * 1000 / HZ;
@@ -49,3 +57,19 @@ unsigned long __init intel_mid_calibrate_tsc(void)
 
 	return 0;
 }
+
+static void __init penwell_arch_setup()
+{
+	x86_platform.calibrate_tsc = mfld_calibrate_tsc;
+	pm_power_off = mfld_power_off;
+}
+
+void * __cpuinit get_penwell_ops()
+{
+	return &penwell_ops;
+}
+
+void * __cpuinit get_cloverview_ops()
+{
+	return &penwell_ops;
+}
-- 
cgit v1.1


From bc20aa48bbb3068224a1c91f8332971fdb689fad Mon Sep 17 00:00:00 2001
From: David Cohen <david.a.cohen@linux.intel.com>
Date: Mon, 16 Dec 2013 12:07:38 -0800
Subject: x86, intel-mid: Add Merrifield platform support

This code was partially based on Mark Brown's previous work.

Signed-off-by: David Cohen <david.a.cohen@linux.intel.com>
Link: http://lkml.kernel.org/r/1387224459-25746-4-git-send-email-david.a.cohen@linux.intel.com
Signed-off-by: Fei Yang <fei.yang@intel.com>
Cc: Mark F. Brown <mark.f.brown@intel.com>
Cc: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/intel-mid.h                   |   2 +
 arch/x86/pci/intel_mid_pci.c                       |   6 +-
 arch/x86/platform/intel-mid/Makefile               |   2 +-
 arch/x86/platform/intel-mid/intel-mid.c            |   4 +
 arch/x86/platform/intel-mid/intel_mid_weak_decls.h |   1 +
 arch/x86/platform/intel-mid/mrfl.c                 | 103 +++++++++++++++++++++
 arch/x86/platform/intel-mid/sfi.c                  |  34 +++++--
 7 files changed, 144 insertions(+), 8 deletions(-)
 create mode 100644 arch/x86/platform/intel-mid/mrfl.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index f8a8314..e34e097 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -52,6 +52,7 @@ enum intel_mid_cpu_type {
 	/* 1 was Moorestown */
 	INTEL_MID_CPU_CHIP_PENWELL = 2,
 	INTEL_MID_CPU_CHIP_CLOVERVIEW,
+	INTEL_MID_CPU_CHIP_TANGIER,
 };
 
 extern enum intel_mid_cpu_type __intel_mid_cpu_chip;
@@ -82,6 +83,7 @@ struct intel_mid_ops {
 #define INTEL_MID_OPS_INIT {\
 	DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \
 	DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \
+	DECLARE_INTEL_MID_OPS_INIT(tangier, INTEL_MID_CPU_CHIP_TANGIER) \
 };
 
 #ifdef CONFIG_X86_INTEL_MID
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 51384ca..84b9d67 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -31,6 +31,7 @@
 #include <asm/pci_x86.h>
 #include <asm/hw_irq.h>
 #include <asm/io_apic.h>
+#include <asm/intel-mid.h>
 
 #define PCIE_CAP_OFFSET	0x100
 
@@ -219,7 +220,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 	irq_attr.ioapic = mp_find_ioapic(dev->irq);
 	irq_attr.ioapic_pin = dev->irq;
 	irq_attr.trigger = 1; /* level */
-	irq_attr.polarity = 1; /* active low */
+	if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
+		irq_attr.polarity = 0; /* active high */
+	else
+		irq_attr.polarity = 1; /* active low */
 	io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr);
 
 	return 0;
diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile
index 78a14ba..0a8ee70 100644
--- a/arch/x86/platform/intel-mid/Makefile
+++ b/arch/x86/platform/intel-mid/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o
+obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfl.o
 obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o
 
 # SFI specific code
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index 4095584..1bbedc4 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -116,6 +116,10 @@ static void intel_mid_arch_setup(void)
 	case 0x35:
 		__intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_CLOVERVIEW;
 		break;
+	case 0x3C:
+	case 0x4A:
+		__intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_TANGIER;
+		break;
 	case 0x27:
 	default:
 		__intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
index 9ebce044..a537ffc 100644
--- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
+++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
@@ -16,3 +16,4 @@
  */
 extern void * __cpuinit get_penwell_ops(void) __attribute__((weak));
 extern void * __cpuinit get_cloverview_ops(void) __attribute__((weak));
+extern void * __init get_tangier_ops(void) __attribute__((weak));
diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfl.c
new file mode 100644
index 0000000..09d1015
--- /dev/null
+++ b/arch/x86/platform/intel-mid/mrfl.c
@@ -0,0 +1,103 @@
+/*
+ * mrfl.c: Intel Merrifield platform specific setup code
+ *
+ * (C) Copyright 2013 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+
+#include <asm/apic.h>
+#include <asm/intel-mid.h>
+
+#include "intel_mid_weak_decls.h"
+
+static unsigned long __init tangier_calibrate_tsc(void)
+{
+	unsigned long fast_calibrate;
+	u32 lo, hi, ratio, fsb, bus_freq;
+
+	/* *********************** */
+	/* Compute TSC:Ratio * FSB */
+	/* *********************** */
+
+	/* Compute Ratio */
+	rdmsr(MSR_PLATFORM_INFO, lo, hi);
+	pr_debug("IA32 PLATFORM_INFO is 0x%x : %x\n", hi, lo);
+
+	ratio = (lo >> 8) & 0xFF;
+	pr_debug("ratio is %d\n", ratio);
+	if (!ratio) {
+		pr_err("Read a zero ratio, force tsc ratio to 4 ...\n");
+		ratio = 4;
+	}
+
+	/* Compute FSB */
+	rdmsr(MSR_FSB_FREQ, lo, hi);
+	pr_debug("Actual FSB frequency detected by SOC 0x%x : %x\n",
+			hi, lo);
+
+	bus_freq = lo & 0x7;
+	pr_debug("bus_freq = 0x%x\n", bus_freq);
+
+	if (bus_freq == 0)
+		fsb = FSB_FREQ_100SKU;
+	else if (bus_freq == 1)
+		fsb = FSB_FREQ_100SKU;
+	else if (bus_freq == 2)
+		fsb = FSB_FREQ_133SKU;
+	else if (bus_freq == 3)
+		fsb = FSB_FREQ_167SKU;
+	else if (bus_freq == 4)
+		fsb = FSB_FREQ_83SKU;
+	else if (bus_freq == 5)
+		fsb = FSB_FREQ_400SKU;
+	else if (bus_freq == 6)
+		fsb = FSB_FREQ_267SKU;
+	else if (bus_freq == 7)
+		fsb = FSB_FREQ_333SKU;
+	else {
+		BUG();
+		pr_err("Invalid bus_freq! Setting to minimal value!\n");
+		fsb = FSB_FREQ_100SKU;
+	}
+
+	/* TSC = FSB Freq * Resolved HFM Ratio */
+	fast_calibrate = ratio * fsb;
+	pr_debug("calculate tangier tsc %lu KHz\n", fast_calibrate);
+
+	/* ************************************ */
+	/* Calculate Local APIC Timer Frequency */
+	/* ************************************ */
+	lapic_timer_frequency = (fsb * 1000) / HZ;
+
+	pr_debug("Setting lapic_timer_frequency = %d\n",
+			lapic_timer_frequency);
+
+	/* mark tsc clocksource as reliable */
+	set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
+
+	if (fast_calibrate)
+		return fast_calibrate;
+
+	return 0;
+}
+
+static void __init tangier_arch_setup(void)
+{
+	x86_platform.calibrate_tsc = tangier_calibrate_tsc;
+}
+
+/* tangier arch ops */
+static struct intel_mid_ops tangier_ops = {
+	.arch_setup = tangier_arch_setup,
+};
+
+void * __cpuinit get_tangier_ops()
+{
+	return &tangier_ops;
+}
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index c84c1ca..80a5228 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -443,13 +443,35 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
 			 * so we have to enable them one by one here
 			 */
 			ioapic = mp_find_ioapic(irq);
-			irq_attr.ioapic = ioapic;
-			irq_attr.ioapic_pin = irq;
-			irq_attr.trigger = 1;
-			irq_attr.polarity = 1;
-			io_apic_set_pci_routing(NULL, irq, &irq_attr);
-		} else
+			if (ioapic >= 0) {
+				irq_attr.ioapic = ioapic;
+				irq_attr.ioapic_pin = irq;
+				irq_attr.trigger = 1;
+				if (intel_mid_identify_cpu() ==
+						INTEL_MID_CPU_CHIP_TANGIER) {
+					if (!strncmp(pentry->name,
+							"r69001-ts-i2c", 13))
+						/* active low */
+						irq_attr.polarity = 1;
+					else if (!strncmp(pentry->name,
+							"synaptics_3202", 14))
+						/* active low */
+						irq_attr.polarity = 1;
+					else if (irq == 41)
+						/* fast_int_1 */
+						irq_attr.polarity = 1;
+					else
+						/* active high */
+						irq_attr.polarity = 0;
+				} else {
+					/* PNW and CLV go with active low */
+					irq_attr.polarity = 1;
+				}
+				io_apic_set_pci_routing(NULL, irq, &irq_attr);
+			}
+		} else {
 			irq = 0; /* No irq */
+		}
 
 		dev = get_device_id(pentry->type, pentry->name);
 
-- 
cgit v1.1


From 4cb9b00f42e07830310319a07e6c91413ee8153e Mon Sep 17 00:00:00 2001
From: David Cohen <david.a.cohen@linux.intel.com>
Date: Mon, 16 Dec 2013 17:37:26 -0800
Subject: x86, intel-mid: Remove deprecated X86_MDFLD and X86_WANT_INTEL_MID
 configs

We want to support all Intel MID (Mobile Internet Device) platforms
with a single config selection. This patch removes deprecated
CONFIG_X86_MDFLD and X86_WANT_INTEL_MID options in favor of having
CONFIG_X86_INTEL_MID only.

Signed-off-by: David Cohen <david.a.cohen@linux.intel.com>
Link: http://lkml.kernel.org/r/1387244246-20714-1-git-send-email-david.a.cohen@linux.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/Kconfig | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0952ecd..edcf80e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -438,42 +438,26 @@ config X86_INTEL_CE
 	  This option compiles in support for the CE4100 SOC for settop
 	  boxes and media devices.
 
-config X86_WANT_INTEL_MID
+config X86_INTEL_MID
 	bool "Intel MID platform support"
 	depends on X86_32
 	depends on X86_EXTENDED_PLATFORM
-	---help---
-	  Select to build a kernel capable of supporting Intel MID platform
-	  systems which do not have the PCI legacy interfaces (Moorestown,
-	  Medfield). If you are building for a PC class system say N here.
-
-if X86_WANT_INTEL_MID
-
-config X86_INTEL_MID
-	bool
-
-config X86_MDFLD
-       bool "Medfield MID platform"
 	depends on PCI
 	depends on PCI_GOANY
 	depends on X86_IO_APIC
-	select X86_INTEL_MID
 	select SFI
+	select I2C
 	select DW_APB_TIMER
 	select APB_TIMER
-	select I2C
-	select SPI
 	select INTEL_SCU_IPC
-	select X86_PLATFORM_DEVICES
 	select MFD_INTEL_MSIC
 	---help---
-	  Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
-	  Internet Device(MID) platform. 
-	  Unlike standard x86 PCs, Medfield does not have many legacy devices
-	  nor standard legacy replacement devices/features. e.g. Medfield does
-	  not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
+	  Select to build a kernel capable of supporting Intel MID (Mobile
+	  Internet Device) platform systems which do not have the PCI legacy
+	  interfaces. If you are building for a PC class system say N here.
 
-endif
+	  Intel MID platforms are based on an Intel processor and chipset which
+	  consume less power than most of the x86 derivatives.
 
 config X86_INTEL_LPSS
 	bool "Intel Low Power Subsystem Support"
-- 
cgit v1.1


From aee636c4809fa54848ff07a899b326eb1f9987a2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 15 Jan 2014 06:50:07 -0800
Subject: bpf: do not use reciprocal divide

At first Jakub Zawadzki noticed that some divisions by reciprocal_divide
were not correct. (off by one in some cases)
http://www.wireshark.org/~darkjames/reciprocal-buggy.c

He could also show this with BPF:
http://www.wireshark.org/~darkjames/set-and-dump-filter-k-bug.c

The reciprocal divide in linux kernel is not generic enough,
lets remove its use in BPF, as it is not worth the pain with
current cpus.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Jakub Zawadzki <darkjames-ws@darkjames.pl>
Cc: Mircea Gherzan <mgherzan@gmail.com>
Cc: Daniel Borkmann <dxchgb@gmail.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Matt Evans <matt@ozlabs.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/net/bpf_jit_comp.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 26328e8..4ed75dd 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -359,15 +359,21 @@ void bpf_jit_compile(struct sk_filter *fp)
 				EMIT2(0x89, 0xd0);	/* mov %edx,%eax */
 				break;
 			case BPF_S_ALU_MOD_K: /* A %= K; */
+				if (K == 1) {
+					CLEAR_A();
+					break;
+				}
 				EMIT2(0x31, 0xd2);	/* xor %edx,%edx */
 				EMIT1(0xb9);EMIT(K, 4);	/* mov imm32,%ecx */
 				EMIT2(0xf7, 0xf1);	/* div %ecx */
 				EMIT2(0x89, 0xd0);	/* mov %edx,%eax */
 				break;
-			case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */
-				EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */
-				EMIT(K, 4);
-				EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */
+			case BPF_S_ALU_DIV_K: /* A /= K */
+				if (K == 1)
+					break;
+				EMIT2(0x31, 0xd2);	/* xor %edx,%edx */
+				EMIT1(0xb9);EMIT(K, 4);	/* mov imm32,%ecx */
+				EMIT2(0xf7, 0xf1);	/* div %ecx */
 				break;
 			case BPF_S_ALU_AND_X:
 				seen |= SEEN_XREG;
-- 
cgit v1.1


From 7da7c1561366ba8adb7275464ab44e84e1faa7e0 Mon Sep 17 00:00:00 2001
From: Bin Gao <bin.gao@intel.com>
Date: Mon, 21 Oct 2013 09:16:33 -0700
Subject: x86, tsc: Add static (MSR) TSC calibration on Intel Atom SoCs

On SoCs that have the calibration MSRs available, either there is no
PIT, HPET or PMTIMER to calibrate against, or the PIT/HPET/PMTIMER is
driven from the same clock as the TSC, so calibration is redundant and
just slows down the boot.

TSC rate is caculated by this formula:
<maximum core-clock to bus-clock ratio> * <maximum resolved frequency>
The ratio and the resolved frequency ID can be obtained from MSR.
See Intel 64 and IA-32 System Programming Guid section 16.12 and 30.11.5
for details.

Signed-off-by: Bin Gao <bin.gao@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/n/tip-rgm7xmg7k6qnjlw3ynkcjsmh@git.kernel.org
---
 arch/x86/include/asm/tsc.h |   3 ++
 arch/x86/kernel/Makefile   |   2 +-
 arch/x86/kernel/tsc.c      |  10 ++++
 arch/x86/kernel/tsc_msr.c  | 125 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/tsc_msr.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 235be70..57ae63c 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -65,4 +65,7 @@ extern int notsc_setup(char *);
 extern void tsc_save_sched_clock_state(void);
 extern void tsc_restore_sched_clock_state(void);
 
+/* MSR based TSC calibration for Intel Atom SoC platforms */
+int try_msr_calibrate_tsc(unsigned long *fast_calibrate);
+
 #endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index dbe9bd6..6dbbb1e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -32,7 +32,7 @@ obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
-obj-y			+= tsc.o io_delay.o rtc.o
+obj-y			+= tsc.o tsc_msr.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 930e5d4..e574716 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -419,6 +419,16 @@ unsigned long native_calibrate_tsc(void)
 	unsigned long flags, latch, ms, fast_calibrate;
 	int hpet = is_hpet_enabled(), i, loopmin;
 
+	/* Calibrate TSC using MSR for Intel Atom SoCs */
+	local_irq_save(flags);
+	i = try_msr_calibrate_tsc(&fast_calibrate);
+	local_irq_restore(flags);
+	if (i >= 0) {
+		if (i == 0)
+			pr_warn("Fast TSC calibration using MSR failed\n");
+		return fast_calibrate;
+	}
+
 	local_irq_save(flags);
 	fast_calibrate = quick_pit_calibrate();
 	local_irq_restore(flags);
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
new file mode 100644
index 0000000..c502772
--- /dev/null
+++ b/arch/x86/kernel/tsc_msr.c
@@ -0,0 +1,125 @@
+/*
+ * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms.
+ *
+ * TSC in Intel Atom SoC runs at a constant rate which can be figured
+ * by this formula:
+ * <maximum core-clock to bus-clock ratio> * <maximum resolved frequency>
+ * See Intel 64 and IA-32 System Programming Guid section 16.12 and 30.11.5
+ * for details.
+ * Especially some Intel Atom SoCs don't have PIT(i8254) or HPET, so MSR
+ * based calibration is the only option.
+ *
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Bin Gao <bin.gao@intel.com>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/kernel.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/param.h>
+
+/* CPU reference clock frequency: in KHz */
+#define FREQ_83		83200
+#define FREQ_100	99840
+#define FREQ_133	133200
+#define FREQ_166	166400
+
+#define MAX_NUM_FREQS	8
+
+/*
+ * According to Intel 64 and IA-32 System Programming Guide,
+ * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
+ * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40].
+ * Unfortunately some Intel Atom SoCs aren't quite compliant to this,
+ * so we need manually differentiate SoC families. This is what the
+ * field msr_plat does.
+ */
+struct freq_desc {
+	u8 x86_family;	/* CPU family */
+	u8 x86_model;	/* model */
+	u8 msr_plat;	/* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */
+	u32 freqs[MAX_NUM_FREQS];
+};
+
+static struct freq_desc freq_desc_tables[] = {
+	/* PNW */
+	{ 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
+	/* CLV+ */
+	{ 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
+	/* TNG */
+	{ 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } },
+	/* VLV2 */
+	{ 6, 0x37, 1, { 0, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } },
+	/* ANN */
+	{ 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } },
+};
+
+static int match_cpu(u8 family, u8 model)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) {
+		if ((family == freq_desc_tables[i].x86_family) &&
+			(model == freq_desc_tables[i].x86_model))
+			return i;
+	}
+
+	return -1;
+}
+
+/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */
+#define id_to_freq(cpu_index, freq_id) \
+	(freq_desc_tables[cpu_index].freqs[freq_id])
+
+/*
+ * Do MSR calibration only for known/supported CPUs.
+ * Return values:
+ * -1: CPU is unknown/unsupported for MSR based calibration
+ *  0: CPU is known/supported, but calibration failed
+ *  1: CPU is known/supported, and calibration succeeded
+ */
+int try_msr_calibrate_tsc(unsigned long *fast_calibrate)
+{
+	int cpu_index;
+	u32 lo, hi, ratio, freq_id, freq;
+
+	cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model);
+	if (cpu_index < 0)
+		return -1;
+
+	*fast_calibrate = 0;
+
+	if (freq_desc_tables[cpu_index].msr_plat) {
+		rdmsr(MSR_PLATFORM_INFO, lo, hi);
+		ratio = (lo >> 8) & 0x1f;
+	} else {
+		rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+		ratio = (hi >> 8) & 0x1f;
+	}
+	pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio);
+
+	if (!ratio)
+		return 0;
+
+	/* Get FSB FREQ ID */
+	rdmsr(MSR_FSB_FREQ, lo, hi);
+	freq_id = lo & 0x7;
+	freq = id_to_freq(cpu_index, freq_id);
+	pr_info("Resolved frequency ID: %u, frequency: %u KHz\n",
+				freq_id, freq);
+	if (!freq)
+		return 0;
+
+	/* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
+	*fast_calibrate = freq * ratio;
+	pr_info("TSC runs at %lu KHz\n", *fast_calibrate);
+
+	lapic_timer_frequency = (freq * 1000) / HZ;
+	pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency);
+
+	return 1;
+}
-- 
cgit v1.1


From c026b3591e4f2a4993df773183704bb31634e0bd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 10 Jan 2014 21:06:03 +0100
Subject: x86, mm, perf: Allow recursive faults from interrupts

Waiman managed to trigger a PMI while in a emulate_vsyscall() fault,
the PMI in turn managed to trigger a fault while obtaining a stack
trace. This triggered the sig_on_uaccess_error recursive fault logic
and killed the process dead.

Fix this by explicitly excluding interrupts from the recursive fault
logic.

Reported-and-Tested-by: Waiman Long <waiman.long@hp.com>
Fixes: e00b12e64be9 ("perf/x86: Further optimize copy_from_user_nmi()")
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20140110200603.GJ7572@laptop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/fault.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9ff85bb..9d591c8 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -641,6 +641,20 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 
 	/* Are we prepared to handle this kernel fault? */
 	if (fixup_exception(regs)) {
+		/*
+		 * Any interrupt that takes a fault gets the fixup. This makes
+		 * the below recursive fault logic only apply to a faults from
+		 * task context.
+		 */
+		if (in_interrupt())
+			return;
+
+		/*
+		 * Per the above we're !in_interrupt(), aka. task context.
+		 *
+		 * In this case we need to make sure we're not recursively
+		 * faulting through the emulate_vsyscall() logic.
+		 */
 		if (current_thread_info()->sig_on_uaccess_error && signal) {
 			tsk->thread.trap_nr = X86_TRAP_PF;
 			tsk->thread.error_code = error_code | PF_USER;
@@ -649,6 +663,10 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 			/* XXX: hwpoison faults will set the wrong code. */
 			force_sig_info_fault(signal, si_code, address, tsk, 0);
 		}
+
+		/*
+		 * Barring that, we can do the fixup and be happy.
+		 */
 		return;
 	}
 
-- 
cgit v1.1


From bee09ed91cacdbffdbcd3b05de8409c77ec9fcd6 Mon Sep 17 00:00:00 2001
From: Robert Richter <rric@kernel.org>
Date: Wed, 15 Jan 2014 15:57:29 +0100
Subject: perf/x86/amd/ibs: Fix waking up from S3 for AMD family 10h

On AMD family 10h we see following error messages while waking up from
S3 for all non-boot CPUs leading to a failed IBS initialization:

 Enabling non-boot CPUs ...
 smpboot: Booting Node 0 Processor 1 APIC 0x1
 [Firmware Bug]: cpu 1, try to use APIC500 (LVT offset 0) for vector 0x400, but the register is already in use for vector 0xf9 on another cpu
 perf: IBS APIC setup failed on cpu #1
 process: Switch to broadcast mode on CPU1
 CPU1 is up
 ...
 ACPI: Waking up from system sleep state S3

Reason for this is that during suspend the LVT offset for the IBS
vector gets lost and needs to be reinialized while resuming.

The offset is read from the IBSCTL msr. On family 10h the offset needs
to be 1 as offset 0 is used for the MCE threshold interrupt, but
firmware assings it for IBS to 0 too. The kernel needs to reprogram
the vector. The msr is a readonly node msr, but a new value can be
written via pci config space access. The reinitialization is
implemented for family 10h in setup_ibs_ctl() which is forced during
IBS setup.

This patch fixes IBS setup after waking up from S3 by adding
resume/supend hooks for the boot cpu which does the offset
reinitialization.

Marking it as stable to let distros pick up this fix.

Signed-off-by: Robert Richter <rric@kernel.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org> v3.2..
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1389797849-5565-1-git-send-email-rric.net@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 53 +++++++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index e09f0bf..4b8e4d3 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/ptrace.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/apic.h>
 
@@ -816,6 +817,18 @@ out:
 	return ret;
 }
 
+static void ibs_eilvt_setup(void)
+{
+	/*
+	 * Force LVT offset assignment for family 10h: The offsets are
+	 * not assigned by the BIOS for this family, so the OS is
+	 * responsible for doing it. If the OS assignment fails, fall
+	 * back to BIOS settings and try to setup this.
+	 */
+	if (boot_cpu_data.x86 == 0x10)
+		force_ibs_eilvt_setup();
+}
+
 static inline int get_ibs_lvt_offset(void)
 {
 	u64 val;
@@ -851,6 +864,36 @@ static void clear_APIC_ibs(void *dummy)
 		setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
 }
 
+#ifdef CONFIG_PM
+
+static int perf_ibs_suspend(void)
+{
+	clear_APIC_ibs(NULL);
+	return 0;
+}
+
+static void perf_ibs_resume(void)
+{
+	ibs_eilvt_setup();
+	setup_APIC_ibs(NULL);
+}
+
+static struct syscore_ops perf_ibs_syscore_ops = {
+	.resume		= perf_ibs_resume,
+	.suspend	= perf_ibs_suspend,
+};
+
+static void perf_ibs_pm_init(void)
+{
+	register_syscore_ops(&perf_ibs_syscore_ops);
+}
+
+#else
+
+static inline void perf_ibs_pm_init(void) { }
+
+#endif
+
 static int
 perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 {
@@ -877,18 +920,12 @@ static __init int amd_ibs_init(void)
 	if (!caps)
 		return -ENODEV;	/* ibs not supported by the cpu */
 
-	/*
-	 * Force LVT offset assignment for family 10h: The offsets are
-	 * not assigned by the BIOS for this family, so the OS is
-	 * responsible for doing it. If the OS assignment fails, fall
-	 * back to BIOS settings and try to setup this.
-	 */
-	if (boot_cpu_data.x86 == 0x10)
-		force_ibs_eilvt_setup();
+	ibs_eilvt_setup();
 
 	if (!ibs_eilvt_valid())
 		goto out;
 
+	perf_ibs_pm_init();
 	get_online_cpus();
 	ibs_caps = caps;
 	/* make ibs_caps visible to other cpus: */
-- 
cgit v1.1


From ca1e631c3acf80bc5f5934ce9054a9b4880c96e1 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Thu, 16 Jan 2014 13:00:21 -0800
Subject: x86, tsc, apic: Unbreak static (MSR) calibration when
 CONFIG_X86_LOCAL_APIC=n

If we aren't going to use the local APIC anyway, we obviously don't
care about its timer frequency.

Link: http://lkml.kernel.org/r/tip-rgm7xmg7k6qnjlw3ynkcjsmh@git.kernel.org
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Cc: Bin Gao <bin.gao@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/tsc_msr.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index c502772..8b5434f 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -118,8 +118,10 @@ int try_msr_calibrate_tsc(unsigned long *fast_calibrate)
 	*fast_calibrate = freq * ratio;
 	pr_info("TSC runs at %lu KHz\n", *fast_calibrate);
 
+#ifdef CONFIG_X86_LOCAL_APIC
 	lapic_timer_frequency = (freq * 1000) / HZ;
 	pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency);
+#endif
 
 	return 1;
 }
-- 
cgit v1.1


From acb20d7395f75b0fdb5b300f9fb3b9c6054fb04f Mon Sep 17 00:00:00 2001
From: David Cohen <david.a.cohen@linux.intel.com>
Date: Thu, 16 Jan 2014 15:07:02 -0800
Subject: x86, intel-mid: sfi_handle_*_dev() should check for pdata error code

When Intel MID finds a match between SFI table from FW and registered
SFI devices, it will always register a device regardless the platform
code was successful or not.

This patch adds an extra option for platform code to return error code
and abort device registration on SFI table parsing.

This patch does not contain any functional changes for current intel
mid platform code.

Signed-off-by: David Cohen <david.a.cohen@linux.intel.com>
Link: http://lkml.kernel.org/r/1389913624-9149-2-git-send-email-david.a.cohen@linux.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/platform/intel-mid/sfi.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index 80a5228..1d5d20b 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -337,6 +337,8 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry,
 	pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n",
 		pentry->name, pentry->irq);
 	pdata = intel_mid_sfi_get_pdata(dev, pentry);
+	if (IS_ERR(pdata))
+		return;
 
 	pdev = platform_device_alloc(pentry->name, 0);
 	if (pdev == NULL) {
@@ -370,6 +372,8 @@ static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry,
 		spi_info.chip_select);
 
 	pdata = intel_mid_sfi_get_pdata(dev, &spi_info);
+	if (IS_ERR(pdata))
+		return;
 
 	spi_info.platform_data = pdata;
 	if (dev->delay)
@@ -395,6 +399,8 @@ static void __init sfi_handle_i2c_dev(struct sfi_device_table_entry *pentry,
 		i2c_info.addr);
 	pdata = intel_mid_sfi_get_pdata(dev, &i2c_info);
 	i2c_info.platform_data = pdata;
+	if (IS_ERR(pdata))
+		return;
 
 	if (dev->delay)
 		intel_scu_i2c_device_register(pentry->host_num, &i2c_info);
-- 
cgit v1.1


From a957a14bb4ca976cbaaff3594ef5b8f8f7f65804 Mon Sep 17 00:00:00 2001
From: David Cohen <david.a.cohen@linux.intel.com>
Date: Thu, 16 Jan 2014 15:07:03 -0800
Subject: x86, intel-mid: Check get_gpio_by_name() error code on platform code

This patch does cleanup on all intel mid platform code that uses
gpio_get_by_name() function. From now on they should check for any error
code instead of only hardcoded -1.

There are no functional changes from this change.

Signed-off-by: David Cohen <david.a.cohen@linux.intel.com>
Link: http://lkml.kernel.org/r/1389913624-9149-3-git-send-email-david.a.cohen@linux.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/platform/intel-mid/device_libs/platform_emc1403.c   | 4 +++-
 arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c | 2 +-
 arch/x86/platform/intel-mid/device_libs/platform_lis331.c    | 4 +++-
 arch/x86/platform/intel-mid/device_libs/platform_max7315.c   | 2 +-
 arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c   | 2 +-
 arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c | 2 +-
 arch/x86/platform/intel-mid/device_libs/platform_tca6416.c   | 4 ++--
 7 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
index 0d942c1..69a7836 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
@@ -22,7 +22,9 @@ static void __init *emc1403_platform_data(void *info)
 	int intr = get_gpio_by_name("thermal_int");
 	int intr2nd = get_gpio_by_name("thermal_alert");
 
-	if (intr == -1 || intr2nd == -1)
+	if (intr < 0)
+		return NULL;
+	if (intr2nd < 0)
 		return NULL;
 
 	i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
index a013a48..dccae6b 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
@@ -66,7 +66,7 @@ static int __init pb_keys_init(void)
 		gb[i].gpio = get_gpio_by_name(gb[i].desc);
 		pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc,
 					gb[i].gpio);
-		if (gb[i].gpio == -1)
+		if (gb[i].gpio < 0)
 			continue;
 
 		if (i != good)
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
index 15278c1..54226de 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
@@ -21,7 +21,9 @@ static void __init *lis331dl_platform_data(void *info)
 	int intr = get_gpio_by_name("accel_int");
 	int intr2nd = get_gpio_by_name("accel_2");
 
-	if (intr == -1 || intr2nd == -1)
+	if (intr < 0)
+		return NULL;
+	if (intr2nd < 0)
 		return NULL;
 
 	i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
index 94ade10..2c8acbc 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
@@ -48,7 +48,7 @@ static void __init *max7315_platform_data(void *info)
 	gpio_base = get_gpio_by_name(base_pin_name);
 	intr = get_gpio_by_name(intr_pin_name);
 
-	if (gpio_base == -1)
+	if (gpio_base < 0)
 		return NULL;
 	max7315->gpio_base = gpio_base;
 	if (intr != -1) {
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
index dd28d63..cfe9a47 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
@@ -19,7 +19,7 @@ static void *mpu3050_platform_data(void *info)
 	struct i2c_board_info *i2c_info = info;
 	int intr = get_gpio_by_name("mpu3050_int");
 
-	if (intr == -1)
+	if (intr < 0)
 		return NULL;
 
 	i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
index d87182a..65c2a9a 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
@@ -26,7 +26,7 @@ static void __init *pmic_gpio_platform_data(void *info)
 	static struct intel_pmic_gpio_platform_data pmic_gpio_pdata;
 	int gpio_base = get_gpio_by_name("pmic_gpio_base");
 
-	if (gpio_base == -1)
+	if (gpio_base < 0)
 		gpio_base = 64;
 	pmic_gpio_pdata.gpio_base = gpio_base;
 	pmic_gpio_pdata.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
index 22881c9..33be0b3 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
@@ -34,10 +34,10 @@ static void *tca6416_platform_data(void *info)
 	gpio_base = get_gpio_by_name(base_pin_name);
 	intr = get_gpio_by_name(intr_pin_name);
 
-	if (gpio_base == -1)
+	if (gpio_base < 0)
 		return NULL;
 	tca6416.gpio_base = gpio_base;
-	if (intr != -1) {
+	if (intr >= 0) {
 		i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
 		tca6416.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
 	} else {
-- 
cgit v1.1


From 28c6a39b3367f29462cd586785dc445cd6b5ac23 Mon Sep 17 00:00:00 2001
From: David Cohen <david.a.cohen@linux.intel.com>
Date: Thu, 16 Jan 2014 15:07:04 -0800
Subject: x86, intel-mid: Return proper error code from get_gpio_by_name()

This patch cleans up get_gpio_by_name() to return an error code
instead of hardcoded -1.

Signed-off-by: David Cohen <david.a.cohen@linux.intel.com>
Link: http://lkml.kernel.org/r/1389913624-9149-4-git-send-email-david.a.cohen@linux.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/platform/intel-mid/sfi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index 1d5d20b..438306e 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -224,7 +224,7 @@ int get_gpio_by_name(const char *name)
 		if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
 			return pentry->pin_no;
 	}
-	return -1;
+	return -EINVAL;
 }
 
 void __init intel_scu_device_register(struct platform_device *pdev)
-- 
cgit v1.1


From ee87c751d88f9b03fee7349556817fe80c113b32 Mon Sep 17 00:00:00 2001
From: Fengguang Wu <fengguang.wu@intel.com>
Date: Thu, 16 Jan 2014 16:13:08 -0800
Subject: x86, intel_mid: Replace memcpy with struct assignment

This is a cleanup proposed by coccinelle. It replaces memcpy with struct
assignment on intel-mid's sfi layer.

Generated by: coccinelle/misc/memcpy-assign.cocci

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Link: http://lkml.kernel.org/r/1389917588-9785-1-git-send-email-david.a.cohen@linux.intel.com
Signed-off-by: David Cohen <david.a.cohen@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/platform/intel-mid/sfi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index 438306e..994c40b 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -250,7 +250,7 @@ static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
 			sdev->modalias);
 		return;
 	}
-	memcpy(new_dev, sdev, sizeof(*sdev));
+	*new_dev = *sdev;
 
 	spi_devs[spi_next_dev++] = new_dev;
 }
@@ -271,7 +271,7 @@ static void __init intel_scu_i2c_device_register(int bus,
 			idev->type);
 		return;
 	}
-	memcpy(new_dev, idev, sizeof(*idev));
+	*new_dev = *idev;
 
 	i2c_bus[i2c_next_dev] = bus;
 	i2c_devs[i2c_next_dev++] = new_dev;
-- 
cgit v1.1


From fb09b78151361f5001ad462e4b242b10845830e2 Mon Sep 17 00:00:00 2001
From: Qiaowei Ren <qiaowei.ren@intel.com>
Date: Sun, 12 Jan 2014 17:20:02 +0800
Subject: x86, mpx: Add MPX related opcodes to the x86 opcode map

This patch adds all the MPX instructions to x86 opcode map, so the x86
instruction decoder can decode MPX instructions.

Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
Link: http://lkml.kernel.org/r/1389518403-7715-4-git-send-email-qiaowei.ren@intel.com
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/lib/x86-opcode-map.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 533a85e..1a2be7c 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -346,8 +346,8 @@ AVXcode: 1
 17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
 18: Grp16 (1A)
 19:
-1a:
-1b:
+1a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv
+1b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,GV,Gv
 1c:
 1d:
 1e:
-- 
cgit v1.1


From 741e3902cd89a7fbc04ae53f29a7ca0da452aa8e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 20 Jan 2014 19:51:05 +0100
Subject: x86/intel/mpx: Remove unused LWP structure

We don't support LWP yet, don't give the impression that we do:
represent the LWP state as opaque 128 bytes, the way Linux sees it
currently.

Cc: Qiaowei Ren <qiaowei.ren@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-ecarmjtfKpanpAapfck6dj6g@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b7845a1..865c399 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -370,15 +370,9 @@ struct ymmh_struct {
 	u32 ymmh_space[64];
 };
 
+/* We don't support LWP yet: */
 struct lwp_struct {
-	u64 lwpcb_addr;
-	u32 flags;
-	u32 buf_head_offset;
-	u64 buf_base;
-	u32 buf_size;
-	u32 filters;
-	u64 saved_event_record[4];
-	u32 event_counter[16];
+	u8 reserved[128];
 };
 
 struct bndregs_struct {
-- 
cgit v1.1