From c4d017f21328c269deba3c7acde873204fe595b5 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sat, 28 May 2011 10:36:22 -0700
Subject: x86: Convert vmalloc()+memset() to vzalloc()

Signed-off-by: Joe Perches <joe@perches.com>
Cc: Jiri Kosina <trivial@kernel.org>
Link: http://lkml.kernel.org/r/10e35243fda0b8739c89ac32a7bdf348ec4752e1.1306603968.git.joe@perches.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pageattr-test.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index e1d1069..b008656 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -123,12 +123,11 @@ static int pageattr_test(void)
 	if (print)
 		printk(KERN_INFO "CPA self-test:\n");
 
-	bm = vmalloc((max_pfn_mapped + 7) / 8);
+	bm = vzalloc((max_pfn_mapped + 7) / 8);
 	if (!bm) {
 		printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
 		return -ENOMEM;
 	}
-	memset(bm, 0, (max_pfn_mapped + 7) / 8);
 
 	failed += print_split(&sa);
 	srandom32(100);
-- 
cgit v1.1


From 395810627b6a43c8d0ec948884043946fa162308 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Wed, 8 Jun 2011 16:09:21 +0900
Subject: x86: Swap save_stack_trace_regs parameters

Swap the 1st and 2nd parameters of save_stack_trace_regs()
as same as the parameters of save_stack_trace_tsk().

Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Namhyung Kim <namhyung@gmail.com>
Link: http://lkml.kernel.org/r/20110608070921.17777.31103.stgit@fedora15
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/mm/kmemcheck/error.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 704a37c..dab4187 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
 	e->trace.entries = e->trace_entries;
 	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
 	e->trace.skip = 0;
-	save_stack_trace_regs(&e->trace, regs);
+	save_stack_trace_regs(regs, &e->trace);
 
 	/* Round address down to nearest 16 bytes */
 	shadow_copy = kmemcheck_shadow_lookup(address
-- 
cgit v1.1


From a8b0ca17b80e92faab46ee7179ba9e99ccb61233 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 27 Jun 2011 14:41:57 +0200
Subject: perf: Remove the nmi parameter from the swevent and overflow
 interface

The nmi parameter indicated if we could do wakeups from the current
context, if not, we would set some state and self-IPI and let the
resulting interrupt do the wakeup.

For the various event classes:

  - hardware: nmi=0; PMI is in fact an NMI or we run irq_work_run from
    the PMI-tail (ARM etc.)
  - tracepoint: nmi=0; since tracepoint could be from NMI context.
  - software: nmi=[0,1]; some, like the schedule thing cannot
    perform wakeups, and hence need 0.

As one can see, there is very little nmi=1 usage, and the down-side of
not using it is that on some platforms some software events can have a
jiffy delay in wakeup (when arch_irq_work_raise isn't implemented).

The up-side however is that we can remove the nmi parameter and save a
bunch of conditionals in fast paths.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Michael Cree <mcree@orcon.net.nz>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Don Zickus <dzickus@redhat.com>
Link: http://lkml.kernel.org/n/tip-agjev8eu666tvknpb3iaj0fg@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/fault.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2dbf6bf..4d09df0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1059,7 +1059,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -1161,11 +1161,11 @@ good_area:
 	if (flags & FAULT_FLAG_ALLOW_RETRY) {
 		if (fault & VM_FAULT_MAJOR) {
 			tsk->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
 				      regs, address);
 		} else {
 			tsk->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
 				      regs, address);
 		}
 		if (fault & VM_FAULT_RETRY) {
-- 
cgit v1.1


From a63fdc5156f2ef5690b6cf03d72b0c4917efbba7 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 14 Jun 2011 10:57:50 +1000
Subject: mm: Move definition of MIN_MEMORY_BLOCK_SIZE to a header

The macro MIN_MEMORY_BLOCK_SIZE is currently defined twice in two .c
files, and I need it in a third one to fix a powerpc bug, so let's
first move it into a header

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d865c4ae..bbaaa00 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -28,6 +28,7 @@
 #include <linux/poison.h>
 #include <linux/dma-mapping.h>
 #include <linux/module.h>
+#include <linux/memory.h>
 #include <linux/memory_hotplug.h>
 #include <linux/nmi.h>
 #include <linux/gfp.h>
@@ -895,8 +896,6 @@ const char *arch_vma_name(struct vm_area_struct *vma)
 }
 
 #ifdef CONFIG_X86_UV
-#define MIN_MEMORY_BLOCK_SIZE   (1 << SECTION_SIZE_BITS)
-
 unsigned long memory_block_size_bytes(void)
 {
 	if (is_uv_system()) {
-- 
cgit v1.1


From 60063497a95e716c9a689af3be2687d261f115b4 Mon Sep 17 00:00:00 2001
From: Arun Sharma <asharma@fb.com>
Date: Tue, 26 Jul 2011 16:09:06 -0700
Subject: atomic: use <linux/atomic.h>

This allows us to move duplicated code in <asm/atomic.h>
(atomic_inc_not_zero() for now) to <linux/atomic.h>

Signed-off-by: Arun Sharma <asharma@fb.com>
Reviewed-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/mmio-mod.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 3adff7d..67421f3 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -34,7 +34,7 @@
 #include <asm/pgtable.h>
 #include <linux/mmiotrace.h>
 #include <asm/e820.h> /* for ISA_START_ADDRESS */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
 
-- 
cgit v1.1


From 318f5a2a672152328c9fb4dead504b89ec738a43 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@MIT.EDU>
Date: Wed, 3 Aug 2011 09:31:53 -0400
Subject: x86-64: Add user_64bit_mode paravirt op

Three places in the kernel assume that the only long mode CPL 3
selector is __USER_CS.  This is not true on Xen -- Xen's sysretq
changes cs to the magic value 0xe033.

Two of the places are corner cases, but as of "x86-64: Improve
vsyscall emulation CS and RIP handling"
(c9712944b2a12373cb6ff8059afcfb7e826a6c54), vsyscalls will segfault
if called with Xen's extra CS selector.  This causes a panic when
older init builds die.

It seems impossible to make Xen use __USER_CS reliably without
taking a performance hit on every system call, so this fixes the
tests instead with a new paravirt op.  It's a little ugly because
ptrace.h can't include paravirt.h.

Signed-off-by: Andy Lutomirski <luto@mit.edu>
Link: http://lkml.kernel.org/r/f4fcb3947340d9e96ce1054a432f183f9da9db83.1312378163.git.luto@mit.edu
Reported-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2dbf6bf..c1d0182 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -105,7 +105,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
 		 * but for now it's good enough to assume that long
 		 * mode only uses well known segments or kernel.
 		 */
-		return (!user_mode(regs)) || (regs->cs == __USER_CS);
+		return (!user_mode(regs) || user_64bit_mode(regs));
 #endif
 	case 0x60:
 		/* 0x64 thru 0x67 are valid prefixes in all modes. */
-- 
cgit v1.1


From dfb09f9b7ab03fd367740e541a5caf830ed56726 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Fri, 5 Aug 2011 15:15:08 +0200
Subject: x86, amd: Avoid cache aliasing penalties on AMD family 15h

This patch provides performance tuning for the "Bulldozer" CPU. With its
shared instruction cache there is a chance of generating an excessive
number of cache cross-invalidates when running specific workloads on the
cores of a compute module.

This excessive amount of cross-invalidations can be observed if cache
lines backed by shared physical memory alias in bits [14:12] of their
virtual addresses, as those bits are used for the index generation.

This patch addresses the issue by clearing all the bits in the [14:12]
slice of the file mapping's virtual address at generation time, thus
forcing those bits the same for all mappings of a single shared library
across processes and, in doing so, avoids instruction cache aliases.

It also adds the command line option "align_va_addr=(32|64|on|off)" with
which virtual address alignment can be enabled for 32-bit or 64-bit x86
individually, or both, or be completely disabled.

This change leaves virtual region address allocation on other families
and/or vendors unaffected.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Link: http://lkml.kernel.org/r/1312550110-24160-2-git-send-email-bp@amd64.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/mmap.c | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 1dab519..d4c0736 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -51,21 +51,6 @@ static unsigned int stack_maxrandom_size(void)
 #define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
 #define MAX_GAP (TASK_SIZE/6*5)
 
-/*
- * True on X86_32 or when emulating IA32 on X86_64
- */
-static int mmap_is_ia32(void)
-{
-#ifdef CONFIG_X86_32
-	return 1;
-#endif
-#ifdef CONFIG_IA32_EMULATION
-	if (test_thread_flag(TIF_IA32))
-		return 1;
-#endif
-	return 0;
-}
-
 static int mmap_is_legacy(void)
 {
 	if (current->personality & ADDR_COMPAT_LAYOUT)
-- 
cgit v1.1


From 9387f774d61b01ab71bade85e6d0bfab0b3419bd Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Sat, 6 Aug 2011 14:31:38 +0200
Subject: x86-32, amd: Move va_align definition to unbreak 32-bit build

hpa reported that dfb09f9b7ab03fd367740e541a5caf830ed56726 breaks 32-bit
builds with the following error message:

/home/hpa/kernel/linux-tip.cpu/arch/x86/kernel/cpu/amd.c:437: undefined
reference to `va_align'
/home/hpa/kernel/linux-tip.cpu/arch/x86/kernel/cpu/amd.c:436: undefined
reference to `va_align'

This is due to the fact that va_align is a global in a 64-bit only
compilation unit. Move it to mmap.c where it is visible to both
subarches.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Link: http://lkml.kernel.org/r/1312633899-1131-1-git-send-email-bp@amd64.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/mm/mmap.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index d4c0736..4b5ba85 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -31,6 +31,10 @@
 #include <linux/sched.h>
 #include <asm/elf.h>
 
+struct __read_mostly va_alignment va_align = {
+	.flags = -1,
+};
+
 static unsigned int stack_maxrandom_size(void)
 {
 	unsigned int max = 0;
@@ -42,7 +46,6 @@ static unsigned int stack_maxrandom_size(void)
 	return max;
 }
 
-
 /*
  * Top of mmap area (just below the process stack).
  *
-- 
cgit v1.1


From 3ae36655b97a03fa1decf72f04078ef945647c1a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@mit.edu>
Date: Wed, 10 Aug 2011 11:15:32 -0400
Subject: x86-64: Rework vsyscall emulation and add vsyscall= parameter

There are three choices:

vsyscall=native: Vsyscalls are native code that issues the
corresponding syscalls.

vsyscall=emulate (default): Vsyscalls are emulated by instruction
fault traps, tested in the bad_area path.  The actual contents of
the vsyscall page is the same as the vsyscall=native case except
that it's marked NX.  This way programs that make assumptions about
what the code in the page does will not be confused when they read
that code.

vsyscall=none: Trying to execute a vsyscall will segfault.

Signed-off-by: Andy Lutomirski <luto@mit.edu>
Link: http://lkml.kernel.org/r/8449fb3abf89851fd6b2260972666a6f82542284.1312988155.git.luto@mit.edu
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/fault.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c1d0182..e58935c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -720,6 +720,18 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		if (is_errata100(regs, address))
 			return;
 
+#ifdef CONFIG_X86_64
+		/*
+		 * Instruction fetch faults in the vsyscall page might need
+		 * emulation.
+		 */
+		if (unlikely((error_code & PF_INSTR) &&
+			     ((address & ~0xfff) == VSYSCALL_START))) {
+			if (emulate_vsyscall(regs, address))
+				return;
+		}
+#endif
+
 		if (unlikely(show_unhandled_signals))
 			show_signal_msg(regs, error_code, address, tsk);
 
-- 
cgit v1.1


From cedf03bd9aa54d1d7a9065dddc9e76505f476b12 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Mon, 15 Aug 2011 10:18:46 -0700
Subject: x86: fix mm/fault.c build

arch/x86/mm/fault.c needs to include asm/vsyscall.h to fix a
build error:

  arch/x86/mm/fault.c: In function '__bad_area_nosemaphore':
  arch/x86/mm/fault.c:728: error: 'VSYSCALL_START' undeclared (first use in this function)

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/fault.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 247aae3..0d17c8c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -17,6 +17,7 @@
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
 #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
+#include <asm/vsyscall.h>
 
 /*
  * Page fault error code bits:
-- 
cgit v1.1


From fab1167c4698e3ff11ebb06281d78def6c53728b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Mon, 15 Aug 2011 22:28:56 -0700
Subject: x86, vsyscall: Add missing <asm/fixmap.h> to arch/x86/mm/fault.c

arch/x86/mm/fault.c now depend on having the symbol VSYSCALL_START
defined, which is best handled by including <asm/fixmap.h> (it isn't
unreasonable we may want other fixed addresses in this file in the
future, and so it is cleaner than including <asm/vsyscall.h>
directly.)

This addresses an x86-64 allnoconfig build failure.  On other
configurations it was masked by an indirect path:

<asm/smp.h> -> <asm/apic.h> -> <asm/fixmap.h> -> <asm/vsyscall.h>

... however, the first such include is conditional on CONFIG_X86_LOCAL_APIC.

Originally-by: Randy Dunlap <rdunlap@xenotime.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/CA%2B55aFxsOMc9=p02r8-QhJ=h=Mqwckk4_Pnx9LQt5%2BfqMp_exQ@mail.gmail.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/fault.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 247aae3..f2d4c9d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -17,6 +17,7 @@
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
 #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
+#include <asm/fixmap.h>			/* VSYSCALL_START		*/
 
 /*
  * Page fault error code bits:
-- 
cgit v1.1


From 469bfb4a50954c7c1ec506e5399fdb1dbf230650 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Mon, 1 Aug 2011 23:07:51 +0200
Subject: Remove unneeded version.h include from arch/x86/

It was pointed out by 'make versioncheck' that the include of
linux/version.h is not needed in arch/x86/mm/mmio-mod.c .
This patch removes it.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/mm/mmio-mod.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 3adff7d..c83c3d0 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -29,7 +29,6 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
-#include <linux/version.h>
 #include <linux/kallsyms.h>
 #include <asm/pgtable.h>
 #include <linux/mmiotrace.h>
-- 
cgit v1.1


From e05139f2569ecf699b229a6473a86cdffed62956 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Wed, 28 Sep 2011 16:56:51 +0100
Subject: x86-64: Don't apply destructive erratum workaround on unaffected CPUs

Erratum 93 applies to AMD K8 CPUs only, and its workaround
(forcing the upper 32 bits of %rip to all get set under certain
conditions) is actually getting in the way of analyzing page
faults occurring during EFI physical mode runtime calls (in
particular the page table walk shown is completely unrelated to
the actual fault). This is because typically EFI runtime code
lives in the space between 2G and 4G, which - modulo the above
manipulation - is likely to overlap with the kernel or modules
area.

While even for the other errata workarounds their taking effect
could be limited to just the affected CPUs, none of them appears
to be destructive, and they're generally getting called only
outside of performance critical paths, so they're being left
untouched.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/4E835FE30200007800058464@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/fault.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 0d17c8c..9c7378d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -420,12 +420,14 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
 	return 0;
 }
 
+#ifdef CONFIG_CPU_SUP_AMD
 static const char errata93_warning[] =
 KERN_ERR 
 "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
 "******* Working around it, but it may cause SEGVs or burn power.\n"
 "******* Please consider a BIOS update.\n"
 "******* Disabling USB legacy in the BIOS may also help.\n";
+#endif
 
 /*
  * No vm86 mode in 64-bit mode:
@@ -505,7 +507,11 @@ bad:
  */
 static int is_errata93(struct pt_regs *regs, unsigned long address)
 {
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
+	    || boot_cpu_data.x86 != 0xf)
+		return 0;
+
 	if (address != regs->ip)
 		return 0;
 
-- 
cgit v1.1


From 8548c84da2f47e71bbbe300f55edb768492575f7 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Sun, 23 Oct 2011 23:19:12 +0200
Subject: x86: Fix S4 regression

Commit 4b239f458 ("x86-64, mm: Put early page table high") causes a S4
regression since 2.6.39, namely the machine reboots occasionally at S4
resume.  It doesn't happen always, overall rate is about 1/20.  But,
like other bugs, once when this happens, it continues to happen.

This patch fixes the problem by essentially reverting the memory
assignment in the older way.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Cc: <stable@kernel.org>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Yinghai Lu <yinghai.lu@oracle.com>
[ We'll hopefully find the real fix, but that's too late for 3.1 now ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/init.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 3032644..87488b9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -63,9 +63,8 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 #ifdef CONFIG_X86_32
 	/* for fixmap */
 	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-
-	good_end = max_pfn_mapped << PAGE_SHIFT;
 #endif
+	good_end = max_pfn_mapped << PAGE_SHIFT;
 
 	base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
 	if (base == MEMBLOCK_ERROR)
-- 
cgit v1.1


From 70b50f94f1644e2aa7cb374819cfd93f3c28d725 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 2 Nov 2011 13:36:59 -0700
Subject: mm: thp: tail page refcounting fix

Michel while working on the working set estimation code, noticed that
calling get_page_unless_zero() on a random pfn_to_page(random_pfn)
wasn't safe, if the pfn ended up being a tail page of a transparent
hugepage under splitting by __split_huge_page_refcount().

He then found the problem could also theoretically materialize with
page_cache_get_speculative() during the speculative radix tree lookups
that uses get_page_unless_zero() in SMP if the radix tree page is freed
and reallocated and get_user_pages is called on it before
page_cache_get_speculative has a chance to call get_page_unless_zero().

So the best way to fix the problem is to keep page_tail->_count zero at
all times.  This will guarantee that get_page_unless_zero() can never
succeed on any tail page.  page_tail->_mapcount is guaranteed zero and
is unused for all tail pages of a compound page, so we can simply
account the tail page references there and transfer them to
tail_page->_count in __split_huge_page_refcount() (in addition to the
head_page->_mapcount).

While debugging this s/_count/_mapcount/ change I also noticed get_page is
called by direct-io.c on pages returned by get_user_pages.  That wasn't
entirely safe because the two atomic_inc in get_page weren't atomic.  As
opposed to other get_user_page users like secondary-MMU page fault to
establish the shadow pagetables would never call any superflous get_page
after get_user_page returns.  It's safer to make get_page universally safe
for tail pages and to use get_page_foll() within follow_page (inside
get_user_pages()).  get_page_foll() is safe to do the refcounting for tail
pages without taking any locks because it is run within PT lock protected
critical sections (PT lock for pte and page_table_lock for
pmd_trans_huge).

The standard get_page() as invoked by direct-io instead will now take
the compound_lock but still only for tail pages.  The direct-io paths
are usually I/O bound and the compound_lock is per THP so very
finegrined, so there's no risk of scalability issues with it.  A simple
direct-io benchmarks with all lockdep prove locking and spinlock
debugging infrastructure enabled shows identical performance and no
overhead.  So it's worth it.  Ideally direct-io should stop calling
get_page() on pages returned by get_user_pages().  The spinlock in
get_page() is already optimized away for no-THP builds but doing
get_page() on tail pages returned by GUP is generally a rare operation
and usually only run in I/O paths.

This new refcounting on page_tail->_mapcount in addition to avoiding new
RCU critical sections will also allow the working set estimation code to
work without any further complexity associated to the tail page
refcounting with THP.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Michel Lespinasse <walken@google.com>
Reviewed-by: Michel Lespinasse <walken@google.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: <stable@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/gup.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index dbe34b9..3b5032a 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -114,8 +114,9 @@ static inline void get_huge_page_tail(struct page *page)
 	 * __split_huge_page_refcount() cannot run
 	 * from under us.
 	 */
-	VM_BUG_ON(atomic_read(&page->_count) < 0);
-	atomic_inc(&page->_count);
+	VM_BUG_ON(page_mapcount(page) < 0);
+	VM_BUG_ON(atomic_read(&page->_count) != 0);
+	atomic_inc(&page->_mapcount);
 }
 
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
-- 
cgit v1.1


From b35a35b556f5e6b7993ad0baf20173e75c09ce8c Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 2 Nov 2011 13:37:36 -0700
Subject: thp: share get_huge_page_tail()

This avoids duplicating the function in every arch gup_fast.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/gup.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 3b5032a..ea30585 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -108,17 +108,6 @@ static inline void get_head_page_multiple(struct page *page, int nr)
 	SetPageReferenced(page);
 }
 
-static inline void get_huge_page_tail(struct page *page)
-{
-	/*
-	 * __split_huge_page_refcount() cannot run
-	 * from under us.
-	 */
-	VM_BUG_ON(page_mapcount(page) < 0);
-	VM_BUG_ON(atomic_read(&page->_count) != 0);
-	atomic_inc(&page->_mapcount);
-}
-
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 		unsigned long end, int write, struct page **pages, int *nr)
 {
-- 
cgit v1.1