From 3ac94932a2c859e8c57a8af091fa39334e1c3f23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:36 -0700 Subject: [PATCH] lockdep: beautify x86_64 stacktraces Beautify x86_64 stacktraces to be more readable. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/process.c | 2 +- arch/x86_64/kernel/traps.c | 70 ++++++++++++++++++++------------------------ 2 files changed, 32 insertions(+), 40 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index ca56e19..bb6745d 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -296,7 +296,7 @@ void __show_regs(struct pt_regs * regs) system_utsname.version); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); printk_address(regs->rip); - printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->rax, regs->rbx, regs->rcx); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 5a5311d..02dc155 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -110,28 +110,31 @@ static int kstack_depth_to_print = 12; static int call_trace = 1; #ifdef CONFIG_KALLSYMS -#include -int printk_address(unsigned long address) -{ +# include +void printk_address(unsigned long address) +{ unsigned long offset = 0, symsize; const char *symname; char *modname; - char *delim = ":"; + char *delim = ":"; char namebuf[128]; - symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); - if (!symname) - return printk("[<%016lx>]", address); - if (!modname) + symname = kallsyms_lookup(address, &symsize, &offset, + &modname, namebuf); + if (!symname) { + printk(" [<%016lx>]\n", address); + return; + } + if (!modname) modname = delim = ""; - return printk("<%016lx>{%s%s%s%s%+ld}", - address, delim, modname, delim, symname, offset); -} + printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", + address, delim, modname, delim, symname, offset, symsize); +} #else -int printk_address(unsigned long address) -{ - return printk("[<%016lx>]", address); -} +void printk_address(unsigned long address) +{ + printk(" [<%016lx>]\n", address); +} #endif static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, @@ -193,20 +196,14 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, static int show_trace_unwind(struct unwind_frame_info *info, void *context) { - int i = 11, n = 0; + int n = 0; while (unwind(info) == 0 && UNW_PC(info)) { - ++n; - if (i > 50) { - printk("\n "); - i = 7; - } else - i += printk(" "); - i += printk_address(UNW_PC(info)); + n++; + printk_address(UNW_PC(info)); if (arch_unw_user_mode(info)) break; } - printk("\n"); return n; } @@ -224,7 +221,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s int i = 11; unsigned used = 0; - printk("\nCall Trace:"); + printk("\nCall Trace:\n"); if (!tsk) tsk = current; @@ -254,12 +251,6 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s do while (cond) { \ unsigned long addr = *stack++; \ if (kernel_text_address(addr)) { \ - if (i > 50) { \ - printk("\n "); \ - i = 0; \ - } \ - else \ - i += printk(" "); \ /* \ * If the address is either in the text segment of the \ * kernel, or in the region which contains vmalloc'ed 
\ @@ -268,20 +259,20 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s * down the cause of the crash will be able to figure \ * out the call path that was taken. \ */ \ - i += printk_address(addr); \ + printk_address(addr); \ } \ } while (0) - for(; ; ) { + for( ; ; ) { const char *id; unsigned long *estack_end; estack_end = in_exception_stack(cpu, (unsigned long)stack, &used, &id); if (estack_end) { - i += printk(" <%s>", id); + printk(" <%s>", id); HANDLE_STACK (stack < estack_end); - i += printk(" "); + printk(" "); stack = (unsigned long *) estack_end[-2]; continue; } @@ -291,11 +282,11 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s (IRQSTACKSIZE - 64) / sizeof(*irqstack); if (stack >= irqstack && stack < irqstack_end) { - i += printk(" "); + printk(" "); HANDLE_STACK (stack < irqstack_end); stack = (unsigned long *) (irqstack_end[-1]); irqstack_end = NULL; - i += printk(" "); + printk(" "); continue; } } @@ -304,6 +295,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); #undef HANDLE_STACK + printk("\n"); } @@ -337,8 +329,8 @@ static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned break; } if (i && ((i % 4) == 0)) - printk("\n "); - printk("%016lx ", *stack++); + printk("\n"); + printk(" %016lx", *stack++); touch_nmi_watchdog(); } show_trace(tsk, regs, rsp); -- cgit v1.1 From c9ca1ba5bde45839cdc4f8ab93730cb68e6ee8a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:36 -0700 Subject: [PATCH] lockdep: x86_64 document stack frame internals Document stack frame nesting internals some more. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 61 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 02dc155..79d05c4 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -152,10 +152,22 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, }; unsigned k; + /* + * Iterate over all exception stacks, and figure out whether + * 'stack' is in one of them: + */ for (k = 0; k < N_EXCEPTION_STACKS; k++) { unsigned long end; + /* + * set 'end' to the end of the exception stack. + */ switch (k + 1) { + /* + * TODO: this block is not needed i think, because + * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK] + * properly too. + */ #if DEBUG_STKSZ > EXCEPTION_STKSZ case DEBUG_STACK: end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ; @@ -165,19 +177,43 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, end = per_cpu(init_tss, cpu).ist[k]; break; } + /* + * Is 'stack' above this exception frame's end? + * If yes then skip to the next frame. + */ if (stack >= end) continue; + /* + * Is 'stack' above this exception frame's start address? + * If yes then we found the right frame. + */ if (stack >= end - EXCEPTION_STKSZ) { + /* + * Make sure we only iterate through an exception + * stack once. 
If it comes up for the second time + * then there's something wrong going on - just + * break out and return NULL: + */ if (*usedp & (1U << k)) break; *usedp |= 1U << k; *idp = ids[k]; return (unsigned long *)end; } + /* + * If this is a debug stack, and if it has a larger size than + * the usual exception stacks, then 'stack' might still + * be within the lower portion of the debug stack: + */ #if DEBUG_STKSZ > EXCEPTION_STKSZ if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { unsigned j = N_EXCEPTION_STACKS - 1; + /* + * Black magic. A large debug stack is composed of + * multiple exception stack entries, which we + * iterate through now. Dont look: + */ do { ++j; end -= EXCEPTION_STKSZ; @@ -247,6 +283,11 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s } } + /* + * Print function call entries within a stack. 'cond' is the + * "end of stackframe" condition, that the 'stack++' + * iteration will eventually trigger. + */ #define HANDLE_STACK(cond) \ do while (cond) { \ unsigned long addr = *stack++; \ @@ -263,7 +304,12 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s } \ } while (0) - for( ; ; ) { + /* + * Print function call entries in all stacks, starting at the + * current stack address. If the stacks consist of nested + * exceptions + */ + for ( ; ; ) { const char *id; unsigned long *estack_end; estack_end = in_exception_stack(cpu, (unsigned long)stack, @@ -273,6 +319,11 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s printk(" <%s>", id); HANDLE_STACK (stack < estack_end); printk(" "); + /* + * We link to the next stack via the + * second-to-last pointer (index -2 to end) in the + * exception stack: + */ stack = (unsigned long *) estack_end[-2]; continue; } @@ -284,6 +335,11 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s if (stack >= irqstack && stack < irqstack_end) { printk(" "); HANDLE_STACK (stack < irqstack_end); + /* + * We link to the next stack (which would be + * the process stack normally) the last + * pointer (index -1 to end) in the IRQ stack: + */ stack = (unsigned long *) (irqstack_end[-1]); irqstack_end = NULL; printk(" "); @@ -293,6 +349,9 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s break; } + /* + * This prints the process stack: + */ HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); #undef HANDLE_STACK -- cgit v1.1 From 21b32bbff950771f196da91011249fa05fa83b32 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:40 -0700 Subject: [PATCH] lockdep: stacktrace subsystem, x86_64 support Framework to generate and save stacktraces quickly, without printing anything to the console. x86_64 support. 
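(Illustrative usage sketch, not part of the patch - it assumes the generic struct stack_trace declaration from the companion core patch, with the nr_entries/max_entries/entries fields that the code below manipulates:)

	static unsigned long entries[32];

	static void snapshot_current_stack(void)
	{
		struct stack_trace trace = {
			.nr_entries	= 0,	/* must start out empty */
			.max_entries	= 32,
			.entries	= entries,
		};

		/* current task, current context only, skip no frames: */
		save_stack_trace(&trace, current, 0, 0);
		/* entries[0 .. trace.nr_entries-1] now hold return addresses */
	}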
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/Makefile | 1 + arch/x86_64/kernel/stacktrace.c | 221 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 222 insertions(+) create mode 100644 arch/x86_64/kernel/stacktrace.c (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 819e84e..b5aaeaf 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -10,6 +10,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ pci-dma.o pci-nommu.o alternative.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c new file mode 100644 index 0000000..32cf55e --- /dev/null +++ b/arch/x86_64/kernel/stacktrace.c @@ -0,0 +1,221 @@ +/* + * arch/x86_64/kernel/stacktrace.c + * + * Stack trace management functions + * + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + */ +#include +#include + +#include + +static inline int +in_range(unsigned long start, unsigned long addr, unsigned long end) +{ + return addr >= start && addr <= end; +} + +static unsigned long +get_stack_end(struct task_struct *task, unsigned long stack) +{ + unsigned long stack_start, stack_end, flags; + int i, cpu; + + /* + * The most common case is that we are in the task stack: + */ + stack_start = (unsigned long)task->thread_info; + stack_end = stack_start + THREAD_SIZE; + + if (in_range(stack_start, stack, stack_end)) + return stack_end; + + /* + * We are in an interrupt if irqstackptr is set: + */ + raw_local_irq_save(flags); + cpu = safe_smp_processor_id(); + stack_end = (unsigned long)cpu_pda(cpu)->irqstackptr; + + if (stack_end) { + stack_start = stack_end & ~(IRQSTACKSIZE-1); + if (in_range(stack_start, stack, stack_end)) + goto out_restore; + /* + * We get here if we are in an IRQ context but we + * are also in an exception stack. + */ + } + + /* + * Iterate over all exception stacks, and figure out whether + * 'stack' is in one of them: + */ + for (i = 0; i < N_EXCEPTION_STACKS; i++) { + /* + * set 'end' to the end of the exception stack. + */ + stack_end = per_cpu(init_tss, cpu).ist[i]; + stack_start = stack_end - EXCEPTION_STKSZ; + + /* + * Is 'stack' above this exception frame's end? + * If yes then skip to the next frame. + */ + if (stack >= stack_end) + continue; + /* + * Is 'stack' above this exception frame's start address? + * If yes then we found the right frame. + */ + if (stack >= stack_start) + goto out_restore; + + /* + * If this is a debug stack, and if it has a larger size than + * the usual exception stacks, then 'stack' might still + * be within the lower portion of the debug stack: + */ +#if DEBUG_STKSZ > EXCEPTION_STKSZ + if (i == DEBUG_STACK - 1 && stack >= stack_end - DEBUG_STKSZ) { + /* + * Black magic. A large debug stack is composed of + * multiple exception stack entries, which we + * iterate through now. Dont look: + */ + do { + stack_end -= EXCEPTION_STKSZ; + stack_start -= EXCEPTION_STKSZ; + } while (stack < stack_start); + + goto out_restore; + } +#endif + } + /* + * Ok, 'stack' is not pointing to any of the system stacks. 
+ */ + stack_end = 0; + +out_restore: + raw_local_irq_restore(flags); + + return stack_end; +} + + +/* + * Save stack-backtrace addresses into a stack_trace buffer: + */ +static inline unsigned long +save_context_stack(struct stack_trace *trace, unsigned int skip, + unsigned long stack, unsigned long stack_end) +{ + unsigned long addr; + +#ifdef CONFIG_FRAME_POINTER + unsigned long prev_stack = 0; + + while (in_range(prev_stack, stack, stack_end)) { + pr_debug("stack: %p\n", (void *)stack); + addr = (unsigned long)(((unsigned long *)stack)[1]); + pr_debug("addr: %p\n", (void *)addr); + if (!skip) + trace->entries[trace->nr_entries++] = addr-1; + else + skip--; + if (trace->nr_entries >= trace->max_entries) + break; + if (!addr) + return 0; + /* + * Stack frames must go forwards (otherwise a loop could + * happen if the stackframe is corrupted), so we move + * prev_stack forwards: + */ + prev_stack = stack; + stack = (unsigned long)(((unsigned long *)stack)[0]); + } + pr_debug("invalid: %p\n", (void *)stack); +#else + while (stack < stack_end) { + addr = ((unsigned long *)stack)[0]; + stack += sizeof(long); + if (__kernel_text_address(addr)) { + if (!skip) + trace->entries[trace->nr_entries++] = addr-1; + else + skip--; + if (trace->nr_entries >= trace->max_entries) + break; + } + } +#endif + return stack; +} + +#define MAX_STACKS 10 + +/* + * Save stack-backtrace addresses into a stack_trace buffer. + * If all_contexts is set, all contexts (hardirq, softirq and process) + * are saved. If not set then only the current context is saved. + */ +void save_stack_trace(struct stack_trace *trace, + struct task_struct *task, int all_contexts, + unsigned int skip) +{ + unsigned long stack = (unsigned long)&stack; + int i, nr_stacks = 0, stacks_done[MAX_STACKS]; + + WARN_ON(trace->nr_entries || !trace->max_entries); + + if (!task) + task = current; + + pr_debug("task: %p, ti: %p\n", task, task->thread_info); + + if (!task || task == current) { + /* Grab rbp right from our regs: */ + asm ("mov %%rbp, %0" : "=r" (stack)); + pr_debug("rbp: %p\n", (void *)stack); + } else { + /* rbp is the last reg pushed by switch_to(): */ + stack = task->thread.rsp; + pr_debug("other task rsp: %p\n", (void *)stack); + stack = (unsigned long)(((unsigned long *)stack)[0]); + pr_debug("other task rbp: %p\n", (void *)stack); + } + + while (1) { + unsigned long stack_end = get_stack_end(task, stack); + + pr_debug("stack: %p\n", (void *)stack); + pr_debug("stack end: %p\n", (void *)stack_end); + + /* + * Invalid stack addres? + */ + if (!stack_end) + return; + /* + * Were we in this stack already? (recursion) + */ + for (i = 0; i < nr_stacks; i++) + if (stacks_done[i] == stack_end) + return; + stacks_done[nr_stacks] = stack_end; + + stack = save_context_stack(trace, skip, stack, stack_end); + if (!all_contexts || !stack || + trace->nr_entries >= trace->max_entries) + return; + trace->entries[trace->nr_entries++] = ULONG_MAX; + if (trace->nr_entries >= trace->max_entries) + return; + if (++nr_stacks >= MAX_STACKS) + return; + } +} + -- cgit v1.1 From 2601e64d262ee5ed4d4a5737345803800d9c4db3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:45 -0700 Subject: [PATCH] lockdep: irqtrace subsystem, x86_64 support Add irqflags-tracing support to x86_64. 
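(Illustrative note, not part of the patch: on a CONFIG_TRACE_IRQFLAGS kernel, the C-level local_irq_enable()/local_irq_disable() expand to roughly the pairing below, and the TRACE_IRQS_ON/TRACE_IRQS_OFF macros added here keep the assembly paths that use sti/cli directly in step with the same invariant - lockdep is told before interrupts get enabled and after they get disabled:)

	/* local_irq_enable(): */
	trace_hardirqs_on();		/* TRACE_IRQS_ON */
	raw_local_irq_enable();		/* sti */

	/* local_irq_disable(): */
	raw_local_irq_disable();	/* cli */
	trace_hardirqs_off();		/* TRACE_IRQS_OFF */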
[akpm@osdl.org: build fix] Signed-off-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/entry.S | 188 ++++++++++++++++++++++++++++++++------------- arch/x86_64/kernel/irq.c | 4 +- 2 files changed, 139 insertions(+), 53 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index ed92c29..d464dde 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -41,13 +41,24 @@ #include #include #include +#include .code64 #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif - + + +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET +#ifdef CONFIG_TRACE_IRQFLAGS + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON +1: +#endif +.endm + /* * C code is not supposed to know about undefined top of stack. Every time * a C function with an pt_regs argument is called from the SYSCALL based @@ -194,6 +205,10 @@ ENTRY(system_call) swapgs movq %rsp,%gs:pda_oldrsp movq %gs:pda_kernelstack,%rsp + /* + * No need to follow this irqs off/on section - it's straight + * and short: + */ sti SAVE_ARGS 8,1 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) @@ -219,10 +234,15 @@ ret_from_sys_call: sysret_check: GET_THREAD_INFO(%rcx) cli + TRACE_IRQS_OFF movl threadinfo_flags(%rcx),%edx andl %edi,%edx CFI_REMEMBER_STATE jnz sysret_careful + /* + * sysretq will re-enable interrupts: + */ + TRACE_IRQS_ON movq RIP-ARGOFFSET(%rsp),%rcx CFI_REGISTER rip,rcx RESTORE_ARGS 0,-ARG_SKIP,1 @@ -237,6 +257,7 @@ sysret_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc sysret_signal + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -247,6 +268,7 @@ sysret_careful: /* Handle a signal */ sysret_signal: + TRACE_IRQS_ON sti testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz 1f @@ -261,6 +283,7 @@ sysret_signal: /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. 
*/ cli + TRACE_IRQS_OFF jmp int_with_check badsys: @@ -309,6 +332,7 @@ ENTRY(int_ret_from_sys_call) CFI_REL_OFFSET r10,R10-ARGOFFSET CFI_REL_OFFSET r11,R11-ARGOFFSET cli + TRACE_IRQS_OFF testl $3,CS-ARGOFFSET(%rsp) je retint_restore_args movl $_TIF_ALLWORK_MASK,%edi @@ -327,6 +351,7 @@ int_with_check: int_careful: bt $TIF_NEED_RESCHED,%edx jnc int_very_careful + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -334,10 +359,12 @@ int_careful: popq %rdi CFI_ADJUST_CFA_OFFSET -8 cli + TRACE_IRQS_OFF jmp int_with_check /* handle signals and tracing -- both require a full stack frame */ int_very_careful: + TRACE_IRQS_ON sti SAVE_REST /* Check for syscall exit trace */ @@ -351,6 +378,7 @@ int_very_careful: CFI_ADJUST_CFA_OFFSET -8 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi cli + TRACE_IRQS_OFF jmp int_restore_rest int_signal: @@ -363,6 +391,7 @@ int_signal: int_restore_rest: RESTORE_REST cli + TRACE_IRQS_OFF jmp int_with_check CFI_ENDPROC END(int_ret_from_sys_call) @@ -484,6 +513,10 @@ END(stub_rt_sigreturn) swapgs 1: incl %gs:pda_irqcount # RED-PEN should check preempt count cmoveq %gs:pda_irqstackptr,%rsp + /* + * We entered an interrupt context - irqs are off: + */ + TRACE_IRQS_OFF call \func .endm @@ -493,6 +526,7 @@ ENTRY(common_interrupt) /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: cli + TRACE_IRQS_OFF decl %gs:pda_irqcount leaveq CFI_DEF_CFA_REGISTER rsp @@ -515,9 +549,21 @@ retint_check: CFI_REMEMBER_STATE jnz retint_careful retint_swapgs: + /* + * The iretq could re-enable interrupts: + */ + cli + TRACE_IRQS_IRETQ swapgs + jmp restore_args + retint_restore_args: cli + /* + * The iretq could re-enable interrupts: + */ + TRACE_IRQS_IRETQ +restore_args: RESTORE_ARGS 0,8,0 iret_label: iretq @@ -530,6 +576,7 @@ iret_label: /* running with kernel gs */ bad_iret: movq $11,%rdi /* SIGSEGV */ + TRACE_IRQS_ON sti jmp do_exit .previous @@ -539,6 +586,7 @@ retint_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc retint_signal + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -547,11 +595,13 @@ retint_careful: CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) cli + TRACE_IRQS_OFF jmp retint_check retint_signal: testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz retint_swapgs + TRACE_IRQS_ON sti SAVE_REST movq $-1,ORIG_RAX(%rsp) @@ -560,6 +610,7 @@ retint_signal: call do_notify_resume RESTORE_REST cli + TRACE_IRQS_OFF movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) jmp retint_check @@ -666,7 +717,7 @@ END(spurious_interrupt) /* error code is on the stack already */ /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym, ist=0 + .macro paranoidentry sym, ist=0, irqtrace=1 SAVE_ALL cld movl $1,%ebx @@ -691,8 +742,73 @@ END(spurious_interrupt) addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) .endif cli + .if \irqtrace + TRACE_IRQS_OFF + .endif .endm - + + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + .macro paranoidexit trace=1 + /* ebx: no swapgs flag */ +paranoid_exit\trace: + testl %ebx,%ebx /* swapgs needed? 
*/ + jnz paranoid_restore\trace + testl $3,CS(%rsp) + jnz paranoid_userspace\trace +paranoid_swapgs\trace: + TRACE_IRQS_IRETQ 0 + swapgs +paranoid_restore\trace: + RESTORE_ALL 8 + iretq +paranoid_userspace\trace: + GET_THREAD_INFO(%rcx) + movl threadinfo_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs\trace + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule\trace + movl %ebx,%edx /* arg3: thread flags */ + .if \trace + TRACE_IRQS_ON + .endif + sti + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace +paranoid_schedule\trace: + .if \trace + TRACE_IRQS_ON + .endif + sti + call schedule + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace + CFI_ENDPROC + .endm + /* * Exception entry point. This expects an error code/orig_rax on the stack * and the exception handler in %rax. @@ -748,6 +864,7 @@ error_exit: movl %ebx,%eax RESTORE_REST cli + TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax jne retint_kernel @@ -755,6 +872,10 @@ error_exit: movl $_TIF_WORK_MASK,%edi andl %edi,%edx jnz retint_careful + /* + * The iret might restore flags: + */ + TRACE_IRQS_IRETQ swapgs RESTORE_ARGS 0,8,0 jmp iret_label @@ -916,8 +1037,7 @@ KPROBE_ENTRY(debug) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK - jmp paranoid_exit - CFI_ENDPROC + paranoidexit END(debug) .previous .text @@ -926,49 +1046,13 @@ KPROBE_ENTRY(nmi) INTR_FRAME pushq $-1 CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_nmi - /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take - * any kernel state for granted. - * We don't do kernel preemption checks here, because only - * NMI should be common and it does not enable IRQs and - * cannot get reschedule ticks. - */ - /* ebx: no swapgs flag */ -paranoid_exit: - testl %ebx,%ebx /* swapgs needed? 
*/ - jnz paranoid_restore - testl $3,CS(%rsp) - jnz paranoid_userspace -paranoid_swapgs: - swapgs -paranoid_restore: - RESTORE_ALL 8 - iretq -paranoid_userspace: - GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule - movl %ebx,%edx /* arg3: thread flags */ - sti - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - cli - jmp paranoid_userspace -paranoid_schedule: - sti - call schedule - cli - jmp paranoid_userspace - CFI_ENDPROC + paranoidentry do_nmi, 0, 0 +#ifdef CONFIG_TRACE_IRQFLAGS + paranoidexit 0 +#else + jmp paranoid_exit1 + CFI_ENDPROC +#endif END(nmi) .previous .text @@ -977,7 +1061,7 @@ KPROBE_ENTRY(int3) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_int3, DEBUG_STACK - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(int3) .previous .text @@ -1006,7 +1090,7 @@ END(reserved) ENTRY(double_fault) XCPT_FRAME paranoidentry do_double_fault - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(double_fault) @@ -1022,7 +1106,7 @@ END(segment_not_present) ENTRY(stack_segment) XCPT_FRAME paranoidentry do_stack_segment - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(stack_segment) @@ -1050,7 +1134,7 @@ ENTRY(machine_check) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_machine_check - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(machine_check) #endif diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index a1f1df5..5221a53 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -177,8 +177,10 @@ asmlinkage void do_softirq(void) local_irq_save(flags); pending = local_softirq_pending(); /* Switch to interrupt stack */ - if (pending) + if (pending) { call_softirq(); + WARN_ON_ONCE(softirq_count()); + } local_irq_restore(flags); } EXPORT_SYMBOL(do_softirq); -- cgit v1.1 From 2148270cd2ebe0d05e4289b7c77b1435c45481bf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:57 -0700 Subject: [PATCH] lockdep: x86_64 early init x86_64 uses spinlocks very early - earlier than start_kernel(). So call lockdep_init() from the arch setup code. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/head64.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index e6a71c9..36647ce 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -85,6 +85,11 @@ void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); /* + * This must be called really, really early: + */ + lockdep_init(); + + /* * switch to init_level4_pgt from boot_level4_pgt */ memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t)); -- cgit v1.1 From 366c7f554e888e51b8395f9b07b273fe775c7ff3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:25 -0700 Subject: [PATCH] lockdep: annotate enable_in_hardirq() Make use of local_irq_enable_in_hardirq() API to annotate places that enable hardirqs in hardirq context. Has no effect on non-lockdep kernels. 
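(Illustrative sketch of the annotation pattern, not part of the patch - my_busy_handler is a hypothetical handler; the API itself is the local_irq_enable_in_hardirq() named above:)

	static irqreturn_t my_busy_handler(int irq, void *dev_id,
					   struct pt_regs *regs)
	{
		/*
		 * We knowingly enable interrupts while in hardirq context;
		 * the _in_hardirq variant tells lockdep this is intentional.
		 * On non-lockdep kernels it is plain local_irq_enable().
		 */
		local_irq_enable_in_hardirq();
		/* ... long-running work with interrupts enabled ... */
		return IRQ_HANDLED;
	}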
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/nmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 476c1472..5baa0c7 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -127,7 +127,7 @@ void __cpuinit nmi_watchdog_default(void) static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; - local_irq_enable(); + local_irq_enable_in_hardirq(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the -- cgit v1.1 From 60be6b9a41cb0da0df7a9f11486da56baebf04cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:26 -0700 Subject: [PATCH] lockdep: annotate on-stack completions lockdep needs to have the waitqueue lock initialized for on-stack waitqueues implicitly initialized by DECLARE_COMPLETION(). Annotate on-stack completions accordingly. Has no effect on non-lockdep kernels. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/smpboot.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 9705a6a..b7c7059 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -775,6 +775,8 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) }; DECLARE_WORK(work, do_fork_idle, &c_idle); + lockdep_set_class(&c_idle.done.wait.lock, &waitqueue_lock_key); + /* allocate memory for gdts of secondary cpus. Hotplug is considered */ if (!cpu_gdt_descr[cpu].address && !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { -- cgit v1.1 From 1a91023a9f172f820e292f2c5675fb9f8e2636f0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Jul 2006 04:43:49 -0700 Subject: [PATCH] x86_64: e820.c needs pgtable.h arch/x86_64/kernel/e820.c:42: error: 'MAXMEM' undeclared here (not in a function) Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/e820.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index b8eee4c..e56c2ad 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include -- cgit v1.1 From f86bf9b7bcc5d325687a8b80da8ee3eb56e02da7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Jul 2006 04:44:05 -0700 Subject: [PATCH] lockdep: clean up completion initializer in smpboot.c Clean up lockdep on-stack-completion initializer. (This also removes the dependency on waitqueue_lock_key.) 
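(Illustrative sketch of the resulting pattern, not part of the patch - struct boot_ctx is hypothetical:)

	struct boot_ctx {
		struct completion done;
	};

	static void example(void)
	{
		struct boot_ctx ctx = {
			/*
			 * The _ONSTACK variant gives this completion's
			 * waitqueue lock its own lockdep class, instead of
			 * sharing one static key with every other user:
			 */
			.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
		};

		/* ... hand &ctx to a worker, then: */
		wait_for_completion(&ctx.done);
	}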
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/smpboot.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index b7c7059..9753802 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -771,12 +771,10 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) unsigned long start_rip; struct create_idle c_idle = { .cpu = cpu, - .done = COMPLETION_INITIALIZER(c_idle.done), + .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; DECLARE_WORK(work, do_fork_idle, &c_idle); - lockdep_set_class(&c_idle.done.wait.lock, &waitqueue_lock_key); - /* allocate memory for gdts of secondary cpus. Hotplug is considered */ if (!cpu_gdt_descr[cpu].address && !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { -- cgit v1.1 From 1454aed92b6b89cb1fbe3cbecd8ceaa7a122f3b7 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 10 Jul 2006 04:44:05 -0700 Subject: [PATCH] put a comment at register_die_notifier that the export is used {un}register_die_notifier() is used by kdb... document this so that future "remove dead export" rounds can skip this export. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 79d05c4..eb39a27 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -76,13 +76,13 @@ int register_die_notifier(struct notifier_block *nb) vmalloc_sync_all(); return atomic_notifier_chain_register(&die_chain, nb); } -EXPORT_SYMBOL(register_die_notifier); +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ int unregister_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&die_chain, nb); } -EXPORT_SYMBOL(unregister_die_notifier); +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ static inline void conditional_sti(struct pt_regs *regs) { -- cgit v1.1 From 894673ee6122a3ce1958e1fe096901ba5356a96b Mon Sep 17 00:00:00 2001 From: Jon Smirl Date: Mon, 10 Jul 2006 04:44:13 -0700 Subject: [PATCH] tty: Remove include of screen_info.h from tty.h screen_info.h doesn't have anything to do with the tty layer and shouldn't be included by tty.h. This patches removes the include and modifies all users to directly include screen_info.h. struct screen_info is mainly used to communicate with the console drivers in drivers/video/console. Note that this patch touches every arch and I have no way of testing it. If there is a mistake the worst thing that will happen is a compile error. 
[akpm@osdl.org: fix arm build] [akpm@osdl.org: fix alpha build] Signed-off-by: Jon Smirl Signed-off-by: Antonino Daplas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/early_printk.c | 2 +- arch/x86_64/kernel/setup.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index b93ef5b..140051e 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0925518..8a099ff 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.1 From 0d2caebd562a20188a0d7f4e4f516b7ed69f319e Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Mon, 10 Jul 2006 17:06:09 +0200 Subject: [PATCH] x86_64: Fix hotplug problem in mce amd Signed-off-by: Jacob Shin Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 335200a..db2acbf 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -597,7 +597,7 @@ static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank) /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); - per_cpu(threshold_banks, i)[bank] = NULL; + per_cpu(threshold_banks, cpu)[bank] = NULL; return; } -- cgit v1.1 From aa0a9f373e3edb2c090f3fa0eb292712cfa97f81 Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Mon, 10 Jul 2006 17:06:15 +0200 Subject: [PATCH] x86_64: Fix Calgary copyright statements per IBM guidelines Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-calgary.c | 6 ++++-- arch/x86_64/kernel/tce.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index d91cb843..e71ed53 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -1,9 +1,11 @@ /* * Derived from arch/powerpc/kernel/iommu.c * - * Copyright (C) 2006 Jon Mason , IBM Corporation - * Copyright (C) 2006 Muli Ben-Yehuda , IBM Corporation + * Copyright (C) IBM Corporation, 2006 * + * Author: Jon Mason + * Author: Muli Ben-Yehuda + * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index 8d4c67f..d3a9e79 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -1,8 +1,10 @@ /* * Derived from arch/powerpc/platforms/pseries/iommu.c * - * Copyright (C) 2006 Jon Mason , IBM Corporation - * Copyright (C) 2006 Muli Ben-Yehuda , IBM Corporation + * Copyright (C) IBM Corporation, 2006 + * + * Author: Jon Mason + * Author: Muli Ben-Yehuda * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by -- cgit v1.1 From 
d5a2601734bcc740ee78dc4cb0c56b5687da7bd9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:42 +0200 Subject: [PATCH] i386/x86-64: Add user_mode checks to profile_pc for oprofile Fixes an obscure user-space-triggerable crash during oprofiling. Oprofile calls profile_pc from NMIs even when user_mode(regs) is not true and the program counter is inside the kernel lock section. This opens a race: when a user program jumps to a kernel lock address and an NMI happens before the illegal page fault exception is raised, and the program has an unmapped esp or ebp, then the kernel could oops. NMIs have a higher priority than exceptions, so that could happen. Add user_mode checks to i386/x86-64 profile_pc to prevent that. Cc: John Levon Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index b9ff759..e0341c6 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -193,7 +193,7 @@ unsigned long profile_pc(struct pt_regs *regs) is just accounted to the spinlock function. Better would be to write these functions in assembler again and check exactly. */ - if (in_lock_functions(pc)) { + if (!user_mode(regs) && in_lock_functions(pc)) { char *v = *(char **)regs->rsp; if ((v >= _stext && v <= _etext) || (v >= _sinittext && v <= _einittext) || -- cgit v1.1 From b13761ecd1d9977d2083da243e051e9f29097aef Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:51 +0200 Subject: [PATCH] x86_64: Dump leftover backtrace entries when dwarf2 unwinder got stuck The dwarf2 unwinder currently often gets stuck because a lot of assembly code doesn't have proper dwarf2 annotation yet. This often happens with __down. The proper fix is to add dwarf2 annotation to all inline assembly. However, until that's done we need a quick fix for 2.6.18 to avoid incomplete backtraces. So when this happens, dump the rest of the stack with the old unwinder instead of silently not dumping it. There was already an optional "both" mode that dumped both, but that was too ugly. I also clarified the headers for the different backtraces a bit. Also add a clear error message for missing dwarf2 annotation that people can work on. And I removed a dead variable left over from Ingo's changes.
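(For reference: after this patch the call_trace= boot parameter, as parsed by call_trace_setup() in the hunk below, selects:)

	call_trace=old		/* -1: old inexact backtrace only */
	call_trace=both		/*  0: dwarf2 plus the full inexact backtrace */
	call_trace=newfallback	/*  1: dwarf2, inexact fallback if it gets
				       stuck (the default) */
	call_trace=new		/*  2: dwarf2 only, no fallback */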
Cc: mingo@elte.hu Cc: jbeulich@novell.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index eb39a27..f7a9d14 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -254,7 +254,6 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s { const unsigned cpu = safe_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; - int i = 11; unsigned used = 0; printk("\nCall Trace:\n"); @@ -275,11 +274,20 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s if (unwind_init_blocked(&info, tsk) == 0) unw_ret = show_trace_unwind(&info, NULL); } - if (unw_ret > 0) { - if (call_trace > 0) + if (unw_ret > 0 && !arch_unw_user_mode(&info)) { +#ifdef CONFIG_STACK_UNWIND + unsigned long rip = info.regs.rip; + print_symbol("DWARF2 unwinder stuck at %s\n", rip); + if (call_trace == 1) { + printk("Leftover inexact backtrace:\n"); + stack = (unsigned long *)info.regs.rsp; + } else if (call_trace > 1) return; - printk("Legacy call trace:"); - i = 18; + else + printk("Full inexact backtrace again:\n"); +#else + printk("Inexact backtrace:\n"); +#endif } } @@ -1118,8 +1126,10 @@ static int __init call_trace_setup(char *s) call_trace = -1; else if (strcmp(s, "both") == 0) call_trace = 0; - else if (strcmp(s, "new") == 0) + else if (strcmp(s, "newfallback") == 0) call_trace = 1; + else if (strcmp(s, "new") == 0) + call_trace = 2; return 1; } __setup("call_trace=", call_trace_setup); -- cgit v1.1 From 0e5f61b00c577da698fb00cd9c91a96b79044dfd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Jul 2006 21:42:37 +0200 Subject: [PATCH] x86_64: On Intel systems when CPU has C3 don't use TSC On Intel systems generally the TSC stops in C3 or deeper, so don't use it there. Follows similar logic on i386. This should fix problems on Meroms. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/time.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index e0341c6..7a9b182 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -28,6 +28,7 @@ #include #ifdef CONFIG_ACPI #include /* for PM timer frequency */ +#include #endif #include #include @@ -953,11 +954,18 @@ __cpuinit int unsynchronized_tsc(void) #ifdef CONFIG_SMP if (apic_is_clustered_box()) return 1; - /* Intel systems are normally all synchronized. Exceptions - are handled in the check above. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return 0; #endif + /* Most intel systems have synchronized TSCs except for + multi node systems */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { +#ifdef CONFIG_ACPI + /* But TSC doesn't tick in C3 so don't use it there */ + if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100) + return 1; +#endif + return 0; + } + /* Assume multi socket systems are not synchronized */ return num_present_cpus() > 1; } -- cgit v1.1 From 089bbbcb36979166131868a89ca5f4e695d6637d Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Sat, 29 Jul 2006 21:42:40 +0200 Subject: [PATCH] x86_64: Calgary IOMMU - fix off by one error Fixed off-by-one error in detect_calgary and calgary_init which will cause arrays to overflow. 
Also, removed impossible to hit BUG_ON. Signed-off-by: Jon Mason Signed-off-by: Muli Ben-Yehuda Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-calgary.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index e71ed53..92744ab 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -812,7 +812,7 @@ static int __init calgary_init(void) int i, ret = -ENODEV; struct pci_dev *dev = NULL; - for (i = 0; i <= num_online_nodes() * MAX_NUM_OF_PHBS; i++) { + for (i = 0; i < num_online_nodes() * MAX_NUM_OF_PHBS; i++) { dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CALGARY, dev); @@ -890,9 +890,8 @@ void __init detect_calgary(void) specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); for (bus = 0, table_idx = 0; - bus <= num_online_nodes() * MAX_PHB_BUS_NUM; + bus < num_online_nodes() * MAX_PHB_BUS_NUM; bus++) { - BUG_ON(bus > MAX_NUMNODES * MAX_PHB_BUS_NUM); if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) continue; if (test_bit(bus, translation_disabled)) { @@ -1002,7 +1001,7 @@ static int __init calgary_parse_options(char *p) if (p == endp) break; - if (bridge <= (num_online_nodes() * MAX_PHB_BUS_NUM)) { + if (bridge < (num_online_nodes() * MAX_PHB_BUS_NUM)) { printk(KERN_INFO "Calgary: disabling " "translation for PHB 0x%x\n", bridge); set_bit(bridge, translation_disabled); -- cgit v1.1 From d2105b10fe0f460c388fe4e09226313f519d8c00 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Sat, 29 Jul 2006 21:42:43 +0200 Subject: [PATCH] x86_64: Calgary IOMMU - Multi-Node NULL pointer dereference fix Calgary hits a NULL pointer dereference when booting in a multi-chassis NUMA system. See Redhat bugzilla number 198498, found by Konrad Rzeszutek (konradr@redhat.com). There are many issues that had to be resolved to fix this problem. Firstly when I originally wrote the code to handle NUMA systems, I had a large misunderstanding that was not corrected until now. That was that I thought the "number of nodes online" referred to number of physical systems connected. So that if NUMA was disabled, there would only be 1 node and it would only show that node's PCI bus. In reality if NUMA is disabled, the system displays all of the connected chassis as one node but is only ignorant of the delays in accessing main memory. Therefore, references to num_online_nodes() and MAX_NUMNODES are incorrect and need to be set to the maximum number of nodes that can be accessed (which are 8). I created a variable, MAX_NUM_CHASSIS, and set it to 8 to fix this. Secondly, when walking the PCI in detect_calgary, the code only checked the first "slot" when looking to see if a device is present. This will work for most cases, but unfortunately it isn't always the case. In the NUMA MXE drawers, there are USB devices present on the 3rd slot (with slot 1 being empty). So, to work around this, all slots (up to 8) are scanned to see if there are any devices present. Lastly, the bus is being enumerated on large systems in a different way the we originally thought. This throws the ugly logic we had out the window. To more elegantly handle this, I reorganized the kva array to be sparse (which removed the need to have any bus number to kva slot logic in tce.c) and created a secondary space array to contain the bus number to phb mapping. 
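(A sketch of the resulting data layout, using the names from the hunks below - both tables are indexed directly by PCI bus number, with empty slots for buses that have no Calgary PHB behind them:)

	/* bus number -> TCE table kva, NULL for empty slots: */
	void *tce_table_kva[MAX_PHB_BUS_NUM];
	/* bus number -> PHB number (0-3) on its Calgary chip, -1 if none: */
	static char bus_to_phb[MAX_PHB_BUS_NUM];

	static inline int busno_to_phbid(unsigned char num)
	{
		return bus_to_phb[num];	/* direct lookup, no modulo games */
	}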
With these changes Calgary boots on an x460 with 4 nodes with and without NUMA enabled. Signed-off-by: Jon Mason Signed-off-by: Muli Ben-Yehuda Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-calgary.c | 76 +++++++++++++++++++++++----------------- arch/x86_64/kernel/tce.c | 4 +-- 2 files changed, 45 insertions(+), 35 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 92744ab..146924b 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -85,7 +85,8 @@ #define CSR_AGENT_MASK 0xffe0ffff #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ -#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * 2) /* max dev->bus->number */ +#define MAX_NUM_CHASSIS 8 /* max number of chassis */ +#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */ #define PHBS_PER_CALGARY 4 /* register offsets in Calgary's internal register space */ @@ -110,7 +111,8 @@ static const unsigned long phb_offsets[] = { 0xB000 /* PHB3 */ }; -void* tce_table_kva[MAX_NUM_OF_PHBS * MAX_NUMNODES]; +static char bus_to_phb[MAX_PHB_BUS_NUM]; +void* tce_table_kva[MAX_PHB_BUS_NUM]; unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; static int translate_empty_slots __read_mostly = 0; static int calgary_detected __read_mostly = 0; @@ -119,7 +121,7 @@ static int calgary_detected __read_mostly = 0; * the bitmap of PHBs the user requested that we disable * translation on. */ -static DECLARE_BITMAP(translation_disabled, MAX_NUMNODES * MAX_PHB_BUS_NUM); +static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM); static void tce_cache_blast(struct iommu_table *tbl); @@ -452,7 +454,7 @@ static struct dma_mapping_ops calgary_dma_ops = { static inline int busno_to_phbid(unsigned char num) { - return bus_to_phb(num) % PHBS_PER_CALGARY; + return bus_to_phb[num]; } static inline unsigned long split_queue_offset(unsigned char num) @@ -812,7 +814,7 @@ static int __init calgary_init(void) int i, ret = -ENODEV; struct pci_dev *dev = NULL; - for (i = 0; i < num_online_nodes() * MAX_NUM_OF_PHBS; i++) { + for (i = 0; i < MAX_PHB_BUS_NUM; i++) { dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CALGARY, dev); @@ -822,7 +824,7 @@ static int __init calgary_init(void) calgary_init_one_nontraslated(dev); continue; } - if (!tce_table_kva[i] && !translate_empty_slots) { + if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) { pci_dev_put(dev); continue; } @@ -842,7 +844,7 @@ error: pci_dev_put(dev); continue; } - if (!tce_table_kva[i] && !translate_empty_slots) + if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) continue; calgary_disable_translation(dev); calgary_free_tar(dev); @@ -876,9 +878,10 @@ static inline int __init determine_tce_table_size(u64 ram) void __init detect_calgary(void) { u32 val; - int bus, table_idx; + int bus; void *tbl; - int detected = 0; + int calgary_found = 0; + int phb = -1; /* * if the user specified iommu=off or iommu=soft or we found @@ -889,37 +892,46 @@ void __init detect_calgary(void) specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); - for (bus = 0, table_idx = 0; - bus < num_online_nodes() * MAX_PHB_BUS_NUM; - bus++) { + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { + int dev; + + tce_table_kva[bus] = NULL; + bus_to_phb[bus] = -1; + if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) continue; + + /* + * There are 4 PHBs per Calgary chip. 
Set phb to which phb (0-3) + * it is connected to releative to the clagary chip. + */ + phb = (phb + 1) % PHBS_PER_CALGARY; + if (test_bit(bus, translation_disabled)) { printk(KERN_INFO "Calgary: translation is disabled for " "PHB 0x%x\n", bus); /* skip this phb, don't allocate a tbl for it */ - tce_table_kva[table_idx] = NULL; - table_idx++; continue; } /* - * scan the first slot of the PCI bus to see if there - * are any devices present + * Scan the slots of the PCI bus to see if there is a device present. + * The parent bus will be the zero-ith device, so start at 1. */ - val = read_pci_config(bus, 1, 0, 0); - if (val != 0xffffffff || translate_empty_slots) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - detected = 1; - } else - tbl = NULL; - - tce_table_kva[table_idx] = tbl; - table_idx++; + for (dev = 1; dev < 8; dev++) { + val = read_pci_config(bus, dev, 0, 0); + if (val != 0xffffffff || translate_empty_slots) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + tce_table_kva[bus] = tbl; + bus_to_phb[bus] = phb; + calgary_found = 1; + break; + } + } } - if (detected) { + if (calgary_found) { iommu_detected = 1; calgary_detected = 1; printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " @@ -928,9 +940,9 @@ void __init detect_calgary(void) return; cleanup: - for (--table_idx; table_idx >= 0; --table_idx) - if (tce_table_kva[table_idx]) - free_tce_table(tce_table_kva[table_idx]); + for (--bus; bus >= 0; --bus) + if (tce_table_kva[bus]) + free_tce_table(tce_table_kva[bus]); } int __init calgary_iommu_init(void) @@ -1001,7 +1013,7 @@ static int __init calgary_parse_options(char *p) if (p == endp) break; - if (bridge < (num_online_nodes() * MAX_PHB_BUS_NUM)) { + if (bridge < MAX_PHB_BUS_NUM) { printk(KERN_INFO "Calgary: disabling " "translation for PHB 0x%x\n", bridge); set_bit(bridge, translation_disabled); diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index d3a9e79..5530dda 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -96,7 +96,6 @@ static inline unsigned int table_size_to_number_of_entries(unsigned char size) static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) { unsigned int bitmapsz; - unsigned int tce_table_index; unsigned long bmppages; int ret; @@ -105,8 +104,7 @@ static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) /* set the tce table size - measured in entries */ tbl->it_size = table_size_to_number_of_entries(specified_table_size); - tce_table_index = bus_to_phb(tbl->it_busno); - tbl->it_base = (unsigned long)tce_table_kva[tce_table_index]; + tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number]; if (!tbl->it_base) { printk(KERN_ERR "Calgary: iommu_table_setparms: " "no table allocated?!\n"); -- cgit v1.1 From 65f87d8a8a6e1b560c61951d0a68ed80f7c8ff19 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Jul 2006 21:42:49 +0200 Subject: [PATCH] x86_64: Fix swiotlb=force It was broken before. But having it is important as possible hardware bug workaround. And previously there was no way to force swiotlb if there is another IOMMU. Side effect is that iommu=force won't force swiotlb anymore even if there isn't another IOMMU. 
Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-swiotlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index ebdb77f..6a55f87 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -31,9 +31,10 @@ struct dma_mapping_ops swiotlb_dma_ops = { void pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if (!iommu_detected && !no_iommu && - (end_pfn > MAX_DMA32_PFN || force_iommu)) + if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) swiotlb = 1; + if (swiotlb_force) + swiotlb = 1; if (swiotlb) { printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); swiotlb_init(); -- cgit v1.1 From 2a8a3d5b65e86ec1dfef7d268c64a909eab94af7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 30 Jul 2006 03:03:20 -0700 Subject: [PATCH] machine_kexec.c: Fix the description of segment handling One of my original comments in machine_kexec was unclear and this should fix it. Signed-off-by: Eric W. Biederman Cc: Andi Kleen Acked-by: Horms Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/machine_kexec.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 83fb24a..106076b 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -207,14 +207,11 @@ NORET_TYPE void machine_kexec(struct kimage *image) __flush_tlb(); - /* The segment registers are funny things, they are - * automatically loaded from a table, in memory wherever you - * set them to a specific selector, but this table is never - * accessed again unless you set the segment to a different selector. - * - * The more common model are caches where the behide - * the scenes work is done, but is also dropped at arbitrary - * times. + /* The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the + * descriptor table in memory accessed. * * I take advantage of this here by force loading the * segments, before I zap the gdt with an invalid value. -- cgit v1.1 From cea6a4ba8acfba6f59cc9ed71e0d05cb770b9d9c Mon Sep 17 00:00:00 2001 From: Horms Date: Sun, 30 Jul 2006 03:03:34 -0700 Subject: [PATCH] panic_on_oops: remove ssleep() This patch is part of an effort to unify the panic_on_oops behaviour across all architectures that implement it. It was pointed out to me by Andi Kleen that if an oops has occurred in interrupt context, then calling ssleep() in the oops path will only cause a panic, and that it would be really better for it not to be in the path at all. This patch removes the ssleep() call and reworks the console message accordingly. I have a slight concern that the resulting console message is too long, feedback welcome. For powerpc it also unifies the 32bit and 64bit behaviour. For x86_64, this patch only updates the console message, as ssleep() is already not present.
Signed-off-by: Horms Acked-by: Paul Mackerras Cc: Russell King Cc: "Luck, Tony" Cc: Andi Kleen Cc: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index f7a9d14..4e9938d 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -529,7 +529,7 @@ void __kprobes oops_end(unsigned long flags) /* Nest count reaches zero, release the lock. */ spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) - panic("Oops"); + panic("Fatal exception: panic_on_oops"); } void __kprobes __die(const char * str, struct pt_regs * regs, long err) -- cgit v1.1 From be6b5a3505fa0cd54c3b5959a39293f47c648980 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Sun, 30 Jul 2006 03:03:37 -0700 Subject: [PATCH] cpu hotplug: use hotplug version of registration in late inits Use hotplug version of register_cpu_notifier in late init functions. Signed-off-by: Chandra Seetharaman Cc: "Luck, Tony" Cc: Martin Schwidefsky Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce.c | 12 +++++------- arch/x86_64/kernel/mce_amd.c | 19 +++++++------------ 2 files changed, 12 insertions(+), 19 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 8884567..4e017fb 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -615,7 +615,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) } #ifdef CONFIG_HOTPLUG_CPU -static __cpuinit void mce_remove_device(unsigned int cpu) +static void mce_remove_device(unsigned int cpu) { int i; @@ -626,10 +626,9 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); sysdev_unregister(&per_cpu(device_mce,cpu)); } -#endif /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int +static int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -638,18 +637,17 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_ONLINE: mce_create_device(cpu); break; -#ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: mce_remove_device(cpu); break; -#endif } return NOTIFY_OK; } -static struct notifier_block __cpuinitdata mce_cpu_notifier = { +static struct notifier_block mce_cpu_notifier = { .notifier_call = mce_cpu_callback, }; +#endif static __init int mce_init_device(void) { @@ -664,7 +662,7 @@ static __init int mce_init_device(void) mce_create_device(i); } - register_cpu_notifier(&mce_cpu_notifier); + register_hotcpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); return err; } diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index db2acbf..883fe74 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -558,7 +558,7 @@ out: * of shared sysfs dir/files, and rest of the cores will be symlinked to it. 

From be6b5a3505fa0cd54c3b5959a39293f47c648980 Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman
Date: Sun, 30 Jul 2006 03:03:37 -0700
Subject: [PATCH] cpu hotplug: use hotplug version of registration in late inits

Use hotplug version of register_cpu_notifier in late init functions.

Signed-off-by: Chandra Seetharaman
Cc: "Luck, Tony"
Cc: Martin Schwidefsky
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/mce.c     | 12 +++++-------
 arch/x86_64/kernel/mce_amd.c | 19 +++++++------------
 2 files changed, 12 insertions(+), 19 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 8884567..4e017fb 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -615,7 +615,7 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static __cpuinit void mce_remove_device(unsigned int cpu)
+static void mce_remove_device(unsigned int cpu)
 {
 	int i;
 
@@ -626,10 +626,9 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
 	sysdev_unregister(&per_cpu(device_mce,cpu));
 }
-#endif
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static __cpuinit int
+static int
 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
@@ -638,18 +637,17 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_ONLINE:
 		mce_create_device(cpu);
 		break;
-#ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
 		mce_remove_device(cpu);
 		break;
-#endif
 	}
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata mce_cpu_notifier = {
+static struct notifier_block mce_cpu_notifier = {
 	.notifier_call = mce_cpu_callback,
 };
+#endif
 
 static __init int mce_init_device(void)
 {
@@ -664,7 +662,7 @@ static __init int mce_init_device(void)
 		mce_create_device(i);
 	}
 
-	register_cpu_notifier(&mce_cpu_notifier);
+	register_hotcpu_notifier(&mce_cpu_notifier);
 	misc_register(&mce_log_device);
 	return err;
 }
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index db2acbf..883fe74 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -558,7 +558,7 @@ out:
  * of shared sysfs dir/files, and rest of the cores will be symlinked to it.
  */
-static __cpuinit void deallocate_threshold_block(unsigned int cpu,
+static void deallocate_threshold_block(unsigned int cpu,
 						 unsigned int bank)
 {
 	struct threshold_block *pos = NULL;
@@ -578,7 +578,7 @@ static __cpuinit void deallocate_threshold_block(unsigned int cpu,
 	per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
 }
 
-static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank)
+static void threshold_remove_bank(unsigned int cpu, int bank)
 {
 	int i = 0;
 	struct threshold_bank *b;
@@ -618,7 +618,7 @@ free_out:
 	per_cpu(threshold_banks, cpu)[bank] = NULL;
 }
 
-static __cpuinit void threshold_remove_device(unsigned int cpu)
+static void threshold_remove_device(unsigned int cpu)
 {
 	unsigned int bank;
 
@@ -629,14 +629,8 @@ static __cpuinit void threshold_remove_device(unsigned int cpu)
 	}
 }
 
-#else /* !CONFIG_HOTPLUG_CPU */
-static void threshold_remove_device(unsigned int cpu)
-{
-}
-#endif
-
 /* get notified when a cpu comes on/off */
-static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
+static int threshold_cpu_callback(struct notifier_block *nfb,
 					    unsigned long action, void *hcpu)
 {
 	/* cpu was unsigned int to begin with */
@@ -659,9 +653,10 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
+static struct notifier_block threshold_cpu_notifier = {
 	.notifier_call = threshold_cpu_callback,
 };
+#endif /* CONFIG_HOTPLUG_CPU */
 
 static __init int threshold_init_device(void)
 {
@@ -673,7 +668,7 @@ static __init int threshold_init_device(void)
 		if (err)
 			return err;
 	}
-	register_cpu_notifier(&threshold_cpu_notifier);
+	register_hotcpu_notifier(&threshold_cpu_notifier);
 	return 0;
 }
-- cgit v1.1
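
The conversion above follows one pattern in both files: the callback, the
notifier_block, and the teardown helpers all move under a single #ifdef
CONFIG_HOTPLUG_CPU block, and registration goes through
register_hotcpu_notifier(), which expands to register_cpu_notifier() on
hotplug-capable kernels and to a no-op otherwise. A minimal sketch of the
pattern follows; the foo_* names are invented for illustration, and the API
shown is the 2.6.18-era notifier interface:

	#include <linux/cpu.h>
	#include <linux/init.h>
	#include <linux/notifier.h>

	static void foo_create_device(unsigned int cpu) { /* per-cpu setup */ }

	#ifdef CONFIG_HOTPLUG_CPU
	static void foo_remove_device(unsigned int cpu) { /* per-cpu teardown */ }

	static int foo_cpu_callback(struct notifier_block *nfb,
				    unsigned long action, void *hcpu)
	{
		unsigned int cpu = (unsigned long)hcpu;

		switch (action) {
		case CPU_ONLINE:
			foo_create_device(cpu);
			break;
		case CPU_DEAD:
			foo_remove_device(cpu);
			break;
		}
		return NOTIFY_OK;
	}

	static struct notifier_block foo_cpu_notifier = {
		.notifier_call = foo_cpu_callback,
	};
	#endif /* CONFIG_HOTPLUG_CPU */

	static __init int foo_init(void)
	{
		int cpu;

		for_each_online_cpu(cpu)
			foo_create_device(cpu);
		/* Compiles away on !CONFIG_HOTPLUG_CPU kernels. */
		register_hotcpu_notifier(&foo_cpu_notifier);
		return 0;
	}
	late_initcall(foo_init);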

From 2699500b31f41fc25656c42548c8a388c8a329fe Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 2 Aug 2006 22:37:28 +0200
Subject: [PATCH] x86_64: Fix backtracing for interrupt stacks

Re-add backlink for old style unwinder to stack switching. Add proper
stack frame and CFI annotations to call_softirq. This prevents an oops
when backtracing with fallback through the interrupt stack top.

Suggested by Jan Beulich; Herbert Xu wanted it in 2.6.18.

Cc: jbeulich@novell.com
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/entry.S | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index d464dde..6f81042 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -513,6 +513,7 @@ END(stub_rt_sigreturn)
 	swapgs
 1:	incl	%gs:pda_irqcount	# RED-PEN should check preempt count
 	cmoveq %gs:pda_irqstackptr,%rsp
+	push    %rbp			# backlink for old unwinder
 	/*
 	 * We entered an interrupt context - irqs are off:
 	 */
@@ -1139,18 +1140,21 @@ ENTRY(machine_check)
 END(machine_check)
 #endif
 
+/* Call softirq on interrupt stack. Interrupts are off. */
 ENTRY(call_softirq)
 	CFI_STARTPROC
-	movq %gs:pda_irqstackptr,%rax
-	movq %rsp,%rdx
-	CFI_DEF_CFA_REGISTER	rdx
+	push %rbp
+	CFI_ADJUST_CFA_OFFSET	8
+	CFI_REL_OFFSET rbp,0
+	mov  %rsp,%rbp
+	CFI_DEF_CFA_REGISTER rbp
 	incl %gs:pda_irqcount
-	cmove %rax,%rsp
-	pushq %rdx
-	/*todo CFI_DEF_CFA_EXPRESSION ...*/
+	cmove %gs:pda_irqstackptr,%rsp
+	push  %rbp			# backlink for old unwinder
 	call __do_softirq
-	popq %rsp
+	leaveq
 	CFI_DEF_CFA_REGISTER	rsp
+	CFI_ADJUST_CFA_OFFSET	-8
 	decl %gs:pda_irqcount
 	ret
 	CFI_ENDPROC
-- cgit v1.1

From a166222cde740b34d97fe49dca70348197f4534e Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda
Date: Wed, 2 Aug 2006 22:37:31 +0200
Subject: [PATCH] x86_64: Fix CONFIG_IOMMU_DEBUG

If CONFIG_IOMMU_DEBUG is set, force_iommu defaults to 1. In the case
where no HW IOMMU is present in the machine and we end up using nommu,
leaving force_iommu set to 1 causes dma_alloc_coherent to do the wrong
thing. Therefore, if we end up using nommu, make sure force_iommu is 0.

Signed-off-by: Muli Ben-Yehuda
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/pci-nommu.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index c4c3cc3..aad7609 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -92,5 +92,7 @@ void __init no_iommu_init(void)
 {
 	if (dma_ops)
 		return;
+
+	force_iommu = 0; /* no HW IOMMU */
 	dma_ops = &nommu_dma_ops;
 }
-- cgit v1.1

From 825e037fb8912f38e9d9ddd3ec79fa7c584db708 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Sat, 5 Aug 2006 12:14:34 -0700
Subject: [PATCH] Fix more per-cpu typos

Signed-off-by: Alexey Dobriyan
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 5a1c0a3..06af6ca 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -203,7 +203,7 @@ int __cpuinit init_smp_flush(void)
 {
 	int i;
 	for_each_cpu_mask(i, cpu_possible_map) {
-		spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i));
+		spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
 	}
 	return 0;
 }
-- cgit v1.1
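
The one-liner above deserves a comment, because the two spellings parse very
differently: per_cpu() must be given the bare per-CPU variable name, with any
member access applied outside the macro; the old spelling only expanded by
accident of token pasting and is not guaranteed to work across per-CPU
implementations. A small sketch of the correct usage; foo_state here is a
hypothetical per-CPU structure, analogous to flush_state:

	#include <linux/cpumask.h>
	#include <linux/init.h>
	#include <linux/percpu.h>
	#include <linux/spinlock.h>

	/* Hypothetical per-CPU state, one instance per possible CPU. */
	struct foo_state {
		spinlock_t lock;
		unsigned long count;
	};

	static DEFINE_PER_CPU(struct foo_state, foo_state);

	static int __init foo_locks_init(void)
	{
		int cpu;

		for_each_possible_cpu(cpu) {
			/*
			 * Select this CPU's copy of the whole variable, then
			 * take the member.  Writing the member name inside
			 * the macro -- per_cpu(foo_state.lock, cpu) -- is
			 * the typo the patch above removes.
			 */
			spin_lock_init(&per_cpu(foo_state, cpu).lock);
		}
		return 0;
	}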

From 012c437d03cb299814e58ac8d574f7510f5989a5 Mon Sep 17 00:00:00 2001
From: Horms
Date: Sun, 13 Aug 2006 23:24:22 -0700
Subject: [PATCH] Change panic_on_oops message to "Fatal exception"

Previously the message was "Fatal exception: panic_on_oops", as
introduced in a recent patch which removed a somewhat dangerous call to
ssleep() in the panic_on_oops path. However, Paul Mackerras suggested
that this was somewhat confusing, leading people to believe that it was
panic_on_oops that was the root cause of the fatal exception. On his
suggestion, this patch changes the message to simply "Fatal exception".
A suitable oops message should already have been displayed.

Signed-off-by: Simon Horman
Cc: Paul Mackerras
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 arch/x86_64/kernel/traps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 4e9938d..14052f0 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -529,7 +529,7 @@ void __kprobes oops_end(unsigned long flags)
 	/* Nest count reaches zero, release the lock. */
 	spin_unlock_irqrestore(&die_lock, flags);
 	if (panic_on_oops)
-		panic("Fatal exception: panic_on_oops");
+		panic("Fatal exception");
 }
 
 void __kprobes __die(const char * str, struct pt_regs * regs, long err)
-- cgit v1.1

From 11012d419cfc0e0f78ca356aca03674217910124 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 30 Aug 2006 19:37:06 +0200
Subject: [PATCH] x86: Revert e820 MCFG heuristics

The check for the MCFG table being reserved in the e820 map was
originally added to detect a broken BIOS in a preproduction Intel SDV.
However it also breaks the Apple x86 Macs, which can't supply this
properly, but need a working MCFG. With the heuristic in place they
would not use MCFG and would not work.

After some discussion I think it's best to remove the heuristic again.
It also failed on some other boxes (although it didn't cause many
problems there because old style port access for PCI config space still
works as a fallback), but the preproduction SDVs can just use
pci=nommcfg. Supporting production machines properly is more important.

Edgar Hucek did all the debugging work.

Cc: Arjan van de Ven
Cc: Edgar Hucek
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/e820.c | 29 -----------------------------
 1 file changed, 29 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index e56c2ad..d81e3d6 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -104,35 +104,6 @@ e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
 	return 0;
 }
 
-/*
- * This function checks if the entire range is mapped with type.
- *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
- */
-int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
-{
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		if (type && ei->type != type)
-			continue;
-		/* is the region (part) in overlap with the current region ?*/
-		if (ei->addr >= end || ei->addr + ei->size <= start)
-			continue;
-
-		/* if the region is at the beginning of <start,end> we move
-		 * start to the end of the region since it's ok until there
-		 */
-		if (ei->addr <= start)
-			start = ei->addr + ei->size;
-		/* if start is now at or beyond end, we're done, full coverage */
-		if (start >= end)
-			return 1; /* we're done */
-	}
-	return 0;
-}
-
 /*
  * Find a free area in a specific range.
 */
-- cgit v1.1

From c05991ed12fd71e539dd8de8f5663450cd0c934c Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 30 Aug 2006 19:37:08 +0200
Subject: [PATCH] x86_64: Add kernel thread stack frame termination for
 properly stopping stack unwinds

One open question: Should these added pushes perhaps be made conditional
upon CONFIG_STACK_UNWIND or CONFIG_UNWIND_INFO?

[AK: Not needed -- these are all very slow paths]

Signed-off-by: Jan Beulich
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/entry.S | 3 +++
 arch/x86_64/kernel/head.S  | 1 +
 2 files changed, 4 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 6f81042..aa8d893 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -973,6 +973,8 @@ ENTRY(kernel_thread)
 ENDPROC(kernel_thread)
 
 child_rip:
+	pushq $0		# fake return address
+	CFI_STARTPROC
 	/*
 	 * Here we are in the child and the registers are set as they were
 	 * at kernel_thread() invocation in the parent.
@@ -983,6 +985,7 @@ child_rip:
 	# exit
 	xorl %edi, %edi
 	call do_exit
+	CFI_ENDPROC
 ENDPROC(child_rip)
 
 /*
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 6df05e6..c9739ca 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -191,6 +191,7 @@ startup_64:
 	 * jump
 	 */
 	movq	initial_code(%rip),%rax
+	pushq $0		# fake return address
 	jmp	*%rax
 
 	/* SMP bootup changes these two */
-- cgit v1.1

From ea424055b771a165c9abd3ae109255a3b825c745 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Wed, 30 Aug 2006 19:37:11 +0200
Subject: [PATCH] x86: Make backtracer fallback logic more bullet-proof

The unwinder fallback logic still had potential for falling through to
the legacy stack trace code without printing an indication (at once
serving as a separator) of this. Further, the stack pointer retrieval
for the fallback should be as restrictive as possible (in order to
avoid having the legacy stack tracer try to access invalid memory).
The patch tightens that, but this could certainly be further improved.

The call_trace command line option is also made conditional upon
CONFIG_STACK_UNWIND (as it's meaningless otherwise).

Signed-off-by: Jan Beulich
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/traps.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 14052f0..5e00af5 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -107,7 +107,11 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 }
 
 static int kstack_depth_to_print = 12;
+#ifdef CONFIG_STACK_UNWIND
 static int call_trace = 1;
+#else
+#define call_trace (-1)
+#endif
 #ifdef CONFIG_KALLSYMS
 # include <linux/kallsyms.h>
@@ -274,21 +278,21 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
 		if (unwind_init_blocked(&info, tsk) == 0)
 			unw_ret = show_trace_unwind(&info, NULL);
 	}
-	if (unw_ret > 0 && !arch_unw_user_mode(&info)) {
-#ifdef CONFIG_STACK_UNWIND
-		unsigned long rip = info.regs.rip;
-		print_symbol("DWARF2 unwinder stuck at %s\n", rip);
-		if (call_trace == 1) {
-			printk("Leftover inexact backtrace:\n");
-			stack = (unsigned long *)info.regs.rsp;
-		} else if (call_trace > 1)
+	if (unw_ret > 0) {
+		if (call_trace == 1 && !arch_unw_user_mode(&info)) {
+			print_symbol("DWARF2 unwinder stuck at %s\n",
+				     UNW_PC(&info));
+			if ((long)UNW_SP(&info) < 0) {
+				printk("Leftover inexact backtrace:\n");
+				stack = (unsigned long *)UNW_SP(&info);
+			} else
+				printk("Full inexact backtrace again:\n");
+		} else if (call_trace >= 1)
 			return;
 		else
 			printk("Full inexact backtrace again:\n");
-#else
+	} else
 		printk("Inexact backtrace:\n");
-#endif
-	}
 }
 
 /*
@@ -1120,6 +1124,7 @@ static int __init kstack_setup(char *s)
 }
 __setup("kstack=", kstack_setup);
 
+#ifdef CONFIG_STACK_UNWIND
 static int __init call_trace_setup(char *s)
 {
 	if (strcmp(s, "old") == 0)
@@ -1133,3 +1138,4 @@ static int __init call_trace_setup(char *s)
 	return 1;
 }
 __setup("call_trace=", call_trace_setup);
+#endif
-- cgit v1.1
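
Both hunks of the stack-frame-termination patch above plant a zero where a
return address would normally live, and that is the whole trick: a backtracer
walking frames keeps going until it reads a return address of 0 or leaves the
stack bounds, so the pushq $0 gives kernel threads and the boot path a
deterministic stopping point. Here is a sketch of how a frame-pointer walk
terminates on such a sentinel; it is an illustration assuming the conventional
%rbp chain, not the kernel's actual unwinder:

	#include <linux/kernel.h>

	/* Conventional x86-64 frame layout: saved %rbp, then return address. */
	struct stack_frame {
		struct stack_frame *next;	/* saved frame pointer */
		unsigned long ret_addr;		/* return address */
	};

	static void walk_frames(struct stack_frame *fp)
	{
		/* A zero return address -- the planted pushq $0 -- ends the walk. */
		while (fp && fp->ret_addr) {
			printk(" [<%016lx>]\n", fp->ret_addr);
			fp = fp->next;
		}
	}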

From ceee88223047749ad683d397b19904c3dfb6adeb Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 30 Aug 2006 19:37:12 +0200
Subject: [PATCH] x86_64: Recover 1MB of kernel memory

Noticed by Jan Beulich. When the kernel was moved from 1MB to 2MB in
2.6.17, the kernel reservation code wasn't adjusted and it still
reserved starting at 1MB. This means 1MB was always lost.

This patch fixes this by reserving only starting with _text.

Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/e820.c  | 6 +++++-
 arch/x86_64/kernel/setup.c | 6 ++----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index d81e3d6..764bf23 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -71,7 +71,11 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
 #endif
 	/* kernel code + 640k memory hole (later should not be needed, but
 	   be paranoid for now) */
-	if (last >= 640*1024 && addr < __pa_symbol(&_end)) {
+	if (last >= 640*1024 && addr < 1024*1024) {
+		*addrp = 1024*1024;
+		return 1;
+	}
+	if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
 		*addrp = __pa_symbol(&_end);
 		return 1;
 	}
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 8a099ff..34afad7 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -521,8 +521,6 @@ static void discover_ebda(void)
 
 void __init setup_arch(char **cmdline_p)
 {
-	unsigned long kernel_end;
-
 	ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
 	screen_info = SCREEN_INFO;
 	edid_info = EDID_INFO;
@@ -596,8 +594,8 @@ void __init setup_arch(char **cmdline_p)
 		       (table_end - table_start) << PAGE_SHIFT);
 
 	/* reserve kernel */
-	kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
-	reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
+	reserve_bootmem_generic(__pa_symbol(&_text),
+				__pa_symbol(&_end) - __pa_symbol(&_text));
 
 	/*
 	 * reserve physical page 0 - it's a special BIOS page on many boxes,
-- cgit v1.1

From 01ebb77b31149d847726a8847ad0d37631d7f049 Mon Sep 17 00:00:00 2001
From: Keith Owens
Date: Wed, 30 Aug 2006 19:37:19 +0200
Subject: [PATCH] x86_64: Save original IST values for checking stack addresses

The values in init_tss.ist[] can change when an IST event occurs. Save
the original IST values for checking stack addresses when debugging or
doing stack traces.

Signed-off-by: Keith Owens
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/init_task.c | 5 +++++
 arch/x86_64/kernel/setup64.c   | 3 ++-
 arch/x86_64/kernel/traps.c     | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
index ce31d90..3dc5854 100644
--- a/arch/x86_64/kernel/init_task.c
+++ b/arch/x86_64/kernel/init_task.c
@@ -46,4 +46,9 @@ EXPORT_SYMBOL(init_task);
  */
 DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
 
+/* Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+
 #define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 6fe58a6..417de56 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -189,6 +189,7 @@ void __cpuinit cpu_init (void)
 {
 	int cpu = stack_smp_processor_id();
 	struct tss_struct *t = &per_cpu(init_tss, cpu);
+	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
 	unsigned long v;
 	char *estacks = NULL;
 	struct task_struct *me;
@@ -256,7 +257,7 @@ void __cpuinit cpu_init (void)
 			estacks += EXCEPTION_STKSZ;
 			break;
 		}
-		t->ist[v] = (unsigned long)estacks;
+		orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
 	}
 
 	t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 5e00af5..b124977 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -178,7 +178,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
 			break;
 #endif
 		default:
-			end = per_cpu(init_tss, cpu).ist[k];
+			end = per_cpu(orig_ist, cpu).ist[k];
 			break;
 		}
 		/*
-- cgit v1.1

From dc104fb3231f11e95b5a0f09ae3ab27a8fd5b2e8 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 31 Aug 2006 19:05:56 -0400
Subject: [PATCH] audit: more syscall classes added

Signed-off-by: Al Viro
---
 arch/x86_64/kernel/audit.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/audit.c b/arch/x86_64/kernel/audit.c
index a067aa4..36840ac 100644
--- a/arch/x86_64/kernel/audit.c
+++ b/arch/x86_64/kernel/audit.c
@@ -8,6 +8,16 @@ static unsigned dir_class[] = {
 ~0U
 };
 
+static unsigned read_class[] = {
+#include <asm-generic/audit_read.h>
+~0U
+};
+
+static unsigned write_class[] = {
+#include <asm-generic/audit_write.h>
+~0U
+};
+
 static unsigned chattr_class[] = {
 #include <asm-generic/audit_change_attr.h>
 ~0U
 };
@@ -17,10 +27,16 @@ static int __init audit_classes_init(void)
 {
 #ifdef CONFIG_IA32_EMULATION
 	extern __u32 ia32_dir_class[];
+	extern __u32 ia32_write_class[];
+	extern __u32 ia32_read_class[];
 	extern __u32 ia32_chattr_class[];
+	audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
+	audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
 	audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
 	audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
 #endif
+	audit_register_class(AUDIT_CLASS_WRITE, write_class);
+	audit_register_class(AUDIT_CLASS_READ, read_class);
 	audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
 	audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
 	return 0;
-- cgit v1.1

From 55669bfa141b488be865341ed12e188967d11308 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 31 Aug 2006 19:26:40 -0400
Subject: [PATCH] audit: AUDIT_PERM support

add support for AUDIT_PERM predicate

Signed-off-by: Al Viro
---
 arch/x86_64/kernel/audit.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/audit.c b/arch/x86_64/kernel/audit.c
index 36840ac..21f3338 100644
--- a/arch/x86_64/kernel/audit.c
+++ b/arch/x86_64/kernel/audit.c
@@ -23,6 +23,25 @@ static unsigned chattr_class[] = {
 ~0U
 };
 
+int audit_classify_syscall(int abi, unsigned syscall)
+{
+#ifdef CONFIG_IA32_EMULATION
+	extern int ia32_classify_syscall(unsigned);
+	if (abi == AUDIT_ARCH_I386)
+		return ia32_classify_syscall(syscall);
+#endif
+	switch(syscall) {
+	case __NR_open:
+		return 2;
+	case __NR_openat:
+		return 3;
+	case __NR_execve:
+		return 5;
+	default:
+		return 0;
+	}
+}
+
 static int __init audit_classes_init(void)
 {
 #ifdef CONFIG_IA32_EMULATION
-- cgit v1.1
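
The small integers returned by audit_classify_syscall() above are class codes
consumed by the audit core when it evaluates an AUDIT_PERM filter: an
open-style class tells it which argument carries the access mode, and the
execve class marks the call as an exec. A hedged sketch of such a consumer
follows; access_mode_to_perm() is a hypothetical helper and the argument
layout is an assumption for illustration, not the audit core's exact code:

	#include <linux/audit.h>

	extern int audit_classify_syscall(int abi, unsigned syscall);
	/* Hypothetical: map an open(2) access mode to AUDIT_PERM_* bits. */
	extern int access_mode_to_perm(unsigned long mode);

	static int perm_bits_for(int abi, unsigned syscall, unsigned long *argv)
	{
		switch (audit_classify_syscall(abi, syscall)) {
		case 2:	/* open(path, flags, mode): access mode in argv[1] */
			return access_mode_to_perm(argv[1]);
		case 3:	/* openat(dfd, path, flags, mode): access mode in argv[2] */
			return access_mode_to_perm(argv[2]);
		case 5:	/* execve-style calls */
			return AUDIT_PERM_EXEC;
		default:	/* class 0: handled by generic per-class tables */
			return 0;
		}
	}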