From 3ac94932a2c859e8c57a8af091fa39334e1c3f23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:36 -0700 Subject: [PATCH] lockdep: beautify x86_64 stacktraces Beautify x86_64 stacktraces to be more readable. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/process.c | 2 +- arch/x86_64/kernel/traps.c | 70 ++++++++++++++++++++------------------------ 2 files changed, 32 insertions(+), 40 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index ca56e19..bb6745d 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -296,7 +296,7 @@ void __show_regs(struct pt_regs * regs) system_utsname.version); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); printk_address(regs->rip); - printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->rax, regs->rbx, regs->rcx); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 5a5311d..02dc155 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -110,28 +110,31 @@ static int kstack_depth_to_print = 12; static int call_trace = 1; #ifdef CONFIG_KALLSYMS -#include -int printk_address(unsigned long address) -{ +# include +void printk_address(unsigned long address) +{ unsigned long offset = 0, symsize; const char *symname; char *modname; - char *delim = ":"; + char *delim = ":"; char namebuf[128]; - symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); - if (!symname) - return printk("[<%016lx>]", address); - if (!modname) + symname = kallsyms_lookup(address, &symsize, &offset, + &modname, namebuf); + if (!symname) { + printk(" [<%016lx>]\n", address); + return; + } + if (!modname) modname = delim = ""; - return printk("<%016lx>{%s%s%s%s%+ld}", - address, delim, modname, delim, symname, offset); -} + printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", + address, delim, modname, delim, symname, offset, symsize); +} #else -int printk_address(unsigned long address) -{ - return printk("[<%016lx>]", address); -} +void printk_address(unsigned long address) +{ + printk(" [<%016lx>]\n", address); +} #endif static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, @@ -193,20 +196,14 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, static int show_trace_unwind(struct unwind_frame_info *info, void *context) { - int i = 11, n = 0; + int n = 0; while (unwind(info) == 0 && UNW_PC(info)) { - ++n; - if (i > 50) { - printk("\n "); - i = 7; - } else - i += printk(" "); - i += printk_address(UNW_PC(info)); + n++; + printk_address(UNW_PC(info)); if (arch_unw_user_mode(info)) break; } - printk("\n"); return n; } @@ -224,7 +221,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s int i = 11; unsigned used = 0; - printk("\nCall Trace:"); + printk("\nCall Trace:\n"); if (!tsk) tsk = current; @@ -254,12 +251,6 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s do while (cond) { \ unsigned long addr = *stack++; \ if (kernel_text_address(addr)) { \ - if (i > 50) { \ - printk("\n "); \ - i = 0; \ - } \ - else \ - i += printk(" "); \ /* \ * If the address is either in the text segment of the \ * kernel, or in the region which contains vmalloc'ed 
\ @@ -268,20 +259,20 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s * down the cause of the crash will be able to figure \ * out the call path that was taken. \ */ \ - i += printk_address(addr); \ + printk_address(addr); \ } \ } while (0) - for(; ; ) { + for( ; ; ) { const char *id; unsigned long *estack_end; estack_end = in_exception_stack(cpu, (unsigned long)stack, &used, &id); if (estack_end) { - i += printk(" <%s>", id); + printk(" <%s>", id); HANDLE_STACK (stack < estack_end); - i += printk(" "); + printk(" "); stack = (unsigned long *) estack_end[-2]; continue; } @@ -291,11 +282,11 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s (IRQSTACKSIZE - 64) / sizeof(*irqstack); if (stack >= irqstack && stack < irqstack_end) { - i += printk(" "); + printk(" "); HANDLE_STACK (stack < irqstack_end); stack = (unsigned long *) (irqstack_end[-1]); irqstack_end = NULL; - i += printk(" "); + printk(" "); continue; } } @@ -304,6 +295,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); #undef HANDLE_STACK + printk("\n"); } @@ -337,8 +329,8 @@ static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned break; } if (i && ((i % 4) == 0)) - printk("\n "); - printk("%016lx ", *stack++); + printk("\n"); + printk(" %016lx", *stack++); touch_nmi_watchdog(); } show_trace(tsk, regs, rsp); -- cgit v1.1 From c9ca1ba5bde45839cdc4f8ab93730cb68e6ee8a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:36 -0700 Subject: [PATCH] lockdep: x86_64 document stack frame internals Document stack frame nesting internals some more. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 61 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 02dc155..79d05c4 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -152,10 +152,22 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, }; unsigned k; + /* + * Iterate over all exception stacks, and figure out whether + * 'stack' is in one of them: + */ for (k = 0; k < N_EXCEPTION_STACKS; k++) { unsigned long end; + /* + * set 'end' to the end of the exception stack. + */ switch (k + 1) { + /* + * TODO: this block is not needed i think, because + * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK] + * properly too. + */ #if DEBUG_STKSZ > EXCEPTION_STKSZ case DEBUG_STACK: end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ; @@ -165,19 +177,43 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, end = per_cpu(init_tss, cpu).ist[k]; break; } + /* + * Is 'stack' above this exception frame's end? + * If yes then skip to the next frame. + */ if (stack >= end) continue; + /* + * Is 'stack' above this exception frame's start address? + * If yes then we found the right frame. + */ if (stack >= end - EXCEPTION_STKSZ) { + /* + * Make sure we only iterate through an exception + * stack once. 
If it comes up for the second time + * then there's something wrong going on - just + * break out and return NULL: + */ if (*usedp & (1U << k)) break; *usedp |= 1U << k; *idp = ids[k]; return (unsigned long *)end; } + /* + * If this is a debug stack, and if it has a larger size than + * the usual exception stacks, then 'stack' might still + * be within the lower portion of the debug stack: + */ #if DEBUG_STKSZ > EXCEPTION_STKSZ if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { unsigned j = N_EXCEPTION_STACKS - 1; + /* + * Black magic. A large debug stack is composed of + * multiple exception stack entries, which we + * iterate through now. Dont look: + */ do { ++j; end -= EXCEPTION_STKSZ; @@ -247,6 +283,11 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s } } + /* + * Print function call entries within a stack. 'cond' is the + * "end of stackframe" condition, that the 'stack++' + * iteration will eventually trigger. + */ #define HANDLE_STACK(cond) \ do while (cond) { \ unsigned long addr = *stack++; \ @@ -263,7 +304,12 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s } \ } while (0) - for( ; ; ) { + /* + * Print function call entries in all stacks, starting at the + * current stack address. If the stacks consist of nested + * exceptions + */ + for ( ; ; ) { const char *id; unsigned long *estack_end; estack_end = in_exception_stack(cpu, (unsigned long)stack, @@ -273,6 +319,11 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s printk(" <%s>", id); HANDLE_STACK (stack < estack_end); printk(" "); + /* + * We link to the next stack via the + * second-to-last pointer (index -2 to end) in the + * exception stack: + */ stack = (unsigned long *) estack_end[-2]; continue; } @@ -284,6 +335,11 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s if (stack >= irqstack && stack < irqstack_end) { printk(" "); HANDLE_STACK (stack < irqstack_end); + /* + * We link to the next stack (which would be + * the process stack normally) the last + * pointer (index -1 to end) in the IRQ stack: + */ stack = (unsigned long *) (irqstack_end[-1]); irqstack_end = NULL; printk(" "); @@ -293,6 +349,9 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s break; } + /* + * This prints the process stack: + */ HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); #undef HANDLE_STACK -- cgit v1.1 From 21b32bbff950771f196da91011249fa05fa83b32 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:40 -0700 Subject: [PATCH] lockdep: stacktrace subsystem, x86_64 support Framework to generate and save stacktraces quickly, without printing anything to the console. x86_64 support. 
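(Illustrative usage sketch, not part of the patch - it assumes the generic struct stack_trace declaration from the companion core patch, with the nr_entries/max_entries/entries fields that the code below manipulates:)

	static unsigned long entries[32];

	static void snapshot_current_stack(void)
	{
		struct stack_trace trace = {
			.nr_entries	= 0,	/* must start out empty */
			.max_entries	= 32,
			.entries	= entries,
		};

		/* current task, current context only, skip no frames: */
		save_stack_trace(&trace, current, 0, 0);
		/* entries[0 .. trace.nr_entries-1] now hold return addresses */
	}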
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/Makefile | 1 + arch/x86_64/kernel/stacktrace.c | 221 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 222 insertions(+) create mode 100644 arch/x86_64/kernel/stacktrace.c (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 819e84e..b5aaeaf 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -10,6 +10,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ pci-dma.o pci-nommu.o alternative.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c new file mode 100644 index 0000000..32cf55e --- /dev/null +++ b/arch/x86_64/kernel/stacktrace.c @@ -0,0 +1,221 @@ +/* + * arch/x86_64/kernel/stacktrace.c + * + * Stack trace management functions + * + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + */ +#include +#include + +#include + +static inline int +in_range(unsigned long start, unsigned long addr, unsigned long end) +{ + return addr >= start && addr <= end; +} + +static unsigned long +get_stack_end(struct task_struct *task, unsigned long stack) +{ + unsigned long stack_start, stack_end, flags; + int i, cpu; + + /* + * The most common case is that we are in the task stack: + */ + stack_start = (unsigned long)task->thread_info; + stack_end = stack_start + THREAD_SIZE; + + if (in_range(stack_start, stack, stack_end)) + return stack_end; + + /* + * We are in an interrupt if irqstackptr is set: + */ + raw_local_irq_save(flags); + cpu = safe_smp_processor_id(); + stack_end = (unsigned long)cpu_pda(cpu)->irqstackptr; + + if (stack_end) { + stack_start = stack_end & ~(IRQSTACKSIZE-1); + if (in_range(stack_start, stack, stack_end)) + goto out_restore; + /* + * We get here if we are in an IRQ context but we + * are also in an exception stack. + */ + } + + /* + * Iterate over all exception stacks, and figure out whether + * 'stack' is in one of them: + */ + for (i = 0; i < N_EXCEPTION_STACKS; i++) { + /* + * set 'end' to the end of the exception stack. + */ + stack_end = per_cpu(init_tss, cpu).ist[i]; + stack_start = stack_end - EXCEPTION_STKSZ; + + /* + * Is 'stack' above this exception frame's end? + * If yes then skip to the next frame. + */ + if (stack >= stack_end) + continue; + /* + * Is 'stack' above this exception frame's start address? + * If yes then we found the right frame. + */ + if (stack >= stack_start) + goto out_restore; + + /* + * If this is a debug stack, and if it has a larger size than + * the usual exception stacks, then 'stack' might still + * be within the lower portion of the debug stack: + */ +#if DEBUG_STKSZ > EXCEPTION_STKSZ + if (i == DEBUG_STACK - 1 && stack >= stack_end - DEBUG_STKSZ) { + /* + * Black magic. A large debug stack is composed of + * multiple exception stack entries, which we + * iterate through now. Dont look: + */ + do { + stack_end -= EXCEPTION_STKSZ; + stack_start -= EXCEPTION_STKSZ; + } while (stack < stack_start); + + goto out_restore; + } +#endif + } + /* + * Ok, 'stack' is not pointing to any of the system stacks. 
+ */ + stack_end = 0; + +out_restore: + raw_local_irq_restore(flags); + + return stack_end; +} + + +/* + * Save stack-backtrace addresses into a stack_trace buffer: + */ +static inline unsigned long +save_context_stack(struct stack_trace *trace, unsigned int skip, + unsigned long stack, unsigned long stack_end) +{ + unsigned long addr; + +#ifdef CONFIG_FRAME_POINTER + unsigned long prev_stack = 0; + + while (in_range(prev_stack, stack, stack_end)) { + pr_debug("stack: %p\n", (void *)stack); + addr = (unsigned long)(((unsigned long *)stack)[1]); + pr_debug("addr: %p\n", (void *)addr); + if (!skip) + trace->entries[trace->nr_entries++] = addr-1; + else + skip--; + if (trace->nr_entries >= trace->max_entries) + break; + if (!addr) + return 0; + /* + * Stack frames must go forwards (otherwise a loop could + * happen if the stackframe is corrupted), so we move + * prev_stack forwards: + */ + prev_stack = stack; + stack = (unsigned long)(((unsigned long *)stack)[0]); + } + pr_debug("invalid: %p\n", (void *)stack); +#else + while (stack < stack_end) { + addr = ((unsigned long *)stack)[0]; + stack += sizeof(long); + if (__kernel_text_address(addr)) { + if (!skip) + trace->entries[trace->nr_entries++] = addr-1; + else + skip--; + if (trace->nr_entries >= trace->max_entries) + break; + } + } +#endif + return stack; +} + +#define MAX_STACKS 10 + +/* + * Save stack-backtrace addresses into a stack_trace buffer. + * If all_contexts is set, all contexts (hardirq, softirq and process) + * are saved. If not set then only the current context is saved. + */ +void save_stack_trace(struct stack_trace *trace, + struct task_struct *task, int all_contexts, + unsigned int skip) +{ + unsigned long stack = (unsigned long)&stack; + int i, nr_stacks = 0, stacks_done[MAX_STACKS]; + + WARN_ON(trace->nr_entries || !trace->max_entries); + + if (!task) + task = current; + + pr_debug("task: %p, ti: %p\n", task, task->thread_info); + + if (!task || task == current) { + /* Grab rbp right from our regs: */ + asm ("mov %%rbp, %0" : "=r" (stack)); + pr_debug("rbp: %p\n", (void *)stack); + } else { + /* rbp is the last reg pushed by switch_to(): */ + stack = task->thread.rsp; + pr_debug("other task rsp: %p\n", (void *)stack); + stack = (unsigned long)(((unsigned long *)stack)[0]); + pr_debug("other task rbp: %p\n", (void *)stack); + } + + while (1) { + unsigned long stack_end = get_stack_end(task, stack); + + pr_debug("stack: %p\n", (void *)stack); + pr_debug("stack end: %p\n", (void *)stack_end); + + /* + * Invalid stack addres? + */ + if (!stack_end) + return; + /* + * Were we in this stack already? (recursion) + */ + for (i = 0; i < nr_stacks; i++) + if (stacks_done[i] == stack_end) + return; + stacks_done[nr_stacks] = stack_end; + + stack = save_context_stack(trace, skip, stack, stack_end); + if (!all_contexts || !stack || + trace->nr_entries >= trace->max_entries) + return; + trace->entries[trace->nr_entries++] = ULONG_MAX; + if (trace->nr_entries >= trace->max_entries) + return; + if (++nr_stacks >= MAX_STACKS) + return; + } +} + -- cgit v1.1 From 2601e64d262ee5ed4d4a5737345803800d9c4db3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:45 -0700 Subject: [PATCH] lockdep: irqtrace subsystem, x86_64 support Add irqflags-tracing support to x86_64. 
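(Illustrative note, not part of the patch: on a CONFIG_TRACE_IRQFLAGS kernel, the C-level local_irq_enable()/local_irq_disable() expand to roughly the pairing below, and the TRACE_IRQS_ON/TRACE_IRQS_OFF macros added here keep the assembly paths that use sti/cli directly in step with the same invariant - lockdep is told before interrupts get enabled and after they get disabled:)

	/* local_irq_enable(): */
	trace_hardirqs_on();		/* TRACE_IRQS_ON */
	raw_local_irq_enable();		/* sti */

	/* local_irq_disable(): */
	raw_local_irq_disable();	/* cli */
	trace_hardirqs_off();		/* TRACE_IRQS_OFF */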
[akpm@osdl.org: build fix] Signed-off-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/entry.S | 188 ++++++++++++++++++++++++++++++++------------- arch/x86_64/kernel/irq.c | 4 +- 2 files changed, 139 insertions(+), 53 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index ed92c29..d464dde 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -41,13 +41,24 @@ #include #include #include +#include .code64 #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif - + + +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET +#ifdef CONFIG_TRACE_IRQFLAGS + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON +1: +#endif +.endm + /* * C code is not supposed to know about undefined top of stack. Every time * a C function with an pt_regs argument is called from the SYSCALL based @@ -194,6 +205,10 @@ ENTRY(system_call) swapgs movq %rsp,%gs:pda_oldrsp movq %gs:pda_kernelstack,%rsp + /* + * No need to follow this irqs off/on section - it's straight + * and short: + */ sti SAVE_ARGS 8,1 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) @@ -219,10 +234,15 @@ ret_from_sys_call: sysret_check: GET_THREAD_INFO(%rcx) cli + TRACE_IRQS_OFF movl threadinfo_flags(%rcx),%edx andl %edi,%edx CFI_REMEMBER_STATE jnz sysret_careful + /* + * sysretq will re-enable interrupts: + */ + TRACE_IRQS_ON movq RIP-ARGOFFSET(%rsp),%rcx CFI_REGISTER rip,rcx RESTORE_ARGS 0,-ARG_SKIP,1 @@ -237,6 +257,7 @@ sysret_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc sysret_signal + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -247,6 +268,7 @@ sysret_careful: /* Handle a signal */ sysret_signal: + TRACE_IRQS_ON sti testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz 1f @@ -261,6 +283,7 @@ sysret_signal: /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. 
*/ cli + TRACE_IRQS_OFF jmp int_with_check badsys: @@ -309,6 +332,7 @@ ENTRY(int_ret_from_sys_call) CFI_REL_OFFSET r10,R10-ARGOFFSET CFI_REL_OFFSET r11,R11-ARGOFFSET cli + TRACE_IRQS_OFF testl $3,CS-ARGOFFSET(%rsp) je retint_restore_args movl $_TIF_ALLWORK_MASK,%edi @@ -327,6 +351,7 @@ int_with_check: int_careful: bt $TIF_NEED_RESCHED,%edx jnc int_very_careful + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -334,10 +359,12 @@ int_careful: popq %rdi CFI_ADJUST_CFA_OFFSET -8 cli + TRACE_IRQS_OFF jmp int_with_check /* handle signals and tracing -- both require a full stack frame */ int_very_careful: + TRACE_IRQS_ON sti SAVE_REST /* Check for syscall exit trace */ @@ -351,6 +378,7 @@ int_very_careful: CFI_ADJUST_CFA_OFFSET -8 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi cli + TRACE_IRQS_OFF jmp int_restore_rest int_signal: @@ -363,6 +391,7 @@ int_signal: int_restore_rest: RESTORE_REST cli + TRACE_IRQS_OFF jmp int_with_check CFI_ENDPROC END(int_ret_from_sys_call) @@ -484,6 +513,10 @@ END(stub_rt_sigreturn) swapgs 1: incl %gs:pda_irqcount # RED-PEN should check preempt count cmoveq %gs:pda_irqstackptr,%rsp + /* + * We entered an interrupt context - irqs are off: + */ + TRACE_IRQS_OFF call \func .endm @@ -493,6 +526,7 @@ ENTRY(common_interrupt) /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: cli + TRACE_IRQS_OFF decl %gs:pda_irqcount leaveq CFI_DEF_CFA_REGISTER rsp @@ -515,9 +549,21 @@ retint_check: CFI_REMEMBER_STATE jnz retint_careful retint_swapgs: + /* + * The iretq could re-enable interrupts: + */ + cli + TRACE_IRQS_IRETQ swapgs + jmp restore_args + retint_restore_args: cli + /* + * The iretq could re-enable interrupts: + */ + TRACE_IRQS_IRETQ +restore_args: RESTORE_ARGS 0,8,0 iret_label: iretq @@ -530,6 +576,7 @@ iret_label: /* running with kernel gs */ bad_iret: movq $11,%rdi /* SIGSEGV */ + TRACE_IRQS_ON sti jmp do_exit .previous @@ -539,6 +586,7 @@ retint_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc retint_signal + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -547,11 +595,13 @@ retint_careful: CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) cli + TRACE_IRQS_OFF jmp retint_check retint_signal: testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz retint_swapgs + TRACE_IRQS_ON sti SAVE_REST movq $-1,ORIG_RAX(%rsp) @@ -560,6 +610,7 @@ retint_signal: call do_notify_resume RESTORE_REST cli + TRACE_IRQS_OFF movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) jmp retint_check @@ -666,7 +717,7 @@ END(spurious_interrupt) /* error code is on the stack already */ /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym, ist=0 + .macro paranoidentry sym, ist=0, irqtrace=1 SAVE_ALL cld movl $1,%ebx @@ -691,8 +742,73 @@ END(spurious_interrupt) addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) .endif cli + .if \irqtrace + TRACE_IRQS_OFF + .endif .endm - + + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + .macro paranoidexit trace=1 + /* ebx: no swapgs flag */ +paranoid_exit\trace: + testl %ebx,%ebx /* swapgs needed? 
*/ + jnz paranoid_restore\trace + testl $3,CS(%rsp) + jnz paranoid_userspace\trace +paranoid_swapgs\trace: + TRACE_IRQS_IRETQ 0 + swapgs +paranoid_restore\trace: + RESTORE_ALL 8 + iretq +paranoid_userspace\trace: + GET_THREAD_INFO(%rcx) + movl threadinfo_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs\trace + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule\trace + movl %ebx,%edx /* arg3: thread flags */ + .if \trace + TRACE_IRQS_ON + .endif + sti + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace +paranoid_schedule\trace: + .if \trace + TRACE_IRQS_ON + .endif + sti + call schedule + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace + CFI_ENDPROC + .endm + /* * Exception entry point. This expects an error code/orig_rax on the stack * and the exception handler in %rax. @@ -748,6 +864,7 @@ error_exit: movl %ebx,%eax RESTORE_REST cli + TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax jne retint_kernel @@ -755,6 +872,10 @@ error_exit: movl $_TIF_WORK_MASK,%edi andl %edi,%edx jnz retint_careful + /* + * The iret might restore flags: + */ + TRACE_IRQS_IRETQ swapgs RESTORE_ARGS 0,8,0 jmp iret_label @@ -916,8 +1037,7 @@ KPROBE_ENTRY(debug) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK - jmp paranoid_exit - CFI_ENDPROC + paranoidexit END(debug) .previous .text @@ -926,49 +1046,13 @@ KPROBE_ENTRY(nmi) INTR_FRAME pushq $-1 CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_nmi - /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take - * any kernel state for granted. - * We don't do kernel preemption checks here, because only - * NMI should be common and it does not enable IRQs and - * cannot get reschedule ticks. - */ - /* ebx: no swapgs flag */ -paranoid_exit: - testl %ebx,%ebx /* swapgs needed? 
*/ - jnz paranoid_restore - testl $3,CS(%rsp) - jnz paranoid_userspace -paranoid_swapgs: - swapgs -paranoid_restore: - RESTORE_ALL 8 - iretq -paranoid_userspace: - GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule - movl %ebx,%edx /* arg3: thread flags */ - sti - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - cli - jmp paranoid_userspace -paranoid_schedule: - sti - call schedule - cli - jmp paranoid_userspace - CFI_ENDPROC + paranoidentry do_nmi, 0, 0 +#ifdef CONFIG_TRACE_IRQFLAGS + paranoidexit 0 +#else + jmp paranoid_exit1 + CFI_ENDPROC +#endif END(nmi) .previous .text @@ -977,7 +1061,7 @@ KPROBE_ENTRY(int3) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_int3, DEBUG_STACK - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(int3) .previous .text @@ -1006,7 +1090,7 @@ END(reserved) ENTRY(double_fault) XCPT_FRAME paranoidentry do_double_fault - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(double_fault) @@ -1022,7 +1106,7 @@ END(segment_not_present) ENTRY(stack_segment) XCPT_FRAME paranoidentry do_stack_segment - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(stack_segment) @@ -1050,7 +1134,7 @@ ENTRY(machine_check) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_machine_check - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC END(machine_check) #endif diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index a1f1df5..5221a53 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -177,8 +177,10 @@ asmlinkage void do_softirq(void) local_irq_save(flags); pending = local_softirq_pending(); /* Switch to interrupt stack */ - if (pending) + if (pending) { call_softirq(); + WARN_ON_ONCE(softirq_count()); + } local_irq_restore(flags); } EXPORT_SYMBOL(do_softirq); -- cgit v1.1 From 2148270cd2ebe0d05e4289b7c77b1435c45481bf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:57 -0700 Subject: [PATCH] lockdep: x86_64 early init x86_64 uses spinlocks very early - earlier than start_kernel(). So call lockdep_init() from the arch setup code. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/head64.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index e6a71c9..36647ce 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -85,6 +85,11 @@ void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); /* + * This must be called really, really early: + */ + lockdep_init(); + + /* * switch to init_level4_pgt from boot_level4_pgt */ memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t)); -- cgit v1.1 From 366c7f554e888e51b8395f9b07b273fe775c7ff3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:25 -0700 Subject: [PATCH] lockdep: annotate enable_in_hardirq() Make use of local_irq_enable_in_hardirq() API to annotate places that enable hardirqs in hardirq context. Has no effect on non-lockdep kernels. 
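(Illustrative sketch of the annotation pattern, not part of the patch - my_busy_handler is a hypothetical handler; the API itself is the local_irq_enable_in_hardirq() named above:)

	static irqreturn_t my_busy_handler(int irq, void *dev_id,
					   struct pt_regs *regs)
	{
		/*
		 * We knowingly enable interrupts while in hardirq context;
		 * the _in_hardirq variant tells lockdep this is intentional.
		 * On non-lockdep kernels it is plain local_irq_enable().
		 */
		local_irq_enable_in_hardirq();
		/* ... long-running work with interrupts enabled ... */
		return IRQ_HANDLED;
	}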
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/nmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 476c1472..5baa0c7 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -127,7 +127,7 @@ void __cpuinit nmi_watchdog_default(void) static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; - local_irq_enable(); + local_irq_enable_in_hardirq(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the -- cgit v1.1 From 60be6b9a41cb0da0df7a9f11486da56baebf04cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:25:26 -0700 Subject: [PATCH] lockdep: annotate on-stack completions lockdep needs to have the waitqueue lock initialized for on-stack waitqueues implicitly initialized by DECLARE_COMPLETION(). Annotate on-stack completions accordingly. Has no effect on non-lockdep kernels. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/smpboot.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 9705a6a..b7c7059 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -775,6 +775,8 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) }; DECLARE_WORK(work, do_fork_idle, &c_idle); + lockdep_set_class(&c_idle.done.wait.lock, &waitqueue_lock_key); + /* allocate memory for gdts of secondary cpus. Hotplug is considered */ if (!cpu_gdt_descr[cpu].address && !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { -- cgit v1.1 From 1a91023a9f172f820e292f2c5675fb9f8e2636f0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Jul 2006 04:43:49 -0700 Subject: [PATCH] x86_64: e820.c needs pgtable.h arch/x86_64/kernel/e820.c:42: error: 'MAXMEM' undeclared here (not in a function) Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/e820.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index b8eee4c..e56c2ad 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include -- cgit v1.1 From f86bf9b7bcc5d325687a8b80da8ee3eb56e02da7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Jul 2006 04:44:05 -0700 Subject: [PATCH] lockdep: clean up completion initializer in smpboot.c Clean up lockdep on-stack-completion initializer. (This also removes the dependency on waitqueue_lock_key.) 
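(Illustrative sketch of the resulting pattern, not part of the patch - struct boot_ctx is hypothetical:)

	struct boot_ctx {
		struct completion done;
	};

	static void example(void)
	{
		struct boot_ctx ctx = {
			/*
			 * The _ONSTACK variant gives this completion's
			 * waitqueue lock its own lockdep class, instead of
			 * sharing one static key with every other user:
			 */
			.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
		};

		/* ... hand &ctx to a worker, then: */
		wait_for_completion(&ctx.done);
	}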
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/smpboot.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index b7c7059..9753802 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -771,12 +771,10 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) unsigned long start_rip; struct create_idle c_idle = { .cpu = cpu, - .done = COMPLETION_INITIALIZER(c_idle.done), + .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; DECLARE_WORK(work, do_fork_idle, &c_idle); - lockdep_set_class(&c_idle.done.wait.lock, &waitqueue_lock_key); - /* allocate memory for gdts of secondary cpus. Hotplug is considered */ if (!cpu_gdt_descr[cpu].address && !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { -- cgit v1.1 From 1454aed92b6b89cb1fbe3cbecd8ceaa7a122f3b7 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 10 Jul 2006 04:44:05 -0700 Subject: [PATCH] put a comment at register_die_notifier that the export is used {un}register_die_notifier() is used by kdb... document this so that future "remove dead export" rounds can skip this export. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 79d05c4..eb39a27 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -76,13 +76,13 @@ int register_die_notifier(struct notifier_block *nb) vmalloc_sync_all(); return atomic_notifier_chain_register(&die_chain, nb); } -EXPORT_SYMBOL(register_die_notifier); +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ int unregister_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&die_chain, nb); } -EXPORT_SYMBOL(unregister_die_notifier); +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ static inline void conditional_sti(struct pt_regs *regs) { -- cgit v1.1 From 894673ee6122a3ce1958e1fe096901ba5356a96b Mon Sep 17 00:00:00 2001 From: Jon Smirl Date: Mon, 10 Jul 2006 04:44:13 -0700 Subject: [PATCH] tty: Remove include of screen_info.h from tty.h screen_info.h doesn't have anything to do with the tty layer and shouldn't be included by tty.h. This patches removes the include and modifies all users to directly include screen_info.h. struct screen_info is mainly used to communicate with the console drivers in drivers/video/console. Note that this patch touches every arch and I have no way of testing it. If there is a mistake the worst thing that will happen is a compile error. 
[akpm@osdl.org: fix arm build] [akpm@osdl.org: fix alpha build] Signed-off-by: Jon Smirl Signed-off-by: Antonino Daplas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/early_printk.c | 2 +- arch/x86_64/kernel/setup.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index b93ef5b..140051e 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0925518..8a099ff 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.1 From 0d2caebd562a20188a0d7f4e4f516b7ed69f319e Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Mon, 10 Jul 2006 17:06:09 +0200 Subject: [PATCH] x86_64: Fix hotplug problem in mce amd Signed-off-by: Jacob Shin Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 335200a..db2acbf 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -597,7 +597,7 @@ static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank) /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); - per_cpu(threshold_banks, i)[bank] = NULL; + per_cpu(threshold_banks, cpu)[bank] = NULL; return; } -- cgit v1.1 From aa0a9f373e3edb2c090f3fa0eb292712cfa97f81 Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Mon, 10 Jul 2006 17:06:15 +0200 Subject: [PATCH] x86_64: Fix Calgary copyright statements per IBM guidelines Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-calgary.c | 6 ++++-- arch/x86_64/kernel/tce.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index d91cb843..e71ed53 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -1,9 +1,11 @@ /* * Derived from arch/powerpc/kernel/iommu.c * - * Copyright (C) 2006 Jon Mason , IBM Corporation - * Copyright (C) 2006 Muli Ben-Yehuda , IBM Corporation + * Copyright (C) IBM Corporation, 2006 * + * Author: Jon Mason + * Author: Muli Ben-Yehuda + * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index 8d4c67f..d3a9e79 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -1,8 +1,10 @@ /* * Derived from arch/powerpc/platforms/pseries/iommu.c * - * Copyright (C) 2006 Jon Mason , IBM Corporation - * Copyright (C) 2006 Muli Ben-Yehuda , IBM Corporation + * Copyright (C) IBM Corporation, 2006 + * + * Author: Jon Mason + * Author: Muli Ben-Yehuda * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by -- cgit v1.1 From 
d5a2601734bcc740ee78dc4cb0c56b5687da7bd9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:42 +0200 Subject: [PATCH] i386/x86-64: Add user_mode checks to profile_pc for oprofile Fixes an obscure user-space-triggerable crash during oprofiling. Oprofile calls profile_pc from NMIs even when user_mode(regs) is not true and the program counter is inside the kernel lock section. This opens a race: when a user program jumps to a kernel lock address and an NMI happens before the illegal page fault exception is raised, and the program has an unmapped esp or ebp, then the kernel could oops. NMIs have a higher priority than exceptions, so that could happen. Add user_mode checks to i386/x86-64 profile_pc to prevent that. Cc: John Levon Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index b9ff759..e0341c6 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -193,7 +193,7 @@ unsigned long profile_pc(struct pt_regs *regs) is just accounted to the spinlock function. Better would be to write these functions in assembler again and check exactly. */ - if (in_lock_functions(pc)) { + if (!user_mode(regs) && in_lock_functions(pc)) { char *v = *(char **)regs->rsp; if ((v >= _stext && v <= _etext) || (v >= _sinittext && v <= _einittext) || -- cgit v1.1 From b13761ecd1d9977d2083da243e051e9f29097aef Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:51 +0200 Subject: [PATCH] x86_64: Dump leftover backtrace entries when dwarf2 unwinder got stuck The dwarf2 unwinder currently often gets stuck because a lot of assembly code doesn't have proper dwarf2 annotation yet. This often happens with __down. The proper fix is to add dwarf2 annotation to all inline assembly. However, until that's done we need a quick fix for 2.6.18 to avoid incomplete backtraces. So when this happens, dump the rest of the stack with the old unwinder instead of silently not dumping it. There was already an optional "both" mode that dumped both, but that was too ugly. I also clarified the headers for the different backtraces a bit. Also add a clear error message for missing dwarf2 annotation that people can work on. And I removed a dead variable left over from Ingo's changes.
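(For reference: after this patch the call_trace= boot parameter, as parsed by call_trace_setup() in the hunk below, selects:)

	call_trace=old		/* -1: old inexact backtrace only */
	call_trace=both		/*  0: dwarf2 plus the full inexact backtrace */
	call_trace=newfallback	/*  1: dwarf2, inexact fallback if it gets
				       stuck (the default) */
	call_trace=new		/*  2: dwarf2 only, no fallback */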
Cc: mingo@elte.hu Cc: jbeulich@novell.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index eb39a27..f7a9d14 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -254,7 +254,6 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s { const unsigned cpu = safe_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; - int i = 11; unsigned used = 0; printk("\nCall Trace:\n"); @@ -275,11 +274,20 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s if (unwind_init_blocked(&info, tsk) == 0) unw_ret = show_trace_unwind(&info, NULL); } - if (unw_ret > 0) { - if (call_trace > 0) + if (unw_ret > 0 && !arch_unw_user_mode(&info)) { +#ifdef CONFIG_STACK_UNWIND + unsigned long rip = info.regs.rip; + print_symbol("DWARF2 unwinder stuck at %s\n", rip); + if (call_trace == 1) { + printk("Leftover inexact backtrace:\n"); + stack = (unsigned long *)info.regs.rsp; + } else if (call_trace > 1) return; - printk("Legacy call trace:"); - i = 18; + else + printk("Full inexact backtrace again:\n"); +#else + printk("Inexact backtrace:\n"); +#endif } } @@ -1118,8 +1126,10 @@ static int __init call_trace_setup(char *s) call_trace = -1; else if (strcmp(s, "both") == 0) call_trace = 0; - else if (strcmp(s, "new") == 0) + else if (strcmp(s, "newfallback") == 0) call_trace = 1; + else if (strcmp(s, "new") == 0) + call_trace = 2; return 1; } __setup("call_trace=", call_trace_setup); -- cgit v1.1 From 0e5f61b00c577da698fb00cd9c91a96b79044dfd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Jul 2006 21:42:37 +0200 Subject: [PATCH] x86_64: On Intel systems when CPU has C3 don't use TSC On Intel systems generally the TSC stops in C3 or deeper, so don't use it there. Follows similar logic on i386. This should fix problems on Meroms. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/time.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index e0341c6..7a9b182 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -28,6 +28,7 @@ #include #ifdef CONFIG_ACPI #include /* for PM timer frequency */ +#include #endif #include #include @@ -953,11 +954,18 @@ __cpuinit int unsynchronized_tsc(void) #ifdef CONFIG_SMP if (apic_is_clustered_box()) return 1; - /* Intel systems are normally all synchronized. Exceptions - are handled in the check above. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return 0; #endif + /* Most intel systems have synchronized TSCs except for + multi node systems */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { +#ifdef CONFIG_ACPI + /* But TSC doesn't tick in C3 so don't use it there */ + if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100) + return 1; +#endif + return 0; + } + /* Assume multi socket systems are not synchronized */ return num_present_cpus() > 1; } -- cgit v1.1 From 089bbbcb36979166131868a89ca5f4e695d6637d Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Sat, 29 Jul 2006 21:42:40 +0200 Subject: [PATCH] x86_64: Calgary IOMMU - fix off by one error Fixed off-by-one error in detect_calgary and calgary_init which will cause arrays to overflow. 
Also, removed impossible to hit BUG_ON. Signed-off-by: Jon Mason Signed-off-by: Muli Ben-Yehuda Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-calgary.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index e71ed53..92744ab 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -812,7 +812,7 @@ static int __init calgary_init(void) int i, ret = -ENODEV; struct pci_dev *dev = NULL; - for (i = 0; i <= num_online_nodes() * MAX_NUM_OF_PHBS; i++) { + for (i = 0; i < num_online_nodes() * MAX_NUM_OF_PHBS; i++) { dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CALGARY, dev); @@ -890,9 +890,8 @@ void __init detect_calgary(void) specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); for (bus = 0, table_idx = 0; - bus <= num_online_nodes() * MAX_PHB_BUS_NUM; + bus < num_online_nodes() * MAX_PHB_BUS_NUM; bus++) { - BUG_ON(bus > MAX_NUMNODES * MAX_PHB_BUS_NUM); if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) continue; if (test_bit(bus, translation_disabled)) { @@ -1002,7 +1001,7 @@ static int __init calgary_parse_options(char *p) if (p == endp) break; - if (bridge <= (num_online_nodes() * MAX_PHB_BUS_NUM)) { + if (bridge < (num_online_nodes() * MAX_PHB_BUS_NUM)) { printk(KERN_INFO "Calgary: disabling " "translation for PHB 0x%x\n", bridge); set_bit(bridge, translation_disabled); -- cgit v1.1 From d2105b10fe0f460c388fe4e09226313f519d8c00 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Sat, 29 Jul 2006 21:42:43 +0200 Subject: [PATCH] x86_64: Calgary IOMMU - Multi-Node NULL pointer dereference fix Calgary hits a NULL pointer dereference when booting in a multi-chassis NUMA system. See Redhat bugzilla number 198498, found by Konrad Rzeszutek (konradr@redhat.com). There are many issues that had to be resolved to fix this problem. Firstly when I originally wrote the code to handle NUMA systems, I had a large misunderstanding that was not corrected until now. That was that I thought the "number of nodes online" referred to number of physical systems connected. So that if NUMA was disabled, there would only be 1 node and it would only show that node's PCI bus. In reality if NUMA is disabled, the system displays all of the connected chassis as one node but is only ignorant of the delays in accessing main memory. Therefore, references to num_online_nodes() and MAX_NUMNODES are incorrect and need to be set to the maximum number of nodes that can be accessed (which are 8). I created a variable, MAX_NUM_CHASSIS, and set it to 8 to fix this. Secondly, when walking the PCI in detect_calgary, the code only checked the first "slot" when looking to see if a device is present. This will work for most cases, but unfortunately it isn't always the case. In the NUMA MXE drawers, there are USB devices present on the 3rd slot (with slot 1 being empty). So, to work around this, all slots (up to 8) are scanned to see if there are any devices present. Lastly, the bus is being enumerated on large systems in a different way the we originally thought. This throws the ugly logic we had out the window. To more elegantly handle this, I reorganized the kva array to be sparse (which removed the need to have any bus number to kva slot logic in tce.c) and created a secondary space array to contain the bus number to phb mapping. 
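(A sketch of the resulting data layout, using the names from the hunks below - both tables are indexed directly by PCI bus number, with empty slots for buses that have no Calgary PHB behind them:)

	/* bus number -> TCE table kva, NULL for empty slots: */
	void *tce_table_kva[MAX_PHB_BUS_NUM];
	/* bus number -> PHB number (0-3) on its Calgary chip, -1 if none: */
	static char bus_to_phb[MAX_PHB_BUS_NUM];

	static inline int busno_to_phbid(unsigned char num)
	{
		return bus_to_phb[num];	/* direct lookup, no modulo games */
	}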
With these changes Calgary boots on an x460 with 4 nodes with and without NUMA enabled. Signed-off-by: Jon Mason Signed-off-by: Muli Ben-Yehuda Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-calgary.c | 76 +++++++++++++++++++++++----------------- arch/x86_64/kernel/tce.c | 4 +-- 2 files changed, 45 insertions(+), 35 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 92744ab..146924b 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -85,7 +85,8 @@ #define CSR_AGENT_MASK 0xffe0ffff #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ -#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * 2) /* max dev->bus->number */ +#define MAX_NUM_CHASSIS 8 /* max number of chassis */ +#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */ #define PHBS_PER_CALGARY 4 /* register offsets in Calgary's internal register space */ @@ -110,7 +111,8 @@ static const unsigned long phb_offsets[] = { 0xB000 /* PHB3 */ }; -void* tce_table_kva[MAX_NUM_OF_PHBS * MAX_NUMNODES]; +static char bus_to_phb[MAX_PHB_BUS_NUM]; +void* tce_table_kva[MAX_PHB_BUS_NUM]; unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; static int translate_empty_slots __read_mostly = 0; static int calgary_detected __read_mostly = 0; @@ -119,7 +121,7 @@ static int calgary_detected __read_mostly = 0; * the bitmap of PHBs the user requested that we disable * translation on. */ -static DECLARE_BITMAP(translation_disabled, MAX_NUMNODES * MAX_PHB_BUS_NUM); +static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM); static void tce_cache_blast(struct iommu_table *tbl); @@ -452,7 +454,7 @@ static struct dma_mapping_ops calgary_dma_ops = { static inline int busno_to_phbid(unsigned char num) { - return bus_to_phb(num) % PHBS_PER_CALGARY; + return bus_to_phb[num]; } static inline unsigned long split_queue_offset(unsigned char num) @@ -812,7 +814,7 @@ static int __init calgary_init(void) int i, ret = -ENODEV; struct pci_dev *dev = NULL; - for (i = 0; i < num_online_nodes() * MAX_NUM_OF_PHBS; i++) { + for (i = 0; i < MAX_PHB_BUS_NUM; i++) { dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CALGARY, dev); @@ -822,7 +824,7 @@ static int __init calgary_init(void) calgary_init_one_nontraslated(dev); continue; } - if (!tce_table_kva[i] && !translate_empty_slots) { + if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) { pci_dev_put(dev); continue; } @@ -842,7 +844,7 @@ error: pci_dev_put(dev); continue; } - if (!tce_table_kva[i] && !translate_empty_slots) + if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) continue; calgary_disable_translation(dev); calgary_free_tar(dev); @@ -876,9 +878,10 @@ static inline int __init determine_tce_table_size(u64 ram) void __init detect_calgary(void) { u32 val; - int bus, table_idx; + int bus; void *tbl; - int detected = 0; + int calgary_found = 0; + int phb = -1; /* * if the user specified iommu=off or iommu=soft or we found @@ -889,37 +892,46 @@ void __init detect_calgary(void) specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); - for (bus = 0, table_idx = 0; - bus < num_online_nodes() * MAX_PHB_BUS_NUM; - bus++) { + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { + int dev; + + tce_table_kva[bus] = NULL; + bus_to_phb[bus] = -1; + if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) continue; + + /* + * There are 4 PHBs per Calgary chip. 
Set phb to which phb (0-3) + * it is connected to releative to the clagary chip. + */ + phb = (phb + 1) % PHBS_PER_CALGARY; + if (test_bit(bus, translation_disabled)) { printk(KERN_INFO "Calgary: translation is disabled for " "PHB 0x%x\n", bus); /* skip this phb, don't allocate a tbl for it */ - tce_table_kva[table_idx] = NULL; - table_idx++; continue; } /* - * scan the first slot of the PCI bus to see if there - * are any devices present + * Scan the slots of the PCI bus to see if there is a device present. + * The parent bus will be the zero-ith device, so start at 1. */ - val = read_pci_config(bus, 1, 0, 0); - if (val != 0xffffffff || translate_empty_slots) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - detected = 1; - } else - tbl = NULL; - - tce_table_kva[table_idx] = tbl; - table_idx++; + for (dev = 1; dev < 8; dev++) { + val = read_pci_config(bus, dev, 0, 0); + if (val != 0xffffffff || translate_empty_slots) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + tce_table_kva[bus] = tbl; + bus_to_phb[bus] = phb; + calgary_found = 1; + break; + } + } } - if (detected) { + if (calgary_found) { iommu_detected = 1; calgary_detected = 1; printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " @@ -928,9 +940,9 @@ void __init detect_calgary(void) return; cleanup: - for (--table_idx; table_idx >= 0; --table_idx) - if (tce_table_kva[table_idx]) - free_tce_table(tce_table_kva[table_idx]); + for (--bus; bus >= 0; --bus) + if (tce_table_kva[bus]) + free_tce_table(tce_table_kva[bus]); } int __init calgary_iommu_init(void) @@ -1001,7 +1013,7 @@ static int __init calgary_parse_options(char *p) if (p == endp) break; - if (bridge < (num_online_nodes() * MAX_PHB_BUS_NUM)) { + if (bridge < MAX_PHB_BUS_NUM) { printk(KERN_INFO "Calgary: disabling " "translation for PHB 0x%x\n", bridge); set_bit(bridge, translation_disabled); diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index d3a9e79..5530dda 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -96,7 +96,6 @@ static inline unsigned int table_size_to_number_of_entries(unsigned char size) static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) { unsigned int bitmapsz; - unsigned int tce_table_index; unsigned long bmppages; int ret; @@ -105,8 +104,7 @@ static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) /* set the tce table size - measured in entries */ tbl->it_size = table_size_to_number_of_entries(specified_table_size); - tce_table_index = bus_to_phb(tbl->it_busno); - tbl->it_base = (unsigned long)tce_table_kva[tce_table_index]; + tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number]; if (!tbl->it_base) { printk(KERN_ERR "Calgary: iommu_table_setparms: " "no table allocated?!\n"); -- cgit v1.1 From 65f87d8a8a6e1b560c61951d0a68ed80f7c8ff19 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Jul 2006 21:42:49 +0200 Subject: [PATCH] x86_64: Fix swiotlb=force It was broken before. But having it is important as possible hardware bug workaround. And previously there was no way to force swiotlb if there is another IOMMU. Side effect is that iommu=force won't force swiotlb anymore even if there isn't another IOMMU. 
Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-swiotlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index ebdb77f..6a55f87 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -31,9 +31,10 @@ struct dma_mapping_ops swiotlb_dma_ops = { void pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if (!iommu_detected && !no_iommu && - (end_pfn > MAX_DMA32_PFN || force_iommu)) + if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) swiotlb = 1; + if (swiotlb_force) + swiotlb = 1; if (swiotlb) { printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); swiotlb_init(); -- cgit v1.1 From 2a8a3d5b65e86ec1dfef7d268c64a909eab94af7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 30 Jul 2006 03:03:20 -0700 Subject: [PATCH] machine_kexec.c: Fix the description of segment handling One of my original comments in machine_kexec was unclear and this should fix it. Signed-off-by: Eric W. Biederman Cc: Andi Kleen Acked-by: Horms Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/machine_kexec.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 83fb24a..106076b 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -207,14 +207,11 @@ NORET_TYPE void machine_kexec(struct kimage *image) __flush_tlb(); - /* The segment registers are funny things, they are - * automatically loaded from a table, in memory wherever you - * set them to a specific selector, but this table is never - * accessed again unless you set the segment to a different selector. - * - * The more common model are caches where the behide - * the scenes work is done, but is also dropped at arbitrary - * times. + /* The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the + * descriptor table in memory accessed. * * I take advantage of this here by force loading the * segments, before I zap the gdt with an invalid value. -- cgit v1.1 From cea6a4ba8acfba6f59cc9ed71e0d05cb770b9d9c Mon Sep 17 00:00:00 2001 From: Horms Date: Sun, 30 Jul 2006 03:03:34 -0700 Subject: [PATCH] panic_on_oops: remove ssleep() This patch is part of an effort to unify the panic_on_oops behaviour across all architectures that implement it. It was pointed out to me by Andi Kleen that if an oops has occurred in interrupt context, then calling ssleep() in the oops path will only cause a panic, and that it would be really better for it not to be in the path at all. This patch removes the ssleep() call and reworks the console message accordingly. I have a slight concern that the resulting console message is too long, feedback welcome. For powerpc it also unifies the 32bit and 64bit behaviour. For x86_64, this patch only updates the console message, as ssleep() is already not present.
Signed-off-by: Horms Acked-by: Paul Mackerras Cc: Russell King Cc: "Luck, Tony" Cc: Andi Kleen Cc: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index f7a9d14..4e9938d 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -529,7 +529,7 @@ void __kprobes oops_end(unsigned long flags) /* Nest count reaches zero, release the lock. */ spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) - panic("Oops"); + panic("Fatal exception: panic_on_oops"); } void __kprobes __die(const char * str, struct pt_regs * regs, long err) -- cgit v1.1 From be6b5a3505fa0cd54c3b5959a39293f47c648980 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Sun, 30 Jul 2006 03:03:37 -0700 Subject: [PATCH] cpu hotplug: use hotplug version of registration in late inits Use hotplug version of register_cpu_notifier in late init functions. Signed-off-by: Chandra Seetharaman Cc: "Luck, Tony" Cc: Martin Schwidefsky Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce.c | 12 +++++------- arch/x86_64/kernel/mce_amd.c | 19 +++++++------------ 2 files changed, 12 insertions(+), 19 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 8884567..4e017fb 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -615,7 +615,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) } #ifdef CONFIG_HOTPLUG_CPU -static __cpuinit void mce_remove_device(unsigned int cpu) +static void mce_remove_device(unsigned int cpu) { int i; @@ -626,10 +626,9 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); sysdev_unregister(&per_cpu(device_mce,cpu)); } -#endif /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int +static int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -638,18 +637,17 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_ONLINE: mce_create_device(cpu); break; -#ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: mce_remove_device(cpu); break; -#endif } return NOTIFY_OK; } -static struct notifier_block __cpuinitdata mce_cpu_notifier = { +static struct notifier_block mce_cpu_notifier = { .notifier_call = mce_cpu_callback, }; +#endif static __init int mce_init_device(void) { @@ -664,7 +662,7 @@ static __init int mce_init_device(void) mce_create_device(i); } - register_cpu_notifier(&mce_cpu_notifier); + register_hotcpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); return err; } diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index db2acbf..883fe74 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -558,7 +558,7 @@ out: * of shared sysfs dir/files, and rest of the cores will be symlinked to it. 

From be6b5a3505fa0cd54c3b5959a39293f47c648980 Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman
Date: Sun, 30 Jul 2006 03:03:37 -0700
Subject: [PATCH] cpu hotplug: use hotplug version of registration in late inits

Use hotplug version of register_cpu_notifier in late init functions.

Signed-off-by: Chandra Seetharaman
Cc: "Luck, Tony"
Cc: Martin Schwidefsky
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/mce.c     | 12 +++++-------
 arch/x86_64/kernel/mce_amd.c | 19 +++++++------------
 2 files changed, 12 insertions(+), 19 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 8884567..4e017fb 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -615,7 +615,7 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static __cpuinit void mce_remove_device(unsigned int cpu)
+static void mce_remove_device(unsigned int cpu)
 {
 	int i;
 
@@ -626,10 +626,9 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
 	sysdev_unregister(&per_cpu(device_mce,cpu));
 }
-#endif
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static __cpuinit int
+static int
 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
@@ -638,18 +637,17 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_ONLINE:
 		mce_create_device(cpu);
 		break;
-#ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
 		mce_remove_device(cpu);
 		break;
-#endif
 	}
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata mce_cpu_notifier = {
+static struct notifier_block mce_cpu_notifier = {
 	.notifier_call = mce_cpu_callback,
 };
+#endif
 
 static __init int mce_init_device(void)
 {
@@ -664,7 +662,7 @@ static __init int mce_init_device(void)
 		mce_create_device(i);
 	}
 
-	register_cpu_notifier(&mce_cpu_notifier);
+	register_hotcpu_notifier(&mce_cpu_notifier);
 	misc_register(&mce_log_device);
 	return err;
 }
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index db2acbf..883fe74 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -558,7 +558,7 @@ out:
  * of shared sysfs dir/files, and rest of the cores will be symlinked to it.
  */
-static __cpuinit void deallocate_threshold_block(unsigned int cpu,
+static void deallocate_threshold_block(unsigned int cpu,
 						 unsigned int bank)
 {
 	struct threshold_block *pos = NULL;
@@ -578,7 +578,7 @@ static __cpuinit void deallocate_threshold_block(unsigned int cpu,
 	per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
 }
 
-static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank)
+static void threshold_remove_bank(unsigned int cpu, int bank)
 {
 	int i = 0;
 	struct threshold_bank *b;
@@ -618,7 +618,7 @@ free_out:
 	per_cpu(threshold_banks, cpu)[bank] = NULL;
 }
 
-static __cpuinit void threshold_remove_device(unsigned int cpu)
+static void threshold_remove_device(unsigned int cpu)
 {
 	unsigned int bank;
 
@@ -629,14 +629,8 @@ static __cpuinit void threshold_remove_device(unsigned int cpu)
 	}
 }
 
-#else /* !CONFIG_HOTPLUG_CPU */
-static void threshold_remove_device(unsigned int cpu)
-{
-}
-#endif
-
 /* get notified when a cpu comes on/off */
-static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
+static int threshold_cpu_callback(struct notifier_block *nfb,
 					    unsigned long action, void *hcpu)
 {
 	/* cpu was unsigned int to begin with */
@@ -659,9 +653,10 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
+static struct notifier_block threshold_cpu_notifier = {
 	.notifier_call = threshold_cpu_callback,
 };
+#endif /* CONFIG_HOTPLUG_CPU */
 
 static __init int threshold_init_device(void)
 {
@@ -673,7 +668,7 @@ static __init int threshold_init_device(void)
 		if (err)
 			return err;
 	}
-	register_cpu_notifier(&threshold_cpu_notifier);
+	register_hotcpu_notifier(&threshold_cpu_notifier);
 	return 0;
 }
-- cgit v1.1
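
The conversion above follows one pattern in both files: the callback, the
notifier_block, and the teardown helpers all move under a single #ifdef
CONFIG_HOTPLUG_CPU block, and registration goes through
register_hotcpu_notifier(), which expands to register_cpu_notifier() on
hotplug-capable kernels and to a no-op otherwise. A minimal sketch of the
pattern follows; the foo_* names are invented for illustration, and the API
shown is the 2.6.18-era notifier interface:

	#include <linux/cpu.h>
	#include <linux/init.h>
	#include <linux/notifier.h>

	static void foo_create_device(unsigned int cpu) { /* per-cpu setup */ }

	#ifdef CONFIG_HOTPLUG_CPU
	static void foo_remove_device(unsigned int cpu) { /* per-cpu teardown */ }

	static int foo_cpu_callback(struct notifier_block *nfb,
				    unsigned long action, void *hcpu)
	{
		unsigned int cpu = (unsigned long)hcpu;

		switch (action) {
		case CPU_ONLINE:
			foo_create_device(cpu);
			break;
		case CPU_DEAD:
			foo_remove_device(cpu);
			break;
		}
		return NOTIFY_OK;
	}

	static struct notifier_block foo_cpu_notifier = {
		.notifier_call = foo_cpu_callback,
	};
	#endif /* CONFIG_HOTPLUG_CPU */

	static __init int foo_init(void)
	{
		int cpu;

		for_each_online_cpu(cpu)
			foo_create_device(cpu);
		/* Compiles away on !CONFIG_HOTPLUG_CPU kernels. */
		register_hotcpu_notifier(&foo_cpu_notifier);
		return 0;
	}
	late_initcall(foo_init);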

From 2699500b31f41fc25656c42548c8a388c8a329fe Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 2 Aug 2006 22:37:28 +0200
Subject: [PATCH] x86_64: Fix backtracing for interrupt stacks

Re-add backlink for old style unwinder to stack switching. Add proper
stack frame and CFI annotations to call_softirq. This prevents an oops
when backtracing with fallback through the interrupt stack top.

Suggested by Jan Beulich; Herbert Xu wanted it in 2.6.18.

Cc: jbeulich@novell.com
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/entry.S | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index d464dde..6f81042 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -513,6 +513,7 @@ END(stub_rt_sigreturn)
 	swapgs
 1:	incl	%gs:pda_irqcount	# RED-PEN should check preempt count
 	cmoveq %gs:pda_irqstackptr,%rsp
+	push    %rbp			# backlink for old unwinder
 	/*
 	 * We entered an interrupt context - irqs are off:
 	 */
@@ -1139,18 +1140,21 @@ ENTRY(machine_check)
 END(machine_check)
 #endif
 
+/* Call softirq on interrupt stack. Interrupts are off. */
 ENTRY(call_softirq)
 	CFI_STARTPROC
-	movq %gs:pda_irqstackptr,%rax
-	movq %rsp,%rdx
-	CFI_DEF_CFA_REGISTER	rdx
+	push %rbp
+	CFI_ADJUST_CFA_OFFSET	8
+	CFI_REL_OFFSET rbp,0
+	mov  %rsp,%rbp
+	CFI_DEF_CFA_REGISTER rbp
 	incl %gs:pda_irqcount
-	cmove %rax,%rsp
-	pushq %rdx
-	/*todo CFI_DEF_CFA_EXPRESSION ...*/
+	cmove %gs:pda_irqstackptr,%rsp
+	push  %rbp			# backlink for old unwinder
 	call __do_softirq
-	popq %rsp
+	leaveq
 	CFI_DEF_CFA_REGISTER	rsp
+	CFI_ADJUST_CFA_OFFSET	-8
 	decl %gs:pda_irqcount
 	ret
 	CFI_ENDPROC
-- cgit v1.1

From a166222cde740b34d97fe49dca70348197f4534e Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda
Date: Wed, 2 Aug 2006 22:37:31 +0200
Subject: [PATCH] x86_64: Fix CONFIG_IOMMU_DEBUG

If CONFIG_IOMMU_DEBUG is set, force_iommu defaults to 1. In the case
where no HW IOMMU is present in the machine and we end up using nommu,
leaving force_iommu set to 1 causes dma_alloc_coherent to do the wrong
thing. Therefore, if we end up using nommu, make sure force_iommu is 0.

Signed-off-by: Muli Ben-Yehuda
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/pci-nommu.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index c4c3cc3..aad7609 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -92,5 +92,7 @@ void __init no_iommu_init(void)
 {
 	if (dma_ops)
 		return;
+
+	force_iommu = 0; /* no HW IOMMU */
 	dma_ops = &nommu_dma_ops;
 }
-- cgit v1.1

From 825e037fb8912f38e9d9ddd3ec79fa7c584db708 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Sat, 5 Aug 2006 12:14:34 -0700
Subject: [PATCH] Fix more per-cpu typos

Signed-off-by: Alexey Dobriyan
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 5a1c0a3..06af6ca 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -203,7 +203,7 @@ int __cpuinit init_smp_flush(void)
 {
 	int i;
 	for_each_cpu_mask(i, cpu_possible_map) {
-		spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i));
+		spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
 	}
 	return 0;
 }
-- cgit v1.1
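
The one-liner above deserves a comment, because the two spellings parse very
differently: per_cpu() must be given the bare per-CPU variable name, with any
member access applied outside the macro; the old spelling only expanded by
accident of token pasting and is not guaranteed to work across per-CPU
implementations. A small sketch of the correct usage; foo_state here is a
hypothetical per-CPU structure, analogous to flush_state:

	#include <linux/cpumask.h>
	#include <linux/init.h>
	#include <linux/percpu.h>
	#include <linux/spinlock.h>

	/* Hypothetical per-CPU state, one instance per possible CPU. */
	struct foo_state {
		spinlock_t lock;
		unsigned long count;
	};

	static DEFINE_PER_CPU(struct foo_state, foo_state);

	static int __init foo_locks_init(void)
	{
		int cpu;

		for_each_possible_cpu(cpu) {
			/*
			 * Select this CPU's copy of the whole variable, then
			 * take the member.  Writing the member name inside
			 * the macro -- per_cpu(foo_state.lock, cpu) -- is
			 * the typo the patch above removes.
			 */
			spin_lock_init(&per_cpu(foo_state, cpu).lock);
		}
		return 0;
	}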

From 012c437d03cb299814e58ac8d574f7510f5989a5 Mon Sep 17 00:00:00 2001
From: Horms
Date: Sun, 13 Aug 2006 23:24:22 -0700
Subject: [PATCH] Change panic_on_oops message to "Fatal exception"

Previously the message was "Fatal exception: panic_on_oops", as
introduced in a recent patch which removed a somewhat dangerous call to
ssleep() in the panic_on_oops path. However, Paul Mackerras suggested
that this was somewhat confusing, leading people to believe that it was
panic_on_oops that was the root cause of the fatal exception. On his
suggestion, this patch changes the message to simply "Fatal exception".
A suitable oops message should already have been displayed.

Signed-off-by: Simon Horman
Cc: Paul Mackerras
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 arch/x86_64/kernel/traps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 4e9938d..14052f0 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -529,7 +529,7 @@ void __kprobes oops_end(unsigned long flags)
 	/* Nest count reaches zero, release the lock. */
 	spin_unlock_irqrestore(&die_lock, flags);
 	if (panic_on_oops)
-		panic("Fatal exception: panic_on_oops");
+		panic("Fatal exception");
 }
 
 void __kprobes __die(const char * str, struct pt_regs * regs, long err)
-- cgit v1.1

From 11012d419cfc0e0f78ca356aca03674217910124 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 30 Aug 2006 19:37:06 +0200
Subject: [PATCH] x86: Revert e820 MCFG heuristics

The check for the MCFG table being reserved in the e820 map was
originally added to detect a broken BIOS in a preproduction Intel SDV.
However it also breaks the Apple x86 Macs, which can't supply this
properly, but need a working MCFG. With the heuristic in place they
would not use MCFG and would not work.

After some discussion I think it's best to remove the heuristic again.
It also failed on some other boxes (although it didn't cause many
problems there because old style port access for PCI config space still
works as a fallback), but the preproduction SDVs can just use
pci=nommcfg. Supporting production machines properly is more important.

Edgar Hucek did all the debugging work.

Cc: Arjan van de Ven
Cc: Edgar Hucek
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/e820.c | 29 -----------------------------
 1 file changed, 29 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index e56c2ad..d81e3d6 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -104,35 +104,6 @@ e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
 	return 0;
 }
 
-/*
- * This function checks if the entire range is mapped with type.
- *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
- */
-int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
-{
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		if (type && ei->type != type)
-			continue;
-		/* is the region (part) in overlap with the current region ?*/
-		if (ei->addr >= end || ei->addr + ei->size <= start)
-			continue;
-
-		/* if the region is at the beginning of <start,end> we move
-		 * start to the end of the region since it's ok until there
-		 */
-		if (ei->addr <= start)
-			start = ei->addr + ei->size;
-		/* if start is now at or beyond end, we're done, full coverage */
-		if (start >= end)
-			return 1; /* we're done */
-	}
-	return 0;
-}
-
 /*
  * Find a free area in a specific range.
 */
-- cgit v1.1

From c05991ed12fd71e539dd8de8f5663450cd0c934c Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 30 Aug 2006 19:37:08 +0200
Subject: [PATCH] x86_64: Add kernel thread stack frame termination for
 properly stopping stack unwinds

One open question: Should these added pushes perhaps be made conditional
upon CONFIG_STACK_UNWIND or CONFIG_UNWIND_INFO?

[AK: Not needed -- these are all very slow paths]

Signed-off-by: Jan Beulich
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/entry.S | 3 +++
 arch/x86_64/kernel/head.S  | 1 +
 2 files changed, 4 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 6f81042..aa8d893 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -973,6 +973,8 @@ ENTRY(kernel_thread)
 ENDPROC(kernel_thread)
 
 child_rip:
+	pushq $0		# fake return address
+	CFI_STARTPROC
 	/*
 	 * Here we are in the child and the registers are set as they were
 	 * at kernel_thread() invocation in the parent.
@@ -983,6 +985,7 @@ child_rip:
 	# exit
 	xorl %edi, %edi
 	call do_exit
+	CFI_ENDPROC
 ENDPROC(child_rip)
 
 /*
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 6df05e6..c9739ca 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -191,6 +191,7 @@ startup_64:
 	 * jump
 	 */
 	movq	initial_code(%rip),%rax
+	pushq $0		# fake return address
 	jmp	*%rax
 
 	/* SMP bootup changes these two */
-- cgit v1.1

From ea424055b771a165c9abd3ae109255a3b825c745 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Wed, 30 Aug 2006 19:37:11 +0200
Subject: [PATCH] x86: Make backtracer fallback logic more bullet-proof

The unwinder fallback logic still had potential for falling through to
the legacy stack trace code without printing an indication (at once
serving as a separator) of this. Further, the stack pointer retrieval
for the fallback should be as restrictive as possible (in order to
avoid having the legacy stack tracer try to access invalid memory).
The patch tightens that, but this could certainly be further improved.

The call_trace command line option is also made conditional upon
CONFIG_STACK_UNWIND (as it's meaningless otherwise).

Signed-off-by: Jan Beulich
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/traps.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 14052f0..5e00af5 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -107,7 +107,11 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 }
 
 static int kstack_depth_to_print = 12;
+#ifdef CONFIG_STACK_UNWIND
 static int call_trace = 1;
+#else
+#define call_trace (-1)
+#endif
 #ifdef CONFIG_KALLSYMS
 # include <linux/kallsyms.h>
@@ -274,21 +278,21 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
 		if (unwind_init_blocked(&info, tsk) == 0)
 			unw_ret = show_trace_unwind(&info, NULL);
 	}
-	if (unw_ret > 0 && !arch_unw_user_mode(&info)) {
-#ifdef CONFIG_STACK_UNWIND
-		unsigned long rip = info.regs.rip;
-		print_symbol("DWARF2 unwinder stuck at %s\n", rip);
-		if (call_trace == 1) {
-			printk("Leftover inexact backtrace:\n");
-			stack = (unsigned long *)info.regs.rsp;
-		} else if (call_trace > 1)
+	if (unw_ret > 0) {
+		if (call_trace == 1 && !arch_unw_user_mode(&info)) {
+			print_symbol("DWARF2 unwinder stuck at %s\n",
+				     UNW_PC(&info));
+			if ((long)UNW_SP(&info) < 0) {
+				printk("Leftover inexact backtrace:\n");
+				stack = (unsigned long *)UNW_SP(&info);
+			} else
+				printk("Full inexact backtrace again:\n");
+		} else if (call_trace >= 1)
 			return;
 		else
 			printk("Full inexact backtrace again:\n");
-#else
+	} else
 		printk("Inexact backtrace:\n");
-#endif
-	}
 }
 
 /*
@@ -1120,6 +1124,7 @@ static int __init kstack_setup(char *s)
 }
 __setup("kstack=", kstack_setup);
 
+#ifdef CONFIG_STACK_UNWIND
 static int __init call_trace_setup(char *s)
 {
 	if (strcmp(s, "old") == 0)
@@ -1133,3 +1138,4 @@ static int __init call_trace_setup(char *s)
 	return 1;
 }
 __setup("call_trace=", call_trace_setup);
+#endif
-- cgit v1.1
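
Both hunks of the stack-frame-termination patch above plant a zero where a
return address would normally live, and that is the whole trick: a backtracer
walking frames keeps going until it reads a return address of 0 or leaves the
stack bounds, so the pushq $0 gives kernel threads and the boot path a
deterministic stopping point. Here is a sketch of how a frame-pointer walk
terminates on such a sentinel; it is an illustration assuming the conventional
%rbp chain, not the kernel's actual unwinder:

	#include <linux/kernel.h>

	/* Conventional x86-64 frame layout: saved %rbp, then return address. */
	struct stack_frame {
		struct stack_frame *next;	/* saved frame pointer */
		unsigned long ret_addr;		/* return address */
	};

	static void walk_frames(struct stack_frame *fp)
	{
		/* A zero return address -- the planted pushq $0 -- ends the walk. */
		while (fp && fp->ret_addr) {
			printk(" [<%016lx>]\n", fp->ret_addr);
			fp = fp->next;
		}
	}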

From ceee88223047749ad683d397b19904c3dfb6adeb Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 30 Aug 2006 19:37:12 +0200
Subject: [PATCH] x86_64: Recover 1MB of kernel memory

Noticed by Jan Beulich. When the kernel was moved from 1MB to 2MB in
2.6.17, the kernel reservation code wasn't adjusted and it still
reserved starting at 1MB. This means 1MB was always lost.

This patch fixes this by reserving only starting with _text.

Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/e820.c  | 6 +++++-
 arch/x86_64/kernel/setup.c | 6 ++----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index d81e3d6..764bf23 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -71,7 +71,11 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
 #endif
 	/* kernel code + 640k memory hole (later should not be needed, but
 	   be paranoid for now) */
-	if (last >= 640*1024 && addr < __pa_symbol(&_end)) {
+	if (last >= 640*1024 && addr < 1024*1024) {
+		*addrp = 1024*1024;
+		return 1;
+	}
+	if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
 		*addrp = __pa_symbol(&_end);
 		return 1;
 	}
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 8a099ff..34afad7 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -521,8 +521,6 @@ static void discover_ebda(void)
 
 void __init setup_arch(char **cmdline_p)
 {
-	unsigned long kernel_end;
-
 	ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
 	screen_info = SCREEN_INFO;
 	edid_info = EDID_INFO;
@@ -596,8 +594,8 @@ void __init setup_arch(char **cmdline_p)
 		       (table_end - table_start) << PAGE_SHIFT);
 
 	/* reserve kernel */
-	kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
-	reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
+	reserve_bootmem_generic(__pa_symbol(&_text),
+				__pa_symbol(&_end) - __pa_symbol(&_text));
 
 	/*
 	 * reserve physical page 0 - it's a special BIOS page on many boxes,
-- cgit v1.1

From 01ebb77b31149d847726a8847ad0d37631d7f049 Mon Sep 17 00:00:00 2001
From: Keith Owens
Date: Wed, 30 Aug 2006 19:37:19 +0200
Subject: [PATCH] x86_64: Save original IST values for checking stack addresses

The values in init_tss.ist[] can change when an IST event occurs. Save
the original IST values for checking stack addresses when debugging or
doing stack traces.

Signed-off-by: Keith Owens
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/init_task.c | 5 +++++
 arch/x86_64/kernel/setup64.c   | 3 ++-
 arch/x86_64/kernel/traps.c     | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
index ce31d90..3dc5854 100644
--- a/arch/x86_64/kernel/init_task.c
+++ b/arch/x86_64/kernel/init_task.c
@@ -46,4 +46,9 @@ EXPORT_SYMBOL(init_task);
  */
 DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
 
+/* Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+
 #define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 6fe58a6..417de56 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -189,6 +189,7 @@ void __cpuinit cpu_init (void)
 {
 	int cpu = stack_smp_processor_id();
 	struct tss_struct *t = &per_cpu(init_tss, cpu);
+	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
 	unsigned long v;
 	char *estacks = NULL;
 	struct task_struct *me;
@@ -256,7 +257,7 @@ void __cpuinit cpu_init (void)
 			estacks += EXCEPTION_STKSZ;
 			break;
 		}
-		t->ist[v] = (unsigned long)estacks;
+		orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
 	}
 
 	t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 5e00af5..b124977 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -178,7 +178,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
 			break;
 #endif
 		default:
-			end = per_cpu(init_tss, cpu).ist[k];
+			end = per_cpu(orig_ist, cpu).ist[k];
 			break;
 		}
 		/*
-- cgit v1.1

From dc104fb3231f11e95b5a0f09ae3ab27a8fd5b2e8 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 31 Aug 2006 19:05:56 -0400
Subject: [PATCH] audit: more syscall classes added

Signed-off-by: Al Viro
---
 arch/x86_64/kernel/audit.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/audit.c b/arch/x86_64/kernel/audit.c
index a067aa4..36840ac 100644
--- a/arch/x86_64/kernel/audit.c
+++ b/arch/x86_64/kernel/audit.c
@@ -8,6 +8,16 @@ static unsigned dir_class[] = {
 ~0U
 };
 
+static unsigned read_class[] = {
+#include <asm-generic/audit_read.h>
+~0U
+};
+
+static unsigned write_class[] = {
+#include <asm-generic/audit_write.h>
+~0U
+};
+
 static unsigned chattr_class[] = {
 #include <asm-generic/audit_change_attr.h>
 ~0U
 };
@@ -17,10 +27,16 @@ static int __init audit_classes_init(void)
 {
 #ifdef CONFIG_IA32_EMULATION
 	extern __u32 ia32_dir_class[];
+	extern __u32 ia32_write_class[];
+	extern __u32 ia32_read_class[];
 	extern __u32 ia32_chattr_class[];
+	audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
+	audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
 	audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
 	audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
 #endif
+	audit_register_class(AUDIT_CLASS_WRITE, write_class);
+	audit_register_class(AUDIT_CLASS_READ, read_class);
 	audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
 	audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
 	return 0;
-- cgit v1.1

From 55669bfa141b488be865341ed12e188967d11308 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 31 Aug 2006 19:26:40 -0400
Subject: [PATCH] audit: AUDIT_PERM support

add support for AUDIT_PERM predicate

Signed-off-by: Al Viro
---
 arch/x86_64/kernel/audit.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/audit.c b/arch/x86_64/kernel/audit.c
index 36840ac..21f3338 100644
--- a/arch/x86_64/kernel/audit.c
+++ b/arch/x86_64/kernel/audit.c
@@ -23,6 +23,25 @@ static unsigned chattr_class[] = {
 ~0U
 };
 
+int audit_classify_syscall(int abi, unsigned syscall)
+{
+#ifdef CONFIG_IA32_EMULATION
+	extern int ia32_classify_syscall(unsigned);
+	if (abi == AUDIT_ARCH_I386)
+		return ia32_classify_syscall(syscall);
+#endif
+	switch(syscall) {
+	case __NR_open:
+		return 2;
+	case __NR_openat:
+		return 3;
+	case __NR_execve:
+		return 5;
+	default:
+		return 0;
+	}
+}
+
 static int __init audit_classes_init(void)
 {
 #ifdef CONFIG_IA32_EMULATION
-- cgit v1.1
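
The small integers returned by audit_classify_syscall() above are class codes
consumed by the audit core when it evaluates an AUDIT_PERM filter: an
open-style class tells it which argument carries the access mode, and the
execve class marks the call as an exec. A hedged sketch of such a consumer
follows; access_mode_to_perm() is a hypothetical helper and the argument
layout is an assumption for illustration, not the audit core's exact code:

	#include <linux/audit.h>

	extern int audit_classify_syscall(int abi, unsigned syscall);
	/* Hypothetical: map an open(2) access mode to AUDIT_PERM_* bits. */
	extern int access_mode_to_perm(unsigned long mode);

	static int perm_bits_for(int abi, unsigned syscall, unsigned long *argv)
	{
		switch (audit_classify_syscall(abi, syscall)) {
		case 2:	/* open(path, flags, mode): access mode in argv[1] */
			return access_mode_to_perm(argv[1]);
		case 3:	/* openat(dfd, path, flags, mode): access mode in argv[2] */
			return access_mode_to_perm(argv[2]);
		case 5:	/* execve-style calls */
			return AUDIT_PERM_EXEC;
		default:	/* class 0: handled by generic per-class tables */
			return 0;
		}
	}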