From 1871853f7abc3c727c4346539c5062cbeaf016a4 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 1 Jul 2011 01:51:22 +0200
Subject: x86,64: Simplify save_regs()

The save_regs function that saves the registers on low level irq entry
is complicated because it changes its stack in the middle, and because
it manipulates data allocated in the caller's frame through accesses
computed directly from the callee's rsp value, with the return address
sitting in between. This complicates the static stack offset
calculations and requires more dynamic ones. It also needs a
save/restore of the function's return address.

To simplify and optimize this, turn save_regs() into a macro.

Signed-off-by: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: H. Peter Anvin
Cc: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: Jan Beulich
---
 arch/x86/kernel/entry_64.S | 44 +++++++++++++++++---------------------------
 1 file changed, 17 insertions(+), 27 deletions(-)

(limited to 'arch/x86/kernel/entry_64.S')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8a445a0..b6b2e85 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -297,27 +297,22 @@ ENDPROC(native_usergs_sysret64)
 	.endm
 
 /* save partial stack frame */
-	.pushsection .kprobes.text, "ax"
-ENTRY(save_args)
-	XCPT_FRAME
+	.macro SAVE_ARGS_IRQ
 	cld
-	/*
-	 * start from rbp in pt_regs and jump over
-	 * return address.
-	 */
-	movq_cfi rdi, RDI+8-RBP
-	movq_cfi rsi, RSI+8-RBP
-	movq_cfi rdx, RDX+8-RBP
-	movq_cfi rcx, RCX+8-RBP
-	movq_cfi rax, RAX+8-RBP
-	movq_cfi r8, R8+8-RBP
-	movq_cfi r9, R9+8-RBP
-	movq_cfi r10, R10+8-RBP
-	movq_cfi r11, R11+8-RBP
-
-	leaq -RBP+8(%rsp),%rdi	/* arg1 for handler */
-	movq_cfi rbp, 8		/* push %rbp */
-	leaq 8(%rsp), %rbp	/* mov %rsp, %ebp */
+	/* start from rbp in pt_regs and jump over */
+	movq_cfi rdi, RDI-RBP
+	movq_cfi rsi, RSI-RBP
+	movq_cfi rdx, RDX-RBP
+	movq_cfi rcx, RCX-RBP
+	movq_cfi rax, RAX-RBP
+	movq_cfi r8, R8-RBP
+	movq_cfi r9, R9-RBP
+	movq_cfi r10, R10-RBP
+	movq_cfi r11, R11-RBP
+
+	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
+	movq_cfi rbp, 0		/* push %rbp */
+	movq %rsp, %rbp
 	testl $3, CS(%rdi)
 	je 1f
 	SWAPGS
@@ -329,19 +324,14 @@ ENTRY(save_args)
 	 */
 1:	incl PER_CPU_VAR(irq_count)
 	jne 2f
-	popq_cfi %rax			/* move return address... */
 	mov PER_CPU_VAR(irq_stack_ptr),%rsp
 	EMPTY_FRAME 0
 	pushq_cfi %rbp			/* backlink for unwinder */
-	pushq_cfi %rax			/* ... to the new stack */
 	/*
 	 * We entered an interrupt context - irqs are off:
 	 */
 2:	TRACE_IRQS_OFF
-	ret
-	CFI_ENDPROC
-END(save_args)
-	.popsection
+	.endm
 
 ENTRY(save_rest)
 	PARTIAL_FRAME 1 REST_SKIP+8
@@ -791,7 +781,7 @@ END(interrupt)
 	/* reserve pt_regs for scratch regs and rbp */
 	subq $ORIG_RAX-RBP, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
-	call save_args
+	SAVE_ARGS_IRQ
 	PARTIAL_FRAME 0
 	call \func
 	.endm
--
cgit v1.1
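
The payoff of the macro conversion is easiest to see outside the kernel. Below
is a minimal standalone sketch, not the kernel's code: SAVE_SCRATCH,
SCRATCH_SIZE and save_scratch_fn are invented names and only the offset
arithmetic is meant to mirror what the patch changes. When the save code is
reached through a call, every offset into the caller-reserved area needs a +8
because the return address sits between that area and the callee's %rsp (the
old RDI+8-RBP pattern); expanded inline as a macro, the same stores use plain
offsets and nothing has to be popped before switching stacks.

/* toy_save_args.S - illustrative sketch only, not the kernel code.
 * Build: gcc -nostdlib -static -o toy toy_save_args.S
 * SAVE_SCRATCH, SCRATCH_SIZE and save_scratch_fn are invented names. */
        .set SCRATCH_SIZE, 9*8          /* room for the 9 scratch registers */

        /* Macro variant: expands inline, so %rsp still points at the area
         * the caller reserved and plain offsets are enough. */
        .macro SAVE_SCRATCH
        movq %rdi, 0*8(%rsp)
        movq %rsi, 1*8(%rsp)
        movq %rdx, 2*8(%rsp)
        movq %rcx, 3*8(%rsp)
        movq %rax, 4*8(%rsp)
        movq %r8,  5*8(%rsp)
        movq %r9,  6*8(%rsp)
        movq %r10, 7*8(%rsp)
        movq %r11, 8*8(%rsp)
        .endm

        .text
        /* Called variant: the call pushed a return address, so the same
         * slots sit 8 bytes further away from %rsp, hence the +8. */
save_scratch_fn:
        movq %rdi, 0*8+8(%rsp)
        movq %rsi, 1*8+8(%rsp)
        movq %rdx, 2*8+8(%rsp)
        ret

        .globl _start
_start:
        subq $SCRATCH_SIZE, %rsp        /* reserve the save area */
        SAVE_SCRATCH                    /* inline: plain offsets */
        call save_scratch_fn            /* called: +8 offsets needed */
        addq $SCRATCH_SIZE, %rsp

        movq $60, %rax                  /* exit(0) */
        xorq %rdi, %rdi
        syscall
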
From 3b99a3ef55b292180473a221f3d6bc24455f0632 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 1 Jul 2011 02:25:17 +0200
Subject: x86,64: Separate arg1 from rbp handling in SAVE_REGS_IRQ

Just for clarity in the code: have a first block that handles the frame
pointer and a separate one that handles the pt_regs pointer and its use.

Signed-off-by: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: H. Peter Anvin
Cc: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: Jan Beulich
---
 arch/x86/kernel/entry_64.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel/entry_64.S')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b6b2e85..20dc8e6 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -310,9 +310,10 @@ ENDPROC(native_usergs_sysret64)
 	movq_cfi r10, R10-RBP
 	movq_cfi r11, R11-RBP
 
-	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
 	movq_cfi rbp, 0		/* push %rbp */
 	movq %rsp, %rbp
+
+	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
 	testl $3, CS(%rdi)
 	je 1f
 	SWAPGS
--
cgit v1.1

From 48ffee7d9e6df51b4957bed64115b7beed671374 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sat, 2 Jul 2011 15:03:44 +0200
Subject: x86: Remove useless unwinder backlink from irq regs saving

The unwinder backlink in the interrupt entry code is useless: it is not
part of the stack frame chain and is thus never used.

Signed-off-by: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: H. Peter Anvin
Cc: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: Jan Beulich
---
 arch/x86/kernel/entry_64.S | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel/entry_64.S')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 20dc8e6..6131432 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -327,7 +327,6 @@ ENDPROC(native_usergs_sysret64)
 	jne 2f
 	mov PER_CPU_VAR(irq_stack_ptr),%rsp
 	EMPTY_FRAME 0
-	pushq_cfi %rbp			/* backlink for unwinder */
 	/*
 	 * We entered an interrupt context - irqs are off:
 	 */
--
cgit v1.1
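
For reference, the frame chain that these patches try not to corrupt is built
by the usual pushq %rbp / movq %rsp, %rbp pair: 0(%rbp) holds the caller's
saved %rbp and 8(%rbp) holds a return address, which is what lets an unwinder
hop from frame to frame. A value merely pushed on a new stack, like the
backlink removed above, has no return address next to it and is invisible to
that walk. The sketch below, a plain userspace binary with invented function
names, only illustrates that layout.

/* frame_chain.S - sketch of the rbp frame chain an unwinder walks.
 * Build: gcc -nostdlib -static -o frame_chain frame_chain.S
 * The function names are invented for this example. */
        .text
leaf:
        pushq %rbp              /* 0(%rbp) -> caller's saved %rbp        */
        movq %rsp, %rbp         /* 8(%rbp) -> return address into caller */
        /* ... an unwinder can follow %rbp -> saved %rbp -> ... here ... */
        popq %rbp
        ret

caller:
        pushq %rbp
        movq %rsp, %rbp
        call leaf               /* the call pushes the return address
                                 * that the chain depends on */
        popq %rbp
        ret

        .globl _start
_start:
        call caller
        movq $60, %rax          /* exit(0) */
        xorq %rdi, %rdi
        syscall
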
From a2bbe75089d5eb9a3a46d50dd5c215e213790288 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sat, 2 Jul 2011 16:52:45 +0200
Subject: x86: Don't use frame pointer to save old stack on irq entry

rbp is used in SAVE_ARGS_IRQ to save the old stack pointer so that it
can be restored later in ret_from_intr. It is convenient because we
save its value in the irq regs and it is easily restored using the
leave instruction.

However this is an abuse of the frame pointer, whose role is to help
unwind the kernel by chaining frames together, each node following the
return address to the previous frame.

Although we are breaking the frame by changing the stack pointer, there
is no preceding return address before the new frame. Hence using the
frame pointer to link the two stacks breaks the stack unwinders, which
find a random value instead of a return address here.

There is no workaround that can work in every case. We use the
fixup_bp_irq_link() function to dereference that abused frame pointer
in the case of non-nesting interrupts (i.e. when the stack changed).
But that doesn't fix the case of interrupts that don't change the stack
(while we still have the unconditional frame link), which is the case
of a hardirq interrupting a softirq. We have no way to detect this
transition, so the irq frame link is treated as a real frame pointer
and the return address is dereferenced, but it is still a spurious one.

There are two possible results of this: either the spurious return
address, a random stack value, happens to fall inside the kernel text,
in which case unwinding continues and we simply get a weird entry in
the stack trace; or it falls outside the kernel text and unwinding
stops there.

This is the reason why stacktraces (including perf callchains) on irqs
that interrupted softirqs don't work very well.

To solve this, we don't save the old stack pointer in rbp anymore.
Instead we save it in a scratch register, push that value on the new
stack and pop it back later on irq return. This preserves the whole
frame chain without spurious return addresses in the middle and drops
the need for the horrid fixup_bp_irq_link() workaround.

And finally, irqs that interrupt a softirq are sanely unwound.

Before:

    99.81%         perf  [kernel.kallsyms]  [k] perf_pending_event
                   |
                   --- perf_pending_event
                       irq_work_run
                       smp_irq_work_interrupt
                       irq_work_interrupt
                       |
                       |--41.60%-- __read
                       |          |
                       |          |--99.90%-- create_worker
                       |          |           bench_sched_messaging
                       |          |           cmd_bench
                       |          |           run_builtin
                       |          |           main
                       |          |           __libc_start_main
                       |           --0.10%-- [...]

After:

     1.64%      swapper  [kernel.kallsyms]  [k] perf_pending_event
                |
                --- perf_pending_event
                    irq_work_run
                    smp_irq_work_interrupt
                    irq_work_interrupt
                    |
                    |--95.00%-- arch_irq_work_raise
                    |           irq_work_queue
                    |           __perf_event_overflow
                    |           perf_swevent_overflow
                    |           perf_swevent_event
                    |           perf_tp_event
                    |           perf_trace_softirq
                    |           __do_softirq
                    |           call_softirq
                    |           do_softirq
                    |           irq_exit
                    |           |
                    |           |--73.68%-- smp_apic_timer_interrupt
                    |           |           apic_timer_interrupt
                    |           |           |
                    |           |           |--96.43%-- amd_e400_idle
                    |           |           |           cpu_idle
                    |           |           |           start_secondary

Signed-off-by: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: H. Peter Anvin
Cc: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: Jan Beulich
---
 arch/x86/kernel/entry_64.S | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel/entry_64.S')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 6131432..d656f68 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -310,8 +310,11 @@ ENDPROC(native_usergs_sysret64)
 	movq_cfi r10, R10-RBP
 	movq_cfi r11, R11-RBP
 
-	movq_cfi rbp, 0		/* push %rbp */
-	movq %rsp, %rbp
+	/* Save rbp so that we can unwind from get_irq_regs() */
+	movq_cfi rbp, 0
+
+	/* Save previous stack value */
+	movq %rsp, %rsi
 
 	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
 	testl $3, CS(%rdi)
@@ -327,10 +330,11 @@ ENDPROC(native_usergs_sysret64)
 	jne 2f
 	mov PER_CPU_VAR(irq_stack_ptr),%rsp
 	EMPTY_FRAME 0
-	/*
-	 * We entered an interrupt context - irqs are off:
-	 */
-2:	TRACE_IRQS_OFF
+
+2:	/* Store previous stack value */
+	pushq %rsi
+	/* We entered an interrupt context - irqs are off: */
+	TRACE_IRQS_OFF
 	.endm
 
 ENTRY(save_rest)
@@ -804,15 +808,14 @@ ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	decl PER_CPU_VAR(irq_count)
-	leaveq
 
-	CFI_RESTORE		rbp
+	/* Restore saved previous stack */
+	popq %rsi
+	leaq 16(%rsi), %rsp
+
 	CFI_DEF_CFA_REGISTER	rsp
-	CFI_ADJUST_CFA_OFFSET	-8
+	CFI_ADJUST_CFA_OFFSET	-16
 
-	/* we did not save rbx, restore only from ARGOFFSET */
-	addq $8, %rsp
-	CFI_ADJUST_CFA_OFFSET	-8
 exit_intr:
 	GET_THREAD_INFO(%rcx)
 	testl $3,CS-ARGOFFSET(%rsp)
--
cgit v1.1
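
A rough userspace model of the final scheme is sketched below. The names
(irq_stack, irq_stack_top, handler) are invented and the kernel's per-cpu irq
stack, nesting check and CFI annotations are left out; only the idea survives:
the old %rsp goes into a scratch register, is pushed on the new stack, and is
popped back on the way out while %rbp is never touched, so the frame chain
stays intact. The kernel restores with leaq 16(%rsi), %rsp rather than a plain
move because its saved pointer lands on the rbp slot of the partial pt_regs
and two more slots (the saved rbp and the never-saved rbx) have to be skipped;
the toy has no such frame, so a movq is enough.

/* irq_stack_switch.S - toy model of saving the old stack pointer in a
 * scratch register across a stack switch; not the kernel's code.
 * Build: gcc -nostdlib -static -o irqstack irq_stack_switch.S
 * irq_stack, irq_stack_top and handler are invented names. */
        .bss
        .align 16
irq_stack:
        .skip 4096
irq_stack_top:

        .text
handler:                                /* stand-in for the real handler */
        ret

        .globl _start
_start:
        movq %rsp, %rsi                 /* save previous stack value     */
        leaq irq_stack_top(%rip), %rsp  /* switch to the "irq" stack     */
        pushq %rsi                      /* store the old value on it     */

        call handler

        popq %rsi                       /* restore saved previous stack  */
        movq %rsi, %rsp                 /* %rbp was never touched, so    */
                                        /* the frame chain is intact     */

        movq $60, %rax                  /* exit(0) */
        xorq %rdi, %rdi
        syscall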