From 905a36a2851838bca5a424fb758e201990234e6e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 13:37:36 +0200 Subject: x86/asm/entry: Move entry_64.S and entry_32.S to arch/x86/entry/ Create a new directory hierarchy for the low level x86 entry code: arch/x86/entry/* This will host all the low level glue that is currently scattered all across arch/x86/. Start with entry_64.S and entry_32.S. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 4 + arch/x86/entry/entry_32.S | 1249 +++++++++++++++++++++++++++++++++++++++ arch/x86/entry/entry_64.S | 1442 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 2695 insertions(+) create mode 100644 arch/x86/entry/Makefile create mode 100644 arch/x86/entry/entry_32.S create mode 100644 arch/x86/entry/entry_64.S (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile new file mode 100644 index 0000000..fa7e0cf --- /dev/null +++ b/arch/x86/entry/Makefile @@ -0,0 +1,4 @@ +# +# Makefile for the x86 low level entry code +# +obj-y := entry_$(BITS).o diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S new file mode 100644 index 0000000..0ac73de --- /dev/null +++ b/arch/x86/entry/entry_32.S @@ -0,0 +1,1249 @@ +/* + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'syscall_exit': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysenter_audit syscall_trace_entry +#define sysexit_audit syscall_exit_work +#endif + + .section .entry.text, "ax" + +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). 
+ * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + +#ifdef CONFIG_PREEMPT +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +#else +#define preempt_stop(clobbers) +#define resume_kernel restore_all +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. + */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl $0 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl %gs +.endm + +.macro POP_GS pop=0 +98: popl %gs + .if \pop <> 0 + add $\pop, %esp + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b,99b) +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b,99b) +.endm + +.macro GS_TO_REG reg + movl %gs, \reg +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) +.endm +.macro SET_KERNEL_GS reg + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + +.macro SAVE_ALL + cld + PUSH_GS + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs + SET_KERNEL_GS %edx +.endm + +.macro RESTORE_INT_REGS + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax +.endm + +.macro RESTORE_REGS pop=0 + RESTORE_INT_REGS +1: popl %ds +2: popl %es +3: popl %fs + POP_GS \pop +.pushsection .fixup, "ax" +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b +.popsection + _ASM_EXTABLE(1b,4b) + _ASM_EXTABLE(2b,5b) + _ASM_EXTABLE(3b,6b) + POP_GS_EX +.endm + +ENTRY(ret_from_fork) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl + jmp syscall_exit +END(ret_from_fork) + +ENTRY(ret_from_kernel_thread) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl + movl PT_EBP(%esp),%eax + call *PT_EBX(%esp) + movl $0,PT_EAX(%esp) + jmp syscall_exit +ENDPROC(ret_from_kernel_thread) + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. 
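(Editor's aside, a reader aid only.) The syscall_exit stack layout listed at the top of this file, which is exactly the frame SAVE_ALL builds above, corresponds to the following C view. The struct name and layout here are an illustrative sketch derived from the offsets documented above; the authoritative definition is the 32-bit struct pt_regs in the kernel's ptrace headers, from which the PT_* offsets used throughout this file are generated.

	/* Illustrative only: one 4-byte slot per entry, matching the offsets above. */
	struct entry32_frame {
		unsigned long bx;	/* 0x00 */
		unsigned long cx;	/* 0x04 */
		unsigned long dx;	/* 0x08 */
		unsigned long si;	/* 0x0c */
		unsigned long di;	/* 0x10 */
		unsigned long bp;	/* 0x14 */
		unsigned long ax;	/* 0x18 */
		unsigned long ds;	/* 0x1c */
		unsigned long es;	/* 0x20 */
		unsigned long fs;	/* 0x24 */
		unsigned long gs;	/* 0x28: real %gs only if !CONFIG_X86_32_LAZY_GS, else a 0 placeholder */
		unsigned long orig_ax;	/* 0x2c */
		unsigned long ip;	/* 0x30 */
		unsigned long cs;	/* 0x34 */
		unsigned long flags;	/* 0x38 */
		unsigned long sp;	/* 0x3c: "oldesp" */
		unsigned long ss;	/* 0x40: "oldss" */
	};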
+ */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN +ret_from_exception: + preempt_stop(CLBR_ANY) +ret_from_intr: + GET_THREAD_INFO(%ebp) +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else + /* + * We can be coming here from child spawned by kernel_thread(). + */ + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace + +ENTRY(resume_userspace) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all +END(ret_from_exception) + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) +need_resched: + cmpl $0,PER_CPU_VAR(__preempt_count) + jnz restore_all + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +END(resume_kernel) +#endif + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(ia32_sysenter_target) + movl TSS_sysenter_sp0(%esp),%esp +sysenter_past_esp: + /* + * Interrupts are disabled here, but we can't trace it until + * enough kernel state to call TRACE_IRQS_OFF can be called - but + * we immediately enable interrupts at that point anyway. + */ + pushl $__USER_DS + pushl %ebp + pushfl + orl $X86_EFLAGS_IF, (%esp) + pushl $__USER_CS + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary: TI_sysenter_return + * is relative to thread_info, which is at the bottom of the + * kernel stack page. 4*4 means the 4 words pushed above; + * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; + * and THREAD_SIZE takes us to the bottom. + */ + pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) + + pushl %eax + SAVE_ALL + ENABLE_INTERRUPTS(CLBR_NONE) + +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
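(Editor's aside.) The cmpl $__PAGE_OFFSET-3,%ebp / jae syscall_fault pair that immediately follows is the whole "careful about security" part: it guarantees the 4-byte load at %ebp cannot reach kernel memory, while a fault on a bad-but-user address is caught by the _ASM_EXTABLE fixup. A minimal C sketch of the bound check, assuming the usual 3G/1G split for __PAGE_OFFSET; the function name is illustrative, not a kernel symbol:

	#define PAGE_OFFSET 0xC0000000UL	/* assumption: default 32-bit 3G/1G split */

	/*
	 * The word read at user_bp covers bytes [user_bp, user_bp + 3]; requiring
	 * user_bp < PAGE_OFFSET - 3 means even the last of those bytes is still
	 * below PAGE_OFFSET, i.e. the access stays entirely in user space.
	 */
	static int sixth_arg_address_ok(unsigned long user_bp)
	{
		return user_bp < PAGE_OFFSET - 3;
	}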
+ */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault + ASM_STAC +1: movl (%ebp),%ebp + ASM_CLAC + movl %ebp,PT_EBP(%esp) + _ASM_EXTABLE(1b,syscall_fault) + + GET_THREAD_INFO(%ebp) + + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz sysenter_audit +sysenter_do_call: + cmpl $(NR_syscalls), %eax + jae sysenter_badsys + call *sys_call_table(,%eax,4) +sysenter_after_call: + movl %eax,PT_EAX(%esp) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx + jnz sysexit_audit +sysenter_exit: +/* if something modifies registers it must also disable sysexit */ + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp,%ebp + TRACE_IRQS_ON +1: mov PT_FS(%esp), %fs + PTGS_TO_GS + ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ + movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ + /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ + pushl PT_ESI(%esp) /* a3: 5th arg */ + pushl PT_EDX+4(%esp) /* a2: 4th arg */ + call __audit_syscall_entry + popl %ecx /* get that remapped edx off the stack */ + popl %ecx /* get that remapped esi off the stack */ + movl PT_EAX(%esp),%eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax,%edx /* second arg, syscall return value */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ + movzbl %al,%eax /* zero-extend that */ + call __audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +#endif + +.pushsection .fixup,"ax" +2: movl $0,PT_FS(%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b,2b) + PTGS_TO_GS_EX +ENDPROC(ia32_sysenter_target) + + # system call handler stub +ENTRY(system_call) + ASM_CLAC + pushl %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + # system call tracing in operation / emulation + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(NR_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) +syscall_after_call: + movl %eax,PT_EAX(%esp) # store the return value +syscall_exit: + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx # current->work + jnz syscall_exit_work + +restore_all: + TRACE_IRQS_IRET +restore_all_notrace: +#ifdef CONFIG_X86_ESPFIX32 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we + # are returning to the kernel. + # See comments in process.c:copy_thread() for details. 
+ movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + je ldt_ss # returning to user-space with LDT SS +#endif +restore_nocheck: + RESTORE_REGS 4 # skip orig_eax/error_code +irq_return: + INTERRUPT_RETURN +.section .fixup,"ax" +ENTRY(iret_exc) + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous + _ASM_EXTABLE(irq_return,iret_exc) + +#ifdef CONFIG_X86_ESPFIX32 +ldt_ss: +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, pv_info+PARAVIRT_enabled + jne restore_nocheck +#endif + +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ + /* Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the iret */ + DISABLE_INTERRUPTS(CLBR_EAX) + lss (%esp), %esp /* switch to espfix segment */ + jmp restore_nocheck +#endif +ENDPROC(system_call) + + # perform work that needs to be done immediately before resumption + ALIGN +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? 
+ jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) + movl %esp, %eax + jnz work_notifysig_v86 # returning to kernel-space or + # vm86-space +1: +#else + movl %esp, %eax +#endif + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movb PT_CS(%esp), %bl + andb $SEGMENT_RPL_MASK, %bl + cmpb $USER_RPL, %bl + jb resume_kernel + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace + +#ifdef CONFIG_VM86 + ALIGN +work_notifysig_v86: + pushl %ecx # save ti_flags for do_notify_resume + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + movl %eax, %esp + jmp 1b +#endif +END(work_pending) + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + movl %esp, %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. */ + cmpl $(NR_syscalls), %eax + jnae syscall_call + jmp syscall_exit +END(syscall_trace_entry) + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testl $_TIF_WORK_SYSCALL_EXIT, %ecx + jz work_pending + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call + # schedule() instead + movl %esp, %eax + call syscall_trace_leave + jmp resume_userspace +END(syscall_exit_work) + +syscall_fault: + ASM_CLAC + GET_THREAD_INFO(%ebp) + movl $-EFAULT,PT_EAX(%esp) + jmp resume_userspace +END(syscall_fault) + +syscall_badsys: + movl $-ENOSYS,%eax + jmp syscall_after_call +END(syscall_badsys) + +sysenter_badsys: + movl $-ENOSYS,%eax + jmp sysenter_after_call +END(sysenter_badsys) + +.macro FIXUP_ESPFIX_STACK +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. + */ +#ifdef CONFIG_X86_ESPFIX32 + /* fixup the stack */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ + pushl $__KERNEL_DS + pushl %eax + lss (%esp), %esp /* switch to the normal stack segment */ +#endif +.endm +.macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32 + movl %ss, %eax + /* see if on espfix stack */ + cmpw $__ESPFIX_SS, %ax + jne 27f + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + /* switch to normal stack */ + FIXUP_ESPFIX_STACK +27: +#endif +.endm + +/* + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. 
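(Editor's aside, before moving on to the interrupt stubs.) The ESPFIX arithmetic used by ldt_ss above and undone by FIXUP_ESPFIX_STACK is very terse in assembly, so here is a C sketch of just the math. uint32_t arithmetic stands in for the modulo-2^32 segment-base calculation, the GDT byte-poking and the lss are omitted, and the function names are illustrative only:

	#include <stdint.h>

	/*
	 * ldt_ss: an iret to a 16-bit stack segment only restores the low word of
	 * ESP, so the high word is pre-loaded from the user ESP and the difference
	 * is folded into the __ESPFIX_SS segment base.
	 */
	static void espfix32_setup(uint32_t kernel_esp, uint32_t user_esp,
				   uint32_t *new_esp, uint32_t *ss_base)
	{
		*new_esp = (user_esp & 0xffff0000u) | (kernel_esp & 0x0000ffffu);
		*ss_base = kernel_esp - *new_esp;	/* low word is 0 */
		/* property relied upon: *ss_base + *new_esp == kernel_esp (mod 2^32),
		 * so %ss:%esp still addresses the kernel stack. */
	}

	/*
	 * FIXUP_ESPFIX_STACK: bits 16..31 of the base are read back out of the GDT
	 * entry and added to the current ESP to recover a flat kernel stack pointer.
	 */
	static uint32_t espfix32_unwind(uint32_t base_bits_16_31, uint32_t espfix_esp)
	{
		return (base_bits_16_31 << 16) + espfix_esp;
	}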
+ */ + .align 8 +ENTRY(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushl $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt + .align 8 + .endr +END(irq_entries_start) + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + ASM_CLAC + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ + SAVE_ALL + TRACE_IRQS_OFF + movl %esp,%eax + call do_IRQ + jmp ret_from_intr +ENDPROC(common_interrupt) + +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + ASM_CLAC; \ + pushl $~(nr); \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp,%eax; \ + call fn; \ + jmp ret_from_intr; \ +ENDPROC(name) + + +#ifdef CONFIG_TRACING +#define TRACE_BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) +#else +#define TRACE_BUILD_INTERRUPT(name, nr) +#endif + +#define BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(name, nr, smp_##name); \ + TRACE_BUILD_INTERRUPT(name, nr) + +/* The include is where all of the SMP etc. interrupts come from */ +#include + +ENTRY(coprocessor_error) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_error + jmp error_code +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + ASM_CLAC + pushl $0 +#ifdef CONFIG_X86_INVD_BUG + /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ + ALTERNATIVE "pushl $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ + X86_FEATURE_XMM +#else + pushl $do_simd_coprocessor_error +#endif + jmp error_code +END(simd_coprocessor_error) + +ENTRY(device_not_available) + ASM_CLAC + pushl $-1 # mark this as an int + pushl $do_device_not_available + jmp error_code +END(device_not_available) + +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) + iret + _ASM_EXTABLE(native_iret, iret_exc) +END(native_iret) + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +END(native_irq_enable_sysexit) +#endif + +ENTRY(overflow) + ASM_CLAC + pushl $0 + pushl $do_overflow + jmp error_code +END(overflow) + +ENTRY(bounds) + ASM_CLAC + pushl $0 + pushl $do_bounds + jmp error_code +END(bounds) + +ENTRY(invalid_op) + ASM_CLAC + pushl $0 + pushl $do_invalid_op + jmp error_code +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_segment_overrun + jmp error_code +END(coprocessor_segment_overrun) + +ENTRY(invalid_TSS) + ASM_CLAC + pushl $do_invalid_TSS + jmp error_code +END(invalid_TSS) + +ENTRY(segment_not_present) + ASM_CLAC + pushl $do_segment_not_present + jmp error_code +END(segment_not_present) + +ENTRY(stack_segment) + ASM_CLAC + pushl $do_stack_segment + jmp error_code +END(stack_segment) + +ENTRY(alignment_check) + ASM_CLAC + pushl $do_alignment_check + jmp error_code +END(alignment_check) + +ENTRY(divide_error) + ASM_CLAC + pushl $0 # no error code + pushl $do_divide_error + jmp error_code +END(divide_error) + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + ASM_CLAC + pushl $0 + pushl machine_check_vector + jmp error_code +END(machine_check) +#endif + +ENTRY(spurious_interrupt_bug) + ASM_CLAC + pushl $0 + pushl $do_spurious_interrupt_bug + jmp error_code +END(spurious_interrupt_bug) + +#ifdef CONFIG_XEN +/* Xen doesn't set %esp to be precisely what the normal sysenter + entrypoint expects, so fix it up before using the normal path. 
*/ +ENTRY(xen_sysenter_target) + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp + +ENTRY(xen_hypervisor_callback) + pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + TRACE_IRQS_OFF + + /* Check to see if we got the event in the critical + region in xen_iret_direct, after we've reenabled + events and checked for pending events. This simulates + iret instruction's behaviour where it delivers a + pending interrupt when enabling interrupts. */ + movl PT_EIP(%esp),%eax + cmpl $xen_iret_start_crit,%eax + jb 1f + cmpl $xen_iret_end_crit,%eax + jae 1f + + jmp xen_iret_crit_fixup + +ENTRY(xen_do_upcall) +1: mov %esp, %eax + call xen_evtchn_do_upcall +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif + jmp ret_from_intr +ENDPROC(xen_hypervisor_callback) + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(xen_failsafe_callback) + pushl %eax + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + /* EAX == 0 => Category 1 (Bad segment) + EAX != 0 => Category 2 (Bad IRET) */ + testl %eax,%eax + popl %eax + lea 16(%esp),%esp + jz 5f + jmp iret_exc +5: pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + jmp ret_from_exception + +.section .fixup,"ax" +6: xorl %eax,%eax + movl %eax,4(%esp) + jmp 1b +7: xorl %eax,%eax + movl %eax,8(%esp) + jmp 2b +8: xorl %eax,%eax + movl %eax,12(%esp) + jmp 3b +9: xorl %eax,%eax + movl %eax,16(%esp) + jmp 4b +.previous + _ASM_EXTABLE(1b,6b) + _ASM_EXTABLE(2b,7b) + _ASM_EXTABLE(3b,8b) + _ASM_EXTABLE(4b,9b) +ENDPROC(xen_failsafe_callback) + +BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + xen_evtchn_do_upcall) + +#endif /* CONFIG_XEN */ + +#if IS_ENABLED(CONFIG_HYPERV) + +BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + hyperv_vector_handler) + +#endif /* CONFIG_HYPERV */ + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + movl 4*4(%esp), %eax + movl 0x4(%ebp), %edx + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + addl $4,%esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax +ftrace_ret: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) + pushf /* push flags before compare (in cs location) */ + + /* + * i386 does not save SS and ESP when coming from kernel. + * Instead, to get sp, ®s->sp is used (see ptrace.h). + * Unfortunately, that means eflags must be at the same location + * as the current return ip is. We move the return ip into the + * ip location, and move flags into the return ip location. 
+ */ + pushl 4(%esp) /* save return ip into ip slot */ + + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 13*4(%esp), %eax /* Get the saved flags */ + movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ + /* clobbering return ip */ + movl $__KERNEL_CS,13*4(%esp) + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + movl 14*4(%esp), %eax /* Move flags back into cs */ + movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ + movl 12*4(%esp), %eax /* Get return ip from regs->ip */ + movl %eax, 14*4(%esp) /* Put return ip back for ret */ + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + addl $8, %esp /* Skip orig_ax and ip */ + popf /* Pop flags at end (no addl to corrupt flags) */ + jmp ftrace_ret + + popf + jmp ftrace_stub +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? */ + + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + lea 0x4(%ebp), %edx + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx +#endif + +#ifdef CONFIG_TRACING +ENTRY(trace_page_fault) + ASM_CLAC + pushl $trace_do_page_fault + jmp error_code +END(trace_page_fault) +#endif + +ENTRY(page_fault) + ASM_CLAC + pushl $do_page_fault + ALIGN +error_code: + /* the function address is in %gs's slot on the stack */ + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + cld + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception +END(page_fault) + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. 
Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +.macro FIX_STACK offset ok label + cmpw $__KERNEL_CS, 4(%esp) + jne \ok +\label: + movl TSS_sysenter_sp0 + \offset(%esp), %esp + pushfl + pushl $__KERNEL_CS + pushl $sysenter_past_esp +.endm + +ENTRY(debug) + ASM_CLAC + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn +debug_stack_correct: + pushl $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + ASM_CLAC +#ifdef CONFIG_X86_ESPFIX32 + pushl %eax + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + je nmi_espfix_stack +#endif + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl %eax + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + jae nmi_stack_correct + cmpl $ia32_sysenter_target,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + pushl %eax + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_all_notrace + +nmi_stack_fixup: + FIX_STACK 12, nmi_stack_correct, 1 + jmp nmi_stack_correct + +nmi_debug_stack_check: + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK 24, nmi_stack_correct, 1 + jmp nmi_stack_correct + +#ifdef CONFIG_X86_ESPFIX32 +nmi_espfix_stack: + /* + * create the pointer to lss back + */ + pushl %ss + pushl %esp + addl $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + .endr + pushl %eax + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + jmp irq_return +#endif +END(nmi) + +ENTRY(int3) + ASM_CLAC + pushl $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception +END(int3) + +ENTRY(general_protection) + pushl $do_general_protection + jmp error_code +END(general_protection) + +#ifdef CONFIG_KVM_GUEST +ENTRY(async_page_fault) + ASM_CLAC + pushl $do_async_page_fault + jmp error_code +END(async_page_fault) +#endif + diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S new file mode 100644 index 0000000..4ad79e9 --- /dev/null +++ b/arch/x86/entry/entry_64.S @@ -0,0 +1,1442 @@ +/* + * linux/arch/x86_64/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * Copyright (C) 2000 Pavel Machek + */ + +/* + * entry.S contains the 
system-call and fault low-level handling routines. + * + * Some of this is documented in Documentation/x86/entry_64.txt + * + * NOTE: This code handles signal-recognition, which happens every time + * after an interrupt and after each system call. + * + * A note on terminology: + * - iret frame: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. + * + * Some macro usage: + * - ENTRY/END Define functions in the symbol table. + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. + * - idtentry - Define exception entry points. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 + + .code64 + .section .entry.text, "ax" + + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret64) + swapgs + sysretq +ENDPROC(native_usergs_sysret64) +#endif /* CONFIG_PARAVIRT */ + + +.macro TRACE_IRQS_IRETQ +#ifdef CONFIG_TRACE_IRQFLAGS + bt $9,EFLAGS(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON +1: +#endif +.endm + +/* + * When dynamic function tracer is enabled it will add a breakpoint + * to all locations that it is about to modify, sync CPUs, update + * all the code, sync CPUs, then remove the breakpoints. In this time + * if lockdep is enabled, it might jump back into the debug handler + * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). + * + * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to + * make sure the stack pointer does not get reset back to the top + * of the debug stack, and instead just reuses the current stack. + */ +#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) + +.macro TRACE_IRQS_OFF_DEBUG + call debug_stack_set_zero + TRACE_IRQS_OFF + call debug_stack_reset +.endm + +.macro TRACE_IRQS_ON_DEBUG + call debug_stack_set_zero + TRACE_IRQS_ON + call debug_stack_reset +.endm + +.macro TRACE_IRQS_IRETQ_DEBUG + bt $9,EFLAGS(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON_DEBUG +1: +.endm + +#else +# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ +#endif + +/* + * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. + * + * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Registers on entry: + * rax system call number + * rcx return address + * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) + * rdi arg0 + * rsi arg1 + * rdx arg2 + * r10 arg3 (needs to be moved to rcx to conform to C ABI) + * r8 arg4 + * r9 arg5 + * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) + * + * Only called from user space. + * + * When user can change pt_regs->foo always force IRET. That is because + * it deals with uncanonical addresses better. SYSRET has trouble + * with them due to bugs in both AMD and Intel CPUs. + */ + +ENTRY(system_call) + /* + * Interrupts are off on entry. 
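(Editor's aside.) For contrast with the kernel-side handling that follows, the register convention documented above is what a raw SYSCALL looks like from user space. A hedged C sketch of a 4-argument wrapper; glibc's syscall() is the real interface, the wrapper name is illustrative, and the point is the r10-instead-of-rcx argument plus the rcx/r11 clobbers described above:

	static long raw_syscall4(long nr, long a1, long a2, long a3, long a4)
	{
		register long r10 asm("r10") = a4;	/* 4th arg goes in r10, not rcx */
		long ret;

		asm volatile("syscall"
			     : "=a" (ret)
			     : "a" (nr), "D" (a1), "S" (a2), "d" (a3), "r" (r10)
			     : "rcx", "r11", "memory");
		return ret;
	}

SYSCALL itself overwrites rcx (return RIP) and r11 (saved RFLAGS), which is why the fast path below does movq %r10,%rcx before indexing sys_call_table.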
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + /* + * A hypervisor implementation might want to use a label + * after the swapgs, so that it can do the swapgs + * for the guest and jump here on syscall. + */ +GLOBAL(system_call_after_swapgs) + + movq %rsp,PER_CPU_VAR(rsp_scratch) + movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp + + /* Construct struct pt_regs on stack */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ + /* + * Re-enable interrupts. + * We use 'rsp_scratch' as a scratch space, hence irq-off block above + * must execute atomically in the face of possible interrupt-driven + * task preemption. We must enable interrupts only after we're done + * with using rsp_scratch: + */ + ENABLE_INTERRUPTS(CLBR_NONE) + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ + + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz tracesys +system_call_fastpath: +#if __SYSCALL_MASK == ~0 + cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10,%rcx + call *sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: +/* + * Syscall return path ending with SYSRET (fast path). + * Has incompletely filled pt_regs. + */ + LOCKDEP_SYS_EXIT + /* + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + DISABLE_INTERRUPTS(CLBR_NONE) + + /* + * We must check ti flags with interrupts (or at least preemption) + * off because we must *never* return to userspace without + * processing exit work that is enqueued if we're preempted here. + * In particular, returning to userspace with any of the one-shot + * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is + * very bad. + */ + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ + + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RIP(%rsp),%rcx + movq EFLAGS(%rsp),%r11 + movq RSP(%rsp),%rsp + /* + * 64bit SYSRET restores rip from rcx, + * rflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * Restoration of rflags re-enables interrupts. + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we should + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. (Actually + * detecting the failure in 64-bit userspace is tricky but can be + * done.) 
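(Editor's aside.) For reference, the frame built by the pushq sequence at the top of system_call, with the sub $(6*8) leaving the "extra" slots unfilled on the fast path, reads back as the following C view. This is a sketch mirroring the pt_regs->foo annotations above; the struct name is illustrative and the authoritative layout is the 64-bit struct pt_regs in the ptrace headers:

	/* Illustrative only: ascending memory order, 8 bytes per slot. */
	struct entry64_frame {
		unsigned long r15, r14, r13, r12, bp, bx;	/* not saved on the fast path */
		unsigned long r11, r10, r9, r8;
		unsigned long ax, cx, dx, si, di;		/* ax is pre-set to -ENOSYS */
		unsigned long orig_ax;				/* syscall number */
		unsigned long ip, cs, flags, sp, ss;		/* iret frame */
	};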
+ */ + USERGS_SYSRET64 + + /* Do syscall entry tracing */ +tracesys: + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + call syscall_trace_enter_phase1 + test %rax, %rax + jnz tracesys_phase2 /* if needed, run the slow path */ + RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ + movq ORIG_RAX(%rsp), %rax + jmp system_call_fastpath /* and return to the fast path */ + +tracesys_phase2: + SAVE_EXTRA_REGS + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + movq %rax,%rdx + call syscall_trace_enter_phase2 + + /* + * Reload registers from stack in case ptrace changed them. + * We don't reload %rax because syscall_trace_entry_phase2() returned + * the value it wants us to use in the table lookup. + */ + RESTORE_C_REGS_EXCEPT_RAX + RESTORE_EXTRA_REGS +#if __SYSCALL_MASK == ~0 + cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10,%rcx /* fixup for C */ + call *sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: + /* Use IRET because user could have changed pt_regs->foo */ + +/* + * Syscall return path ending with IRET. + * Has correct iret frame. + */ +GLOBAL(int_ret_from_sys_call) + DISABLE_INTERRUPTS(CLBR_NONE) +int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ + TRACE_IRQS_OFF + movl $_TIF_ALLWORK_MASK,%edi + /* edi: mask to check */ +GLOBAL(int_with_check) + LOCKDEP_SYS_EXIT_IRQ + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%edx + andl %edi,%edx + jnz int_careful + andl $~TS_COMPAT,TI_status(%rcx) + jmp syscall_return + + /* Either reschedule or signal or syscall exit tracking needed. */ + /* First do a reschedule test. */ + /* edx: work, edi: workmask */ +int_careful: + bt $TIF_NEED_RESCHED,%edx + jnc int_very_careful + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq %rdi + SCHEDULE_USER + popq %rdi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp int_with_check + + /* handle signals and tracing -- both require a full pt_regs */ +int_very_careful: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + SAVE_EXTRA_REGS + /* Check for syscall exit trace */ + testl $_TIF_WORK_SYSCALL_EXIT,%edx + jz int_signal + pushq %rdi + leaq 8(%rsp),%rdi # &ptregs -> arg1 + call syscall_trace_leave + popq %rdi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi + jmp int_restore_rest + +int_signal: + testl $_TIF_DO_NOTIFY_MASK,%edx + jz 1f + movq %rsp,%rdi # &ptregs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call do_notify_resume +1: movl $_TIF_WORK_MASK,%edi +int_restore_rest: + RESTORE_EXTRA_REGS + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp int_with_check + +syscall_return: + /* The IRETQ could re-enable interrupts: */ + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_IRETQ + + /* + * Try to use SYSRET instead of IRET if we're returning to + * a completely clean 64-bit userspace context. + */ + movq RCX(%rsp),%rcx + movq RIP(%rsp),%r11 + cmpq %rcx,%r11 /* RCX == RIP */ + jne opportunistic_sysret_failed + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. + * + * If width of "canonical tail" ever becomes variable, this will need + * to be updated to remain correct on both old and new CPUs. 
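(Editor's aside.) The two-instruction shl/sar sequence that follows is the entire canonical-address test; written out in C it is a sign-extension-from-bit-47 comparison. The sketch assumes __VIRTUAL_MASK_SHIFT == 47 (which the .ifne guard below enforces) and an arithmetic right shift on signed values, which is what sar provides; the function and macro names are illustrative:

	#include <stdbool.h>
	#include <stdint.h>

	#define VIRTUAL_MASK_SHIFT 47		/* assumption, see the .ifne check below */

	/*
	 * Canonical means bits 63..48 are all copies of bit 47, i.e. the address is
	 * unchanged by sign-extending it from bit 47.
	 */
	static bool rip_is_canonical(uint64_t rip)
	{
		const int shift = 64 - (VIRTUAL_MASK_SHIFT + 1);	/* 16 */

		return (uint64_t)(((int64_t)(rip << shift)) >> shift) == rip;
	}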
+ */ + .ifne __VIRTUAL_MASK_SHIFT - 47 + .error "virtual address width changed -- SYSRET checks need update" + .endif + /* Change top 16 bits to be the sign-extension of 47th bit */ + shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + /* If this changed %rcx, it was not canonical */ + cmpq %rcx, %r11 + jne opportunistic_sysret_failed + + cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ + jne opportunistic_sysret_failed + + movq R11(%rsp),%r11 + cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ + jne opportunistic_sysret_failed + + /* + * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. This would cause an infinite loop whenever #DB happens + * with register state that satisfies the opportunistic SYSRET + * conditions. For example, single-stepping this user code: + * + * movq $stuck_here,%rcx + * pushfq + * popq %r11 + * stuck_here: + * + * would never get past 'stuck_here'. + */ + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 + jnz opportunistic_sysret_failed + + /* nothing to check for RSP */ + + cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ + jne opportunistic_sysret_failed + + /* + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. + */ +syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RSP(%rsp),%rsp + USERGS_SYSRET64 + +opportunistic_sysret_failed: + SWAPGS + jmp restore_c_regs_and_iret +END(system_call) + + + .macro FORK_LIKE func +ENTRY(stub_\func) + SAVE_EXTRA_REGS 8 + jmp sys_\func +END(stub_\func) + .endm + + FORK_LIKE clone + FORK_LIKE fork + FORK_LIKE vfork + +ENTRY(stub_execve) + call sys_execve +return_from_execve: + testl %eax, %eax + jz 1f + /* exec failed, can use fast SYSRET code path in this case */ + ret +1: + /* must use IRET code path (pt_regs->cs may have changed) */ + addq $8, %rsp + ZERO_EXTRA_REGS + movq %rax,RAX(%rsp) + jmp int_ret_from_sys_call +END(stub_execve) +/* + * Remaining execve stubs are only 7 bytes long. + * ENTRY() often aligns to 16 bytes, which in this case has no benefits. + */ + .align 8 +GLOBAL(stub_execveat) + call sys_execveat + jmp return_from_execve +END(stub_execveat) + +#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) + .align 8 +GLOBAL(stub_x32_execve) +GLOBAL(stub32_execve) + call compat_sys_execve + jmp return_from_execve +END(stub32_execve) +END(stub_x32_execve) + .align 8 +GLOBAL(stub_x32_execveat) +GLOBAL(stub32_execveat) + call compat_sys_execveat + jmp return_from_execve +END(stub32_execveat) +END(stub_x32_execveat) +#endif + +/* + * sigreturn is special because it needs to restore all registers on return. + * This cannot be done with SYSRET, so use the IRET return path instead. + */ +ENTRY(stub_rt_sigreturn) + /* + * SAVE_EXTRA_REGS result is not normally needed: + * sigreturn overwrites all pt_regs->GPREGS. + * But sigreturn can fail (!), and there is no easy way to detect that. + * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, + * we SAVE_EXTRA_REGS here. 
+ */ + SAVE_EXTRA_REGS 8 + call sys_rt_sigreturn +return_from_stub: + addq $8, %rsp + RESTORE_EXTRA_REGS + movq %rax,RAX(%rsp) + jmp int_ret_from_sys_call +END(stub_rt_sigreturn) + +#ifdef CONFIG_X86_X32_ABI +ENTRY(stub_x32_rt_sigreturn) + SAVE_EXTRA_REGS 8 + call sys32_x32_rt_sigreturn + jmp return_from_stub +END(stub_x32_rt_sigreturn) +#endif + +/* + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ +ENTRY(ret_from_fork) + + LOCK ; btr $TIF_FORK,TI_flags(%r8) + + pushq $0x0002 + popfq # reset kernel eflags + + call schedule_tail # rdi: 'prev' task parameter + + RESTORE_EXTRA_REGS + + testb $3, CS(%rsp) # from kernel_thread? + + /* + * By the time we get here, we have no idea whether our pt_regs, + * ti flags, and ti status came from the 64-bit SYSCALL fast path, + * the slow path, or one of the ia32entry paths. + * Use IRET code path to return, since it can safely handle + * all of the above. + */ + jnz int_ret_from_sys_call + + /* We came from kernel_thread */ + /* nb: we depend on RESTORE_EXTRA_REGS above */ + movq %rbp, %rdi + call *%rbx + movl $0, RAX(%rsp) + RESTORE_EXTRA_REGS + jmp int_ret_from_sys_call +END(ret_from_fork) + +/* + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. + */ + .align 8 +ENTRY(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushq $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt + .align 8 + .endr +END(irq_entries_start) + +/* + * Interrupt entry/exit. + * + * Interrupt entry points save only callee clobbered registers in fast path. + * + * Entry runs with interrupts off. + */ + +/* 0(%rsp): ~(interrupt number) */ + .macro interrupt func + cld + /* + * Since nothing in interrupt handling code touches r12...r15 members + * of "struct pt_regs", and since interrupts can nest, we can save + * four stack slots and simultaneously provide + * an unwind-friendly stack layout by saving "truncated" pt_regs + * exactly up to rbp slot, without these members. + */ + ALLOC_PT_GPREGS_ON_STACK -RBP + SAVE_C_REGS -RBP + /* this goes to 0(%rsp) for unwinder, not for saving the value: */ + SAVE_EXTRA_REGS_RBP -RBP + + leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ + + testb $3, CS-RBP(%rsp) + jz 1f + SWAPGS +1: + /* + * Save previous stack pointer, optionally switch to interrupt stack. + * irq_count is used to check if a CPU is already on an interrupt stack + * or not. While this is essentially redundant with preempt_count it is + * a little cheaper to use a separate counter in the PDA (short of + * moving irq_enter into assembly, which would be too much work) + */ + movq %rsp, %rsi + incl PER_CPU_VAR(irq_count) + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp + pushq %rsi + /* We entered an interrupt context - irqs are off: */ + TRACE_IRQS_OFF + + call \func + .endm + + /* + * The interrupt stubs push (~vector+0x80) onto the stack and + * then jump to common_interrupt. 
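(Editor's aside.) The stub encoding described above is easy to misread, so here is a C sketch of the round trip: what each stub pushes, and how the value turns back into a vector number after common_interrupt's addq $-0x80. The final bitwise-complement recovery is an assumption about how the C handler consumes orig_ax, not something visible in this file, and both function names are illustrative:

	#include <stdint.h>

	/*
	 * Each stub pushes (~vector + 0x80).  For vectors 0x20..0xff that value is
	 * in [-128, 95], so it fits in a sign-extended byte and the push stays
	 * short enough for the stub to fit in its 8-byte slot.
	 */
	static int8_t stub_encode(unsigned int vector)
	{
		return (int8_t)(~vector + 0x80);
	}

	/*
	 * common_interrupt adds -0x80, leaving ~vector (somewhere in [-256, -1])
	 * in orig_ax; the vector itself is then just the bitwise complement.
	 */
	static unsigned int vector_decode(int8_t pushed)
	{
		long orig_ax = (long)pushed - 0x80;	/* == ~vector, sign-extended */

		return (unsigned int)~orig_ax & 0xff;
	}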
+ */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + ASM_CLAC + addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ + interrupt do_IRQ + /* 0(%rsp): old RSP */ +ret_from_intr: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + decl PER_CPU_VAR(irq_count) + + /* Restore saved previous stack */ + popq %rsi + /* return code expects complete pt_regs - adjust rsp accordingly: */ + leaq -RBP(%rsi),%rsp + + testb $3, CS(%rsp) + jz retint_kernel + /* Interrupt came from user space */ +retint_user: + GET_THREAD_INFO(%rcx) + /* + * %rcx: thread info. Interrupts off. + */ +retint_with_reschedule: + movl $_TIF_WORK_MASK,%edi +retint_check: + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + andl %edi,%edx + jnz retint_careful + +retint_swapgs: /* return to user-space */ + /* + * The iretq could re-enable interrupts: + */ + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_IRETQ + + SWAPGS + jmp restore_c_regs_and_iret + +/* Returning to kernel space */ +retint_kernel: +#ifdef CONFIG_PREEMPT + /* Interrupts are off */ + /* Check if we need preemption */ + bt $9,EFLAGS(%rsp) /* interrupts were off? */ + jnc 1f +0: cmpl $0,PER_CPU_VAR(__preempt_count) + jnz 1f + call preempt_schedule_irq + jmp 0b +1: +#endif + /* + * The iretq could re-enable interrupts: + */ + TRACE_IRQS_IRETQ + +/* + * At this label, code paths which return to kernel and to user, + * which come from interrupts/exception and from syscalls, merge. + */ +restore_c_regs_and_iret: + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 + +irq_return: + INTERRUPT_RETURN + +ENTRY(native_iret) + /* + * Are we returning to a stack segment from the LDT? Note: in + * 64-bit mode SS:RSP on the exception stack is always valid. + */ +#ifdef CONFIG_X86_ESPFIX64 + testb $4,(SS-RIP)(%rsp) + jnz native_irq_return_ldt +#endif + +.global native_irq_return_iret +native_irq_return_iret: + /* + * This may fault. Non-paranoid faults on return to userspace are + * handled by fixup_bad_iret. These include #SS, #GP, and #NP. + * Double-faults due to espfix64 are handled in do_double_fault. + * Other faults here are fatal. + */ + iretq + +#ifdef CONFIG_X86_ESPFIX64 +native_irq_return_ldt: + pushq %rax + pushq %rdi + SWAPGS + movq PER_CPU_VAR(espfix_waddr),%rdi + movq %rax,(0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp),%rax /* RIP */ + movq %rax,(1*8)(%rdi) + movq (3*8)(%rsp),%rax /* CS */ + movq %rax,(2*8)(%rdi) + movq (4*8)(%rsp),%rax /* RFLAGS */ + movq %rax,(3*8)(%rdi) + movq (6*8)(%rsp),%rax /* SS */ + movq %rax,(5*8)(%rdi) + movq (5*8)(%rsp),%rax /* RSP */ + movq %rax,(4*8)(%rdi) + andl $0xffff0000,%eax + popq %rdi + orq PER_CPU_VAR(espfix_stack),%rax + SWAPGS + movq %rax,%rsp + popq %rax + jmp native_irq_return_iret +#endif + + /* edi: workmask, edx: work */ +retint_careful: + bt $TIF_NEED_RESCHED,%edx + jnc retint_signal + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq %rdi + SCHEDULE_USER + popq %rdi + GET_THREAD_INFO(%rcx) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp retint_check + +retint_signal: + testl $_TIF_DO_NOTIFY_MASK,%edx + jz retint_swapgs + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + SAVE_EXTRA_REGS + movq $-1,ORIG_RAX(%rsp) + xorl %esi,%esi # oldset + movq %rsp,%rdi # &pt_regs + call do_notify_resume + RESTORE_EXTRA_REGS + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_with_reschedule + +END(common_interrupt) + +/* + * APIC interrupts. 
+ */ +.macro apicinterrupt3 num sym do_sym +ENTRY(\sym) + ASM_CLAC + pushq $~(\num) +.Lcommon_\sym: + interrupt \do_sym + jmp ret_from_intr +END(\sym) +.endm + +#ifdef CONFIG_TRACING +#define trace(sym) trace_##sym +#define smp_trace(sym) smp_trace_##sym + +.macro trace_apicinterrupt num sym +apicinterrupt3 \num trace(\sym) smp_trace(\sym) +.endm +#else +.macro trace_apicinterrupt num sym do_sym +.endm +#endif + +.macro apicinterrupt num sym do_sym +apicinterrupt3 \num \sym \do_sym +trace_apicinterrupt \num \sym +.endm + +#ifdef CONFIG_SMP +apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ + irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt3 REBOOT_VECTOR \ + reboot_interrupt smp_reboot_interrupt +#endif + +#ifdef CONFIG_X86_UV +apicinterrupt3 UV_BAU_MESSAGE \ + uv_bau_message_intr1 uv_bau_message_interrupt +#endif +apicinterrupt LOCAL_TIMER_VECTOR \ + apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR \ + x86_platform_ipi smp_x86_platform_ipi + +#ifdef CONFIG_HAVE_KVM +apicinterrupt3 POSTED_INTR_VECTOR \ + kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD +apicinterrupt THRESHOLD_APIC_VECTOR \ + threshold_interrupt smp_threshold_interrupt +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR +apicinterrupt THERMAL_APIC_VECTOR \ + thermal_interrupt smp_thermal_interrupt +#endif + +#ifdef CONFIG_SMP +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ + call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR \ + call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR \ + reschedule_interrupt smp_reschedule_interrupt +#endif + +apicinterrupt ERROR_APIC_VECTOR \ + error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR \ + spurious_interrupt smp_spurious_interrupt + +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR \ + irq_work_interrupt smp_irq_work_interrupt +#endif + +/* + * Exception entry points. + */ +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) + +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 +ENTRY(\sym) + /* Sanity check */ + .if \shift_ist != -1 && \paranoid == 0 + .error "using shift_ist requires paranoid=1" + .endif + + ASM_CLAC + PARAVIRT_ADJUST_EXCEPTION_FRAME + + .ifeq \has_error_code + pushq $-1 /* ORIG_RAX: no syscall to restart */ + .endif + + ALLOC_PT_GPREGS_ON_STACK + + .if \paranoid + .if \paranoid == 1 + testb $3, CS(%rsp) /* If coming from userspace, switch */ + jnz 1f /* stacks. */ + .endif + call paranoid_entry + .else + call error_entry + .endif + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ + + .if \paranoid + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + .else + TRACE_IRQS_OFF + .endif + .endif + + movq %rsp,%rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi,%esi /* no error code */ + .endif + + .if \shift_ist != -1 + subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + .endif + + call \do_sym + + .if \shift_ist != -1 + addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + .endif + + /* these procedures expect "no swapgs" flag in ebx */ + .if \paranoid + jmp paranoid_exit + .else + jmp error_exit + .endif + + .if \paranoid == 1 + /* + * Paranoid entry from userspace. Switch stacks and treat it + * as a normal entry. 
This means that paranoid handlers + * run in real process context if user_mode(regs). + */ +1: + call error_entry + + + movq %rsp,%rdi /* pt_regs pointer */ + call sync_regs + movq %rax,%rsp /* switch stack */ + + movq %rsp,%rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi,%esi /* no error code */ + .endif + + call \do_sym + + jmp error_exit /* %ebx: no swapgs flag */ + .endif +END(\sym) +.endm + +#ifdef CONFIG_TRACING +.macro trace_idtentry sym do_sym has_error_code:req +idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code +idtentry \sym \do_sym has_error_code=\has_error_code +.endm +#else +.macro trace_idtentry sym do_sym has_error_code:req +idtentry \sym \do_sym has_error_code=\has_error_code +.endm +#endif + +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault do_double_fault has_error_code=1 paranoid=2 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 + + + /* Reload gs selector with exception handling */ + /* edi: new selector */ +ENTRY(native_load_gs_index) + pushfq + DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) + SWAPGS +gs_change: + movl %edi,%gs +2: mfence /* workaround */ + SWAPGS + popfq + ret +END(native_load_gs_index) + + _ASM_EXTABLE(gs_change,bad_gs) + .section .fixup,"ax" + /* running with kernelgs */ +bad_gs: + SWAPGS /* switch back to user gs */ + xorl %eax,%eax + movl %eax,%gs + jmp 2b + .previous + +/* Call softirq on interrupt stack. Interrupts are off. */ +ENTRY(do_softirq_own_stack) + pushq %rbp + mov %rsp,%rbp + incl PER_CPU_VAR(irq_count) + cmove PER_CPU_VAR(irq_stack_ptr),%rsp + push %rbp # backlink for old unwinder + call __do_softirq + leaveq + decl PER_CPU_VAR(irq_count) + ret +END(do_softirq_own_stack) + +#ifdef CONFIG_XEN +idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 + +/* + * A note on the "critical region" in our callback handler. + * We want to avoid stacking callback handlers due to events occurring + * during handling of the last event. To do this, we keep events disabled + * until we've done all processing. HOWEVER, we must enable events before + * popping the stack frame (can't be done atomically) and so it would still + * be possible to get enough handler activations to overflow the stack. + * Although unlikely, bugs of that kind are hard to track down, so we'd + * like to avoid the possibility. + * So, on entry to the handler we detect whether we interrupted an + * existing activation in its critical region -- if so, we pop the current + * activation and restart the handler using the previous one. 
+ */ +ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) +/* + * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + * see the correct pointer to the pt_regs + */ + movq %rdi, %rsp # we don't return, adjust the stack frame +11: incl PER_CPU_VAR(irq_count) + movq %rsp,%rbp + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp + pushq %rbp # backlink for old unwinder + call xen_evtchn_do_upcall + popq %rsp + decl PER_CPU_VAR(irq_count) +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif + jmp error_exit +END(xen_do_hypervisor_callback) + +/* + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we do not need to fix up as Xen has already reloaded all segment + * registers that could be reloaded and zeroed the others. + * Category 2 we fix up by killing the current process. We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by comparing each saved segment register + * with its current contents: any discrepancy means we in category 1. + */ +ENTRY(xen_failsafe_callback) + movl %ds,%ecx + cmpw %cx,0x10(%rsp) + jne 1f + movl %es,%ecx + cmpw %cx,0x18(%rsp) + jne 1f + movl %fs,%ecx + cmpw %cx,0x20(%rsp) + jne 1f + movl %gs,%ecx + cmpw %cx,0x28(%rsp) + jne 1f + /* All segments match their saved values => Category 2 (Bad IRET). */ + movq (%rsp),%rcx + movq 8(%rsp),%r11 + addq $0x30,%rsp + pushq $0 /* RIP */ + pushq %r11 + pushq %rcx + jmp general_protection +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ + movq (%rsp),%rcx + movq 8(%rsp),%r11 + addq $0x30,%rsp + pushq $-1 /* orig_ax = -1 => not a system call */ + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS + SAVE_EXTRA_REGS + jmp error_exit +END(xen_failsafe_callback) + +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + xen_hvm_callback_vector xen_evtchn_do_upcall + +#endif /* CONFIG_XEN */ + +#if IS_ENABLED(CONFIG_HYPERV) +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + hyperv_callback_vector hyperv_vector_handler +#endif /* CONFIG_HYPERV */ + +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 +#ifdef CONFIG_XEN +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 +#endif +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 +#ifdef CONFIG_KVM_GUEST +idtentry async_page_fault do_async_page_fault has_error_code=1 +#endif +#ifdef CONFIG_X86_MCE +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +#endif + +/* + * Save all registers in pt_regs, and switch gs if needed. + * Use slow, but surefire "are we in kernel?" check. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + */ +ENTRY(paranoid_entry) + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret +END(paranoid_entry) + +/* + * "Paranoid" exit path from exception stack. This is invoked + * only on return from non-NMI IST interrupts that came + * from kernel space. 
+ * + * We may be returning to very strange contexts (e.g. very early + * in syscall entry), so checking for preemption here would + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. + */ +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ +ENTRY(paranoid_exit) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF_DEBUG + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_exit_no_swapgs + TRACE_IRQS_IRETQ + SWAPGS_UNSAFE_STACK + jmp paranoid_exit_restore +paranoid_exit_no_swapgs: + TRACE_IRQS_IRETQ_DEBUG +paranoid_exit_restore: + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 + INTERRUPT_RETURN +END(paranoid_exit) + +/* + * Save all registers in pt_regs, and switch gs if needed. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + */ +ENTRY(error_entry) + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 + xorl %ebx,%ebx + testb $3, CS+8(%rsp) + jz error_kernelspace +error_swapgs: + SWAPGS +error_sti: + TRACE_IRQS_OFF + ret + + /* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. B stepping K8s sometimes report a + * truncated RIP for IRET exceptions returning to compat mode. Check + * for these here too. + */ +error_kernelspace: + incl %ebx + leaq native_irq_return_iret(%rip),%rcx + cmpq %rcx,RIP+8(%rsp) + je error_bad_iret + movl %ecx,%eax /* zero extend */ + cmpq %rax,RIP+8(%rsp) + je bstep_iret + cmpq $gs_change,RIP+8(%rsp) + je error_swapgs + jmp error_sti + +bstep_iret: + /* Fix truncated RIP */ + movq %rcx,RIP+8(%rsp) + /* fall through */ + +error_bad_iret: + SWAPGS + mov %rsp,%rdi + call fixup_bad_iret + mov %rax,%rsp + decl %ebx /* Return to usergs */ + jmp error_sti +END(error_entry) + + +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ +ENTRY(error_exit) + movl %ebx,%eax + RESTORE_EXTRA_REGS + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %eax,%eax + jnz retint_kernel + jmp retint_user +END(error_exit) + +/* Runs on exception stack */ +ENTRY(nmi) + PARAVIRT_ADJUST_EXCEPTION_FRAME + /* + * We allow breakpoints in NMIs. If a breakpoint occurs, then + * the iretq it performs will take us out of NMI context. + * This means that we can have nested NMIs where the next + * NMI is using the top of the stack of the previous NMI. We + * can't let it execute because the nested NMI will corrupt the + * stack of the previous NMI. NMI handlers are not re-entrant + * anyway. + * + * To handle this case we do the following: + * Check the a special location on the stack that contains + * a variable that is set when NMIs are executing. + * The interrupted task's stack is also checked to see if it + * is an NMI stack. + * If the variable is not set and the stack is not the NMI + * stack then: + * o Set the special variable on the stack + * o Copy the interrupt frame into a "saved" location on the stack + * o Copy the interrupt frame into a "copy" location on the stack + * o Continue processing the NMI + * If the variable is set or the previous stack is the NMI stack: + * o Modify the "copy" location to jump to the repeate_nmi + * o return back to the first NMI + * + * Now on exit of the first NMI, we first clear the stack variable + * The NMI stack will tell any nested NMIs at that point that it is + * nested. Then we pop the stack normally with iret, and if there was + * a nested NMI that updated the copy interrupt stack frame, a + * jump will be made to the repeat_nmi code that will handle the second + * NMI. 
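
The nesting rules spelled out in the comment above are easier to follow as plain C. The sketch below is only an illustrative model — every name in it is invented, it is not kernel code, and it leaves out the "was the previous stack the NMI stack?" check that closes the race the comment mentions:

        #include <stdbool.h>
        #include <stdio.h>

        struct frame { unsigned long ip; };

        static bool nmi_executing;                   /* "NMI executing" stack variable */
        static struct frame saved_frame;             /* pristine copy, used for fix-ups */
        static struct frame copied_frame;            /* copy a nested NMI may redirect */
        static const unsigned long repeat_nmi_ip = 0x1000; /* stands in for repeat_nmi */

        static void nmi(struct frame interrupted)
        {
                if (nmi_executing) {
                        /* Nested NMI: make the first NMI repeat itself, then leave. */
                        copied_frame.ip = repeat_nmi_ip;
                        return;
                }

                nmi_executing = true;
                saved_frame = interrupted;
                do {
                        copied_frame = saved_frame;  /* repeat_nmi restores the copy */
                        printf("NMI handler runs for ip=%#lx\n", interrupted.ip);
                } while (copied_frame.ip == repeat_nmi_ip); /* redirected meanwhile? */
                nmi_executing = false;               /* then "iret" through copied_frame */
        }

        int main(void)
        {
                nmi((struct frame){ .ip = 0x4000 });
                return 0;
        }
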
+ */ + + /* Use %rdx as our temp variable throughout */ + pushq %rdx + + /* + * If %cs was not the kernel segment, then the NMI triggered in user + * space, which means it is definitely not nested. + */ + cmpl $__KERNEL_CS, 16(%rsp) + jne first_nmi + + /* + * Check the special variable on the stack to see if NMIs are + * executing. + */ + cmpl $1, -8(%rsp) + je nested_nmi + + /* + * Now test if the previous stack was an NMI stack. + * We need the double check. We check the NMI stack to satisfy the + * race when the first NMI clears the variable before returning. + * We check the variable because the first NMI could be in a + * breakpoint routine using a breakpoint stack. + */ + lea 6*8(%rsp), %rdx + /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ + cmpq %rdx, 4*8(%rsp) + /* If the stack pointer is above the NMI stack, this is a normal NMI */ + ja first_nmi + subq $EXCEPTION_STKSZ, %rdx + cmpq %rdx, 4*8(%rsp) + /* If it is below the NMI stack, it is a normal NMI */ + jb first_nmi + /* Ah, it is within the NMI stack, treat it as nested */ + +nested_nmi: + /* + * Do nothing if we interrupted the fixup in repeat_nmi. + * It's about to repeat the NMI handler, so we are fine + * with ignoring this one. + */ + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out + +1: + /* Set up the interrupted NMIs stack to jump to repeat_nmi */ + leaq -1*8(%rsp), %rdx + movq %rdx, %rsp + leaq -10*8(%rsp), %rdx + pushq $__KERNEL_DS + pushq %rdx + pushfq + pushq $__KERNEL_CS + pushq $repeat_nmi + + /* Put stack back */ + addq $(6*8), %rsp + +nested_nmi_out: + popq %rdx + + /* No need to check faults here */ + INTERRUPT_RETURN + +first_nmi: + /* + * Because nested NMIs will use the pushed location that we + * stored in rdx, we must keep that space available. + * Here's what our stack frame will look like: + * +-------------------------+ + * | original SS | + * | original Return RSP | + * | original RFLAGS | + * | original CS | + * | original RIP | + * +-------------------------+ + * | temp storage for rdx | + * +-------------------------+ + * | NMI executing variable | + * +-------------------------+ + * | copied SS | + * | copied Return RSP | + * | copied RFLAGS | + * | copied CS | + * | copied RIP | + * +-------------------------+ + * | Saved SS | + * | Saved Return RSP | + * | Saved RFLAGS | + * | Saved CS | + * | Saved RIP | + * +-------------------------+ + * | pt_regs | + * +-------------------------+ + * + * The saved stack frame is used to fix up the copied stack frame + * that a nested NMI may change to make the interrupted NMI iret jump + * to the repeat_nmi. The original stack frame and the temp storage + * is also used by nested NMIs and can not be trusted on exit. + */ + /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ + movq (%rsp), %rdx + + /* Set the NMI executing variable on the stack. */ + pushq $1 + + /* + * Leave room for the "copied" frame + */ + subq $(5*8), %rsp + + /* Copy the stack frame to the Saved frame */ + .rept 5 + pushq 11*8(%rsp) + .endr + + /* Everything up to here is safe from nested NMIs */ + + /* + * If there was a nested NMI, the first NMI's iret will return + * here. But NMIs are still enabled and we can take another + * nested NMI. The nested NMI checks the interrupted RIP to see + * if it is between repeat_nmi and end_repeat_nmi, and if so + * it will just return, as we are about to repeat an NMI anyway. 
+ * This makes it safe to copy to the stack frame that a nested + * NMI will update. + */ +repeat_nmi: + /* + * Update the stack variable to say we are still in NMI (the update + * is benign for the non-repeat case, where 1 was pushed just above + * to this very stack slot). + */ + movq $1, 10*8(%rsp) + + /* Make another copy, this one may be modified by nested NMIs */ + addq $(10*8), %rsp + .rept 5 + pushq -6*8(%rsp) + .endr + subq $(5*8), %rsp +end_repeat_nmi: + + /* + * Everything below this point can be preempted by a nested + * NMI if the first NMI took an exception and reset our iret stack + * so that we repeat another NMI. + */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ + ALLOC_PT_GPREGS_ON_STACK + + /* + * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit + * as we should not be calling schedule in NMI context. + * Even with normal interrupts enabled. An NMI should not be + * setting NEED_RESCHED or anything that normal interrupts and + * exceptions might do. + */ + call paranoid_entry + + /* + * Save off the CR2 register. If we take a page fault in the NMI then + * it could corrupt the CR2 value. If the NMI preempts a page fault + * handler before it was able to read the CR2 register, and then the + * NMI itself takes a page fault, the page fault that was preempted + * will read the information from the NMI page fault and not the + * origin fault. Save it off and restore it if it changes. + * Use the r12 callee-saved register. + */ + movq %cr2, %r12 + + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + movq %rsp,%rdi + movq $-1,%rsi + call do_nmi + + /* Did the NMI take a page fault? Restore cr2 if it did */ + movq %cr2, %rcx + cmpq %rcx, %r12 + je 1f + movq %r12, %cr2 +1: + testl %ebx,%ebx /* swapgs needed? */ + jnz nmi_restore +nmi_swapgs: + SWAPGS_UNSAFE_STACK +nmi_restore: + RESTORE_EXTRA_REGS + RESTORE_C_REGS + /* Pop the extra iret frame at once */ + REMOVE_PT_GPREGS_FROM_STACK 6*8 + + /* Clear the NMI executing stack variable */ + movq $0, 5*8(%rsp) + jmp irq_return +END(nmi) + +ENTRY(ignore_sysret) + mov $-ENOSYS,%eax + sysret +END(ignore_sysret) + -- cgit v1.1 From 19a433f45135656d065bdf1bbe543796b194ba3a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:00:59 +0200 Subject: x86/asm/entry: Move the compat syscall entry code to arch/x86/entry/ Move the ia32entry.S file over into arch/x86/entry/. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 5 +- arch/x86/entry/ia32entry.S | 522 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 526 insertions(+), 1 deletion(-) create mode 100644 arch/x86/entry/ia32entry.S (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index fa7e0cf..4a62659 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,4 +1,7 @@ # # Makefile for the x86 low level entry code # -obj-y := entry_$(BITS).o +obj-y := entry_$(BITS).o + +obj-$(CONFIG_IA32_EMULATION) += ia32entry.o + diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S new file mode 100644 index 0000000..2be23c7 --- /dev/null +++ b/arch/x86/entry/ia32entry.S @@ -0,0 +1,522 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysexit_audit ia32_ret_from_sys_call +#define sysretl_audit ia32_ret_from_sys_call +#endif + + .section .entry.text, "ax" + + /* clobbers %rax */ + .macro CLEAR_RREGS _r9=rax + xorl %eax,%eax + movq %rax,R11(%rsp) + movq %rax,R10(%rsp) + movq %\_r9,R9(%rsp) + movq %rax,R8(%rsp) + .endm + + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %eax because syscall_trace_enter() returned + * the %rax value we should see. Instead, we just truncate that + * value to 32 bits again as we did on entry from user mode. + * If it's a new value set by user_regset during entry tracing, + * this matches the normal truncation of the user-mode value. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. + */ + .macro LOAD_ARGS32 _r9=0 + .if \_r9 + movl R9(%rsp),%r9d + .endif + movl RCX(%rsp),%ecx + movl RDX(%rsp),%edx + movl RSI(%rsp),%esi + movl RDI(%rsp),%edi + movl %eax,%eax /* zero extension */ + .endm + + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret32) + swapgs + sysretl +ENDPROC(native_usergs_sysret32) +#endif + +/* + * 32bit SYSENTER instruction entry. + * + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. + * IF and VM in rflags are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old rip (!!!) and rflags. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_sysenter_target) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %ebp, %ebp + movl %eax, %eax + + movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %rbp /* pt_regs->sp */ + pushfq /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + cld + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + + /* + * no need to do an access_ok check here because rbp has been + * 32bit zero extended + */ + ASM_STAC +1: movl (%rbp),%ebp + _ASM_EXTABLE(1b,ia32_badarg) + ASM_CLAC + + /* + * Sysenter doesn't filter flags, so we need to clear NT + * ourselves. To save a few cycles, we can check whether + * NT was set instead of doing an unconditional popfq. 
+ */ + testl $X86_EFLAGS_NT,EFLAGS(%rsp) + jnz sysenter_fix_flags +sysenter_flags_fixed: + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysenter_tracesys + +sysenter_do_call: + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ +sysenter_dispatch: + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysexit_audit +sysexit_from_sys_call: + /* + * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an + * NMI between STI and SYSEXIT has poorly specified behavior, + * and and NMI followed by an IRQ with usergs is fatal. So + * we just pretend we're using SYSEXIT but we really use + * SYSRETL instead. + * + * This code path is still called 'sysexit' because it pairs + * with 'sysenter' and it uses the SYSENTER calling convention. + */ + andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RIP(%rsp),%ecx /* User %eip */ + RESTORE_RSI_RDI + xorl %edx,%edx /* avoid info leaks */ + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + movl EFLAGS(%rsp),%r11d /* User eflags */ + TRACE_IRQS_ON + + /* + * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, + * since it avoids a dicey window with interrupts enabled. + */ + movl RSP(%rsp),%esp + + /* + * USERGS_SYSRET32 does: + * gsbase = user's gs base + * eip = ecx + * rflags = r11 + * cs = __USER32_CS + * ss = __USER_DS + * + * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: + * + * pop %ebp + * pop %edx + * pop %ecx + * + * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to + * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's + * address (already known to user code), and R12-R15 are + * callee-saved and therefore don't contain any interesting + * kernel data. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + movl %esi,%r8d /* 5th arg: 4th syscall arg */ + movl %ecx,%r9d /*swap with edx*/ + movl %edx,%ecx /* 4th arg: 3rd syscall arg */ + movl %r9d,%edx /* 3rd arg: 2nd syscall arg */ + movl %ebx,%esi /* 2nd arg: 1st syscall arg */ + movl %eax,%edi /* 1st arg: syscall number */ + call __audit_syscall_entry + movl ORIG_RAX(%rsp),%eax /* reload syscall number */ + movl %ebx,%edi /* reload 1st syscall arg */ + movl RCX(%rsp),%esi /* reload 2nd syscall arg */ + movl RDX(%rsp),%edx /* reload 3rd syscall arg */ + movl RSI(%rsp),%ecx /* reload 4th syscall arg */ + movl RDI(%rsp),%r8d /* reload 5th syscall arg */ + .endm + + .macro auditsys_exit exit + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_ret_from_sys_call + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movl %eax,%esi /* second arg, syscall return value */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? 
*/ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movq RAX(%rsp),%rax /* reload syscall return value */ + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz \exit + CLEAR_RREGS + jmp int_with_check + .endm + +sysenter_auditsys: + auditsys_entry_common + movl %ebp,%r9d /* reload 6th syscall arg */ + jmp sysenter_dispatch + +sysexit_audit: + auditsys_exit sysexit_from_sys_call +#endif + +sysenter_fix_flags: + pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + popfq + jmp sysenter_flags_fixed + +sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz sysenter_auditsys +#endif + SAVE_EXTRA_REGS + CLEAR_RREGS + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS + jmp sysenter_do_call +ENDPROC(ia32_sysenter_target) + +/* + * 32bit SYSCALL instruction entry. + * + * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Note: rflags saving+masking-with-MSR happens only in Long mode + * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * + * Arguments: + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_cstar_target) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. 
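
For contrast with the SYSENTER/SYSCALL fast paths being set up here, the int $0x80 path these comments refer to can be invoked directly from 32-bit userspace. A minimal, illustrative example — it assumes gcc -m32 (add -fno-pie on toolchains that default to PIE, since %ebx doubles as the PIC register there) and that __NR_write is 4 on i386:

        int main(void)
        {
                const char msg[] = "hello via int $0x80\n";
                long ret;

                asm volatile ("int $0x80"
                              : "=a" (ret)            /* eax: return value  */
                              : "a" (4),              /* eax: __NR_write    */
                                "b" (1),              /* ebx: arg1 (stdout) */
                                "c" (msg),            /* ecx: arg2 (buffer) */
                                "d" (sizeof(msg) - 1) /* edx: arg3 (count)  */
                              : "memory");
                return ret < 0 ? 1 : 0;
        }
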
+ */ + SWAPGS_UNSAFE_STACK + movl %esp,%r8d + movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax,%eax + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rbp /* pt_regs->cx */ + movl %ebp,%ecx + pushq $-ENOSYS /* pt_regs->ax */ + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + + /* + * no need to do an access_ok check here because r8 has been + * 32bit zero extended + */ + ASM_STAC +1: movl (%r8),%r9d + _ASM_EXTABLE(1b,ia32_badarg) + ASM_CLAC + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz cstar_tracesys + +cstar_do_call: + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + /* r9 already loaded */ /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ +cstar_dispatch: + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysretl_audit +sysretl_from_sys_call: + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + RESTORE_RSI_RDI_RDX + movl RIP(%rsp),%ecx + movl EFLAGS(%rsp),%r11d + xorq %r10,%r10 + xorq %r9,%r9 + xorq %r8,%r8 + TRACE_IRQS_ON + movl RSP(%rsp),%esp + /* + * 64bit->32bit SYSRET restores eip from ecx, + * eflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * (Note: 32bit->32bit SYSRET is different: since r11 + * does not exist, it merely sets eflags.IF=1). + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we must + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: + movl %r9d,R9(%rsp) /* register to be clobbered by call */ + auditsys_entry_common + movl R9(%rsp),%r9d /* reload 6th syscall arg */ + jmp cstar_dispatch + +sysretl_audit: + auditsys_exit sysretl_from_sys_call +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz cstar_auditsys +#endif + xchgl %r9d,%ebp + SAVE_EXTRA_REGS + CLEAR_RREGS r9 + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS + xchgl %ebp,%r9d + jmp cstar_do_call +END(ia32_cstar_target) + +ia32_badarg: + ASM_CLAC + movq $-EFAULT,%rax + jmp ia32_sysret + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 (note: not saved in the stack frame, should not be touched) + * + * Notes: + * Uses the same stack frame as the x86-64 version. 
+ * All registers except eax must be saved (but ptrace may violate that). + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts off. + */ + +ENTRY(ia32_syscall) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + PARAVIRT_ADJUST_EXCEPTION_FRAME + SWAPGS + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax,%eax + + /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + cld + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_tracesys +ia32_do_call: + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative +ia32_sysret: + movq %rax,RAX(%rsp) +1: +ia32_ret_from_sys_call: + CLEAR_RREGS + jmp int_ret_from_sys_call + +ia32_tracesys: + SAVE_EXTRA_REGS + CLEAR_RREGS + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS + jmp ia32_do_call +END(ia32_syscall) + + .macro PTREGSCALL label, func + ALIGN +GLOBAL(\label) + leaq \func(%rip),%rax + jmp ia32_ptregs_common + .endm + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn + PTREGSCALL stub32_sigreturn, sys32_sigreturn + PTREGSCALL stub32_fork, sys_fork + PTREGSCALL stub32_vfork, sys_vfork + + ALIGN +GLOBAL(stub32_clone) + leaq sys_clone(%rip),%rax + mov %r8, %rcx + jmp ia32_ptregs_common + + ALIGN +ia32_ptregs_common: + SAVE_EXTRA_REGS 8 + call *%rax + RESTORE_EXTRA_REGS 8 + ret +END(ia32_ptregs_common) -- cgit v1.1 From d603c8e184d882cc3f55c599f4185bad956ee0a7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:05:44 +0200 Subject: x86/asm/entry, x86/vdso: Move the vDSO code to arch/x86/entry/vdso/ Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 2 + arch/x86/entry/vdso/.gitignore | 7 + arch/x86/entry/vdso/Makefile | 209 +++++++++++++++ arch/x86/entry/vdso/checkundef.sh | 10 + arch/x86/entry/vdso/vclock_gettime.c | 351 +++++++++++++++++++++++++ arch/x86/entry/vdso/vdso-layout.lds.S | 118 +++++++++ arch/x86/entry/vdso/vdso-note.S | 12 + arch/x86/entry/vdso/vdso.lds.S | 29 ++ arch/x86/entry/vdso/vdso2c.c | 253 ++++++++++++++++++ arch/x86/entry/vdso/vdso2c.h | 175 ++++++++++++ arch/x86/entry/vdso/vdso32-setup.c | 120 +++++++++ arch/x86/entry/vdso/vdso32/.gitignore | 1 + arch/x86/entry/vdso/vdso32/int80.S | 56 ++++ arch/x86/entry/vdso/vdso32/note.S | 44 ++++ arch/x86/entry/vdso/vdso32/sigreturn.S | 145 ++++++++++ arch/x86/entry/vdso/vdso32/syscall.S | 75 ++++++ arch/x86/entry/vdso/vdso32/sysenter.S | 116 ++++++++ arch/x86/entry/vdso/vdso32/vclock_gettime.c | 30 +++ arch/x86/entry/vdso/vdso32/vdso-fakesections.c | 1 + arch/x86/entry/vdso/vdso32/vdso32.lds.S | 37 +++ arch/x86/entry/vdso/vdsox32.lds.S | 25 ++ arch/x86/entry/vdso/vgetcpu.c | 28 ++ arch/x86/entry/vdso/vma.c | 300 +++++++++++++++++++++ 23 files changed, 2144 insertions(+) create mode 100644 arch/x86/entry/vdso/.gitignore create mode 100644 arch/x86/entry/vdso/Makefile create mode 100755 arch/x86/entry/vdso/checkundef.sh create mode 100644 arch/x86/entry/vdso/vclock_gettime.c create mode 100644 arch/x86/entry/vdso/vdso-layout.lds.S create mode 100644 arch/x86/entry/vdso/vdso-note.S create mode 100644 arch/x86/entry/vdso/vdso.lds.S create mode 100644 arch/x86/entry/vdso/vdso2c.c create mode 100644 arch/x86/entry/vdso/vdso2c.h create mode 100644 arch/x86/entry/vdso/vdso32-setup.c create mode 100644 arch/x86/entry/vdso/vdso32/.gitignore create mode 100644 arch/x86/entry/vdso/vdso32/int80.S create mode 100644 arch/x86/entry/vdso/vdso32/note.S create mode 100644 arch/x86/entry/vdso/vdso32/sigreturn.S create mode 100644 arch/x86/entry/vdso/vdso32/syscall.S create mode 100644 arch/x86/entry/vdso/vdso32/sysenter.S create mode 100644 arch/x86/entry/vdso/vdso32/vclock_gettime.c create mode 100644 arch/x86/entry/vdso/vdso32/vdso-fakesections.c create mode 100644 arch/x86/entry/vdso/vdso32/vdso32.lds.S create mode 100644 arch/x86/entry/vdso/vdsox32.lds.S create mode 100644 arch/x86/entry/vdso/vgetcpu.c create mode 100644 arch/x86/entry/vdso/vma.c (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 4a62659..c975324 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -3,5 +3,7 @@ # obj-y := entry_$(BITS).o +obj-y += vdso/ + obj-$(CONFIG_IA32_EMULATION) += ia32entry.o diff --git a/arch/x86/entry/vdso/.gitignore b/arch/x86/entry/vdso/.gitignore new file mode 100644 index 0000000..aae8ffd --- /dev/null +++ b/arch/x86/entry/vdso/.gitignore @@ -0,0 +1,7 @@ +vdso.lds +vdsox32.lds +vdso32-syscall-syms.lds +vdso32-sysenter-syms.lds +vdso32-int80-syms.lds +vdso-image-*.c +vdso2c diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile new file mode 100644 index 0000000..e970320 --- /dev/null +++ b/arch/x86/entry/vdso/Makefile @@ -0,0 +1,209 @@ +# +# Building vDSO images for x86. 
+# + +KBUILD_CFLAGS += $(DISABLE_LTO) +KASAN_SANITIZE := n + +VDSO64-$(CONFIG_X86_64) := y +VDSOX32-$(CONFIG_X86_X32_ABI) := y +VDSO32-$(CONFIG_X86_32) := y +VDSO32-$(CONFIG_COMPAT) := y + +# files to link into the vdso +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o + +# files to link into kernel +obj-y += vma.o + +# vDSO images to build +vdso_img-$(VDSO64-y) += 64 +vdso_img-$(VDSOX32-y) += x32 +vdso_img-$(VDSO32-y) += 32-int80 +vdso_img-$(CONFIG_COMPAT) += 32-syscall +vdso_img-$(VDSO32-y) += 32-sysenter + +obj-$(VDSO32-y) += vdso32-setup.o + +vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) + +$(obj)/vdso.o: $(obj)/vdso.so + +targets += vdso.lds $(vobjs-y) + +# Build the vDSO image C files and link them in. +vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) +vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) +vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) +obj-y += $(vdso_img_objs) +targets += $(vdso_img_cfiles) +targets += $(vdso_img_sodbg) +.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \ + $(vdso_img-y:%=$(obj)/vdso%.so) + +export CPPFLAGS_vdso.lds += -P -C + +VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ + -Wl,--no-undefined \ + -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \ + $(DISABLE_LTO) + +$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE + $(call if_changed,vdso) + +HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/x86/include/uapi +hostprogs-y += vdso2c + +quiet_cmd_vdso2c = VDSO2C $@ +define cmd_vdso2c + $(obj)/vdso2c $< $(<:%.dbg=%) $@ +endef + +$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE + $(call if_changed,vdso2c) + +# +# Don't omit frame pointers for ease of userspace debugging, but do +# optimize sibling calls. +# +CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ + $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ + -fno-omit-frame-pointer -foptimize-sibling-calls \ + -DDISABLE_BRANCH_PROFILING + +$(vobjs): KBUILD_CFLAGS += $(CFL) + +# +# vDSO code runs in userspace and -pg doesn't help with profiling anyway. +# +CFLAGS_REMOVE_vdso-note.o = -pg +CFLAGS_REMOVE_vclock_gettime.o = -pg +CFLAGS_REMOVE_vgetcpu.o = -pg +CFLAGS_REMOVE_vvar.o = -pg + +# +# X32 processes use x32 vDSO to access 64bit kernel data. +# +# Build x32 vDSO image: +# 1. Compile x32 vDSO as 64bit. +# 2. Convert object files to x32. +# 3. Build x32 VDSO image with x32 objects, which contains 64bit codes +# so that it can reach 64bit address space with 64bit pointers. +# + +CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds) +VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \ + -Wl,-soname=linux-vdso.so.1 \ + -Wl,-z,max-page-size=4096 \ + -Wl,-z,common-page-size=4096 + +# 64-bit objects to re-brand as x32 +vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y)) + +# x32-rebranded versions +vobjx32s-y := $(vobjs64-for-x32:.o=-x32.o) + +# same thing, but in the output directory +vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F) + +# Convert 64bit object file to x32 for x32 vDSO. +quiet_cmd_x32 = X32 $@ + cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@ + +$(obj)/%-x32.o: $(obj)/%.o FORCE + $(call if_changed,x32) + +targets += vdsox32.lds $(vobjx32s-y) + +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg + $(call if_changed,objcopy) + +$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE + $(call if_changed,vdso) + +# +# Build multiple 32-bit vDSO images to choose from at boot time. 
+# +vdso32.so-$(VDSO32-y) += int80 +vdso32.so-$(CONFIG_COMPAT) += syscall +vdso32.so-$(VDSO32-y) += sysenter + +vdso32-images = $(vdso32.so-y:%=vdso32-%.so) + +CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) +VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 + +# This makes sure the $(obj) subdirectory exists even though vdso32/ +# is not a kbuild sub-make subdirectory. +override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ + +targets += vdso32/vdso32.lds +targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) +targets += vdso32/vclock_gettime.o + +$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) +$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 + +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic +KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) +KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) +KBUILD_CFLAGS_32 += -fno-omit-frame-pointer +KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING +$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) + +$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ + $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/vclock_gettime.o \ + $(obj)/vdso32/note.o \ + $(obj)/vdso32/%.o + $(call if_changed,vdso) + +# +# The DSO images are built using a special linker script. +# +quiet_cmd_vdso = VDSO $@ + cmd_vdso = $(CC) -nostdlib -o $@ \ + $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ + -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ + sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' + +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ + $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) +GCOV_PROFILE := n + +# +# Install the unstripped copies of vdso*.so. If our toolchain supports +# build-id, install .build-id links as well. +# +quiet_cmd_vdso_install = INSTALL $(@:install_%=%) +define cmd_vdso_install + cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \ + if readelf -n $< |grep -q 'Build ID'; then \ + buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \ + first=`echo $$buildid | cut -b-2`; \ + last=`echo $$buildid | cut -b3-`; \ + mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \ + ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \ + fi +endef + +vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%) + +$(MODLIB)/vdso: FORCE + @mkdir -p $(MODLIB)/vdso + +$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE + $(call cmd,vdso_install) + +PHONY += vdso_install $(vdso_img_insttargets) +vdso_install: $(vdso_img_insttargets) FORCE + +clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/entry/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh new file mode 100755 index 0000000..7ee90a9 --- /dev/null +++ b/arch/x86/entry/vdso/checkundef.sh @@ -0,0 +1,10 @@ +#!/bin/sh +nm="$1" +file="$2" +$nm "$file" | grep '^ *U' > /dev/null 2>&1 +if [ $? 
-eq 1 ]; then + exit 0 +else + echo "$file: undefined symbols found" >&2 + exit 1 +fi diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c new file mode 100644 index 0000000..9793322 --- /dev/null +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -0,0 +1,351 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of clock_gettime, gettimeofday, and time. + * + * 32 Bit compat layer by Stefani Seibold + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * + * The code should have no internal unresolved relocations. + * Check with readelf after changing. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define gtod (&VVAR(vsyscall_gtod_data)) + +extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); +extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); +extern time_t __vdso_time(time_t *t); + +#ifdef CONFIG_HPET_TIMER +extern u8 hpet_page + __attribute__((visibility("hidden"))); + +static notrace cycle_t vread_hpet(void) +{ + return *(const volatile u32 *)(&hpet_page + HPET_COUNTER); +} +#endif + +#ifndef BUILD_VDSO32 + +#include +#include +#include +#include + +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + asm("syscall" : "=a" (ret) : + "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + return ret; +} + +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + return ret; +} + +#ifdef CONFIG_PARAVIRT_CLOCK + +static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) +{ + const struct pvclock_vsyscall_time_info *pvti_base; + int idx = cpu / (PAGE_SIZE/PVTI_SIZE); + int offset = cpu % (PAGE_SIZE/PVTI_SIZE); + + BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); + + pvti_base = (struct pvclock_vsyscall_time_info *) + __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); + + return &pvti_base[offset]; +} + +static notrace cycle_t vread_pvclock(int *mode) +{ + const struct pvclock_vsyscall_time_info *pvti; + cycle_t ret; + u64 last; + u32 version; + u8 flags; + unsigned cpu, cpu1; + + + /* + * Note: hypervisor must guarantee that: + * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. + * 2. that per-CPU pvclock time info is updated if the + * underlying CPU changes. + * 3. that version is increased whenever underlying CPU + * changes. + * + */ + do { + cpu = __getcpu() & VGETCPU_CPU_MASK; + /* TODO: We can put vcpu id into higher bits of pvti.version. + * This will save a couple of cycles by getting rid of + * __getcpu() calls (Gleb). + */ + + pvti = get_pvti(cpu); + + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); + + /* + * Test we're still on the cpu as well as the version. + * We could have been migrated just after the first + * vgetcpu but before fetching the version, so we + * wouldn't notice a version change. 
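
The retry loop above is an instance of the usual version-counter (seqcount-style) read pattern: the producer makes the version odd while it updates, and the reader rereads until it has seen an even, unchanged version. A self-contained illustration of just that pattern — names invented; a real concurrent version also needs memory barriers, and the pvclock variant additionally re-checks the CPU number as described above:

        #include <stdio.h>

        struct sample { unsigned version; long a, b; }; /* odd version = update in flight */

        static void writer_update(struct sample *s, long a, long b)
        {
                s->version++;          /* becomes odd: readers will retry     */
                s->a = a;
                s->b = b;
                s->version++;          /* becomes even again: data consistent */
        }

        static void reader_snapshot(const struct sample *s, long *a, long *b)
        {
                unsigned v;

                do {
                        v = s->version;
                        *a = s->a;
                        *b = s->b;
                        /* Retry if an update was in flight or happened meanwhile. */
                } while ((v & 1) || v != s->version);
        }

        int main(void)
        {
                struct sample s = { 0, 0, 0 };
                long a, b;

                writer_update(&s, 1, 2);
                reader_snapshot(&s, &a, &b);
                printf("a=%ld b=%ld\n", a, b);
                return 0;
        }
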
+ */ + cpu1 = __getcpu() & VGETCPU_CPU_MASK; + } while (unlikely(cpu != cpu1 || + (pvti->pvti.version & 1) || + pvti->pvti.version != version)); + + if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) + *mode = VCLOCK_NONE; + + /* refer to tsc.c read_tsc() comment for rationale */ + last = gtod->cycle_last; + + if (likely(ret >= last)) + return ret; + + return last; +} +#endif + +#else + +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) + : "memory", "edx"); + return ret; +} + +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) + : "memory", "edx"); + return ret; +} + +#ifdef CONFIG_PARAVIRT_CLOCK + +static notrace cycle_t vread_pvclock(int *mode) +{ + *mode = VCLOCK_NONE; + return 0; +} +#endif + +#endif + +notrace static cycle_t vread_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)__native_read_tsc(); + + last = gtod->cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} + +notrace static inline u64 vgetsns(int *mode) +{ + u64 v; + cycles_t cycles; + + if (gtod->vclock_mode == VCLOCK_TSC) + cycles = vread_tsc(); +#ifdef CONFIG_HPET_TIMER + else if (gtod->vclock_mode == VCLOCK_HPET) + cycles = vread_hpet(); +#endif +#ifdef CONFIG_PARAVIRT_CLOCK + else if (gtod->vclock_mode == VCLOCK_PVCLOCK) + cycles = vread_pvclock(mode); +#endif + else + return 0; + v = (cycles - gtod->cycle_last) & gtod->mask; + return v * gtod->mult; +} + +/* Code size doesn't matter (vdso is 4k anyway) and this is faster. 
*/ +notrace static int __always_inline do_realtime(struct timespec *ts) +{ + unsigned long seq; + u64 ns; + int mode; + + do { + seq = gtod_read_begin(gtod); + mode = gtod->vclock_mode; + ts->tv_sec = gtod->wall_time_sec; + ns = gtod->wall_time_snsec; + ns += vgetsns(&mode); + ns >>= gtod->shift; + } while (unlikely(gtod_read_retry(gtod, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + +notrace static int __always_inline do_monotonic(struct timespec *ts) +{ + unsigned long seq; + u64 ns; + int mode; + + do { + seq = gtod_read_begin(gtod); + mode = gtod->vclock_mode; + ts->tv_sec = gtod->monotonic_time_sec; + ns = gtod->monotonic_time_snsec; + ns += vgetsns(&mode); + ns >>= gtod->shift; + } while (unlikely(gtod_read_retry(gtod, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + +notrace static void do_realtime_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = gtod_read_begin(gtod); + ts->tv_sec = gtod->wall_time_coarse_sec; + ts->tv_nsec = gtod->wall_time_coarse_nsec; + } while (unlikely(gtod_read_retry(gtod, seq))); +} + +notrace static void do_monotonic_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = gtod_read_begin(gtod); + ts->tv_sec = gtod->monotonic_time_coarse_sec; + ts->tv_nsec = gtod->monotonic_time_coarse_nsec; + } while (unlikely(gtod_read_retry(gtod, seq))); +} + +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +{ + switch (clock) { + case CLOCK_REALTIME: + if (do_realtime(ts) == VCLOCK_NONE) + goto fallback; + break; + case CLOCK_MONOTONIC: + if (do_monotonic(ts) == VCLOCK_NONE) + goto fallback; + break; + case CLOCK_REALTIME_COARSE: + do_realtime_coarse(ts); + break; + case CLOCK_MONOTONIC_COARSE: + do_monotonic_coarse(ts); + break; + default: + goto fallback; + } + + return 0; +fallback: + return vdso_fallback_gettime(clock, ts); +} +int clock_gettime(clockid_t, struct timespec *) + __attribute__((weak, alias("__vdso_clock_gettime"))); + +notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + if (likely(tv != NULL)) { + if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE)) + return vdso_fallback_gtod(tv, tz); + tv->tv_usec /= 1000; + } + if (unlikely(tz != NULL)) { + tz->tz_minuteswest = gtod->tz_minuteswest; + tz->tz_dsttime = gtod->tz_dsttime; + } + + return 0; +} +int gettimeofday(struct timeval *, struct timezone *) + __attribute__((weak, alias("__vdso_gettimeofday"))); + +/* + * This will break when the xtime seconds get inaccurate, but that is + * unlikely + */ +notrace time_t __vdso_time(time_t *t) +{ + /* This is atomic on x86 so we don't need any locks. */ + time_t result = ACCESS_ONCE(gtod->wall_time_sec); + + if (t) + *t = result; + return result; +} +int time(time_t *t) + __attribute__((weak, alias("__vdso_time"))); diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S new file mode 100644 index 0000000..de2c921 --- /dev/null +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -0,0 +1,118 @@ +#include + +/* + * Linker script for vDSO. This is an ELF shared object prelinked to + * its virtual address, and with only one read-only segment. + * This script controls its layout. 
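
For reference, this is all a userspace caller has to do to benefit from the functions defined above; with a working vDSO the call is serviced by __vdso_clock_gettime() (through the weak alias or the C library's wrapper) without entering the kernel, and otherwise it transparently falls back to the real syscall (older glibc needs -lrt at link time):

        #include <stdio.h>
        #include <time.h>

        int main(void)
        {
                struct timespec ts;

                if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
                        return 1;
                printf("monotonic: %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
                return 0;
        }
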
+ */ + +#if defined(BUILD_VDSO64) +# define SHDR_SIZE 64 +#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32) +# define SHDR_SIZE 40 +#else +# error unknown VDSO target +#endif + +#define NUM_FAKE_SHDRS 13 + +SECTIONS +{ + /* + * User/kernel shared data is before the vDSO. This may be a little + * uglier than putting it after the vDSO, but it avoids issues with + * non-allocatable things that dangle past the end of the PT_LOAD + * segment. + */ + + vvar_start = . - 2 * PAGE_SIZE; + vvar_page = vvar_start; + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; +#define __VVAR_KERNEL_LDS +#include +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR + + hpet_page = vvar_start + PAGE_SIZE; + + . = SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { + *(.rodata*) + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) + + /* + * Ideally this would live in a C file, but that won't + * work cleanly for x32 until we start building the x32 + * C code using an x32 toolchain. + */ + VDSO_FAKE_SECTION_TABLE_START = .; + . = . + NUM_FAKE_SHDRS * SHDR_SIZE; + VDSO_FAKE_SECTION_TABLE_END = .; + } :text + + .fake_shstrtab : { *(.fake_shstrtab) } :text + + + .note : { *(.note.*) } :text :note + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + + + /* + * Text is well-separated from actual data: there's plenty of + * stuff that isn't used at runtime in between. + */ + + .text : { *(.text*) } :text =0x90909090, + + /* + * At the end so that eu-elflint stays happy when vdso2c strips + * these. A better implementation would avoid allocating space + * for these. + */ + .altinstructions : { *(.altinstructions) } :text + .altinstr_replacement : { *(.altinstr_replacement) } :text + + /DISCARD/ : { + *(.discard) + *(.discard.*) + *(__bug_table) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S new file mode 100644 index 0000000..79a071e --- /dev/null +++ b/arch/x86/entry/vdso/vdso-note.S @@ -0,0 +1,12 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include +#include +#include + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S new file mode 100644 index 0000000..6807932 --- /dev/null +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -0,0 +1,29 @@ +/* + * Linker script for 64-bit vDSO. + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. 
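
Userspace locates the vDSO image itself through the auxiliary vector, and the version script that follows governs which of its symbols may be resolved. A short, illustrative probe using glibc's getauxval(3):

        #include <elf.h>
        #include <stdio.h>
        #include <string.h>
        #include <sys/auxv.h>

        int main(void)
        {
                const char *vdso = (const char *)getauxval(AT_SYSINFO_EHDR);

                if (!vdso || memcmp(vdso, ELFMAG, SELFMAG) != 0) {
                        fprintf(stderr, "no vDSO mapped\n");
                        return 1;
                }
                printf("vDSO ELF image mapped at %p\n", (const void *)vdso);
                return 0;
        }
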
+ */ + +#define BUILD_VDSO64 + +#include "vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + getcpu; + __vdso_getcpu; + time; + __vdso_time; + local: *; + }; +} diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c new file mode 100644 index 0000000..8627db2 --- /dev/null +++ b/arch/x86/entry/vdso/vdso2c.c @@ -0,0 +1,253 @@ +/* + * vdso2c - A vdso image preparation tool + * Copyright (c) 2014 Andy Lutomirski and others + * Licensed under the GPL v2 + * + * vdso2c requires stripped and unstripped input. It would be trivial + * to fully strip the input in here, but, for reasons described below, + * we need to write a section table. Doing this is more or less + * equivalent to dropping all non-allocatable sections, but it's + * easier to let objcopy handle that instead of doing it ourselves. + * If we ever need to do something fancier than what objcopy provides, + * it would be straightforward to add here. + * + * We're keep a section table for a few reasons: + * + * The Go runtime had a couple of bugs: it would read the section + * table to try to figure out how many dynamic symbols there were (it + * shouldn't have looked at the section table at all) and, if there + * were no SHT_SYNDYM section table entry, it would use an + * uninitialized value for the number of symbols. An empty DYNSYM + * table would work, but I see no reason not to write a valid one (and + * keep full performance for old Go programs). This hack is only + * needed on x86_64. + * + * The bug was introduced on 2012-08-31 by: + * https://code.google.com/p/go/source/detail?r=56ea40aac72b + * and was fixed on 2014-06-13 by: + * https://code.google.com/p/go/source/detail?r=fc1cd5e12595 + * + * Binutils has issues debugging the vDSO: it reads the section table to + * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which + * would break build-id if we removed the section table. Binutils + * also requires that shstrndx != 0. See: + * https://sourceware.org/bugzilla/show_bug.cgi?id=17064 + * + * elfutils might not look for PT_NOTE if there is a section table at + * all. I don't know whether this matters for any practical purpose. + * + * For simplicity, rather than hacking up a partial section table, we + * just write a mostly complete one. We omit non-dynamic symbols, + * though, since they're rather large. + * + * Once binutils gets fixed, we might be able to drop this for all but + * the 64-bit vdso, since build-id only works in kernel RPMs, and + * systems that update to new enough kernel RPMs will likely update + * binutils in sync. build-id has never worked for home-built kernel + * RPMs without manual symlinking, and I suspect that no one ever does + * that. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +const char *outfilename; + +/* Symbols that we need in vdso2c. 
*/ +enum { + sym_vvar_start, + sym_vvar_page, + sym_hpet_page, + sym_VDSO_FAKE_SECTION_TABLE_START, + sym_VDSO_FAKE_SECTION_TABLE_END, +}; + +const int special_pages[] = { + sym_vvar_page, + sym_hpet_page, +}; + +struct vdso_sym { + const char *name; + bool export; +}; + +struct vdso_sym required_syms[] = { + [sym_vvar_start] = {"vvar_start", true}, + [sym_vvar_page] = {"vvar_page", true}, + [sym_hpet_page] = {"hpet_page", true}, + [sym_VDSO_FAKE_SECTION_TABLE_START] = { + "VDSO_FAKE_SECTION_TABLE_START", false + }, + [sym_VDSO_FAKE_SECTION_TABLE_END] = { + "VDSO_FAKE_SECTION_TABLE_END", false + }, + {"VDSO32_NOTE_MASK", true}, + {"VDSO32_SYSENTER_RETURN", true}, + {"__kernel_vsyscall", true}, + {"__kernel_sigreturn", true}, + {"__kernel_rt_sigreturn", true}, +}; + +__attribute__((format(printf, 1, 2))) __attribute__((noreturn)) +static void fail(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + fprintf(stderr, "Error: "); + vfprintf(stderr, format, ap); + if (outfilename) + unlink(outfilename); + exit(1); + va_end(ap); +} + +/* + * Evil macros for little-endian reads and writes + */ +#define GLE(x, bits, ifnot) \ + __builtin_choose_expr( \ + (sizeof(*(x)) == bits/8), \ + (__typeof__(*(x)))get_unaligned_le##bits(x), ifnot) + +extern void bad_get_le(void); +#define LAST_GLE(x) \ + __builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le()) + +#define GET_LE(x) \ + GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x)))) + +#define PLE(x, val, bits, ifnot) \ + __builtin_choose_expr( \ + (sizeof(*(x)) == bits/8), \ + put_unaligned_le##bits((val), (x)), ifnot) + +extern void bad_put_le(void); +#define LAST_PLE(x, val) \ + __builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le()) + +#define PUT_LE(x, val) \ + PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val)))) + + +#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) + +#define BITSFUNC3(name, bits, suffix) name##bits##suffix +#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix) +#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, ) + +#define INT_BITS BITSFUNC2(int, ELF_BITS, _t) + +#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x +#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x) +#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x) + +#define ELF_BITS 64 +#include "vdso2c.h" +#undef ELF_BITS + +#define ELF_BITS 32 +#include "vdso2c.h" +#undef ELF_BITS + +static void go(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, + FILE *outfile, const char *name) +{ + Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr; + + if (hdr->e_ident[EI_CLASS] == ELFCLASS64) { + go64(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); + } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) { + go32(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); + } else { + fail("unknown ELF class\n"); + } +} + +static void map_input(const char *name, void **addr, size_t *len, int prot) +{ + off_t tmp_len; + + int fd = open(name, O_RDONLY); + if (fd == -1) + err(1, "%s", name); + + tmp_len = lseek(fd, 0, SEEK_END); + if (tmp_len == (off_t)-1) + err(1, "lseek"); + *len = (size_t)tmp_len; + + *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0); + if (*addr == MAP_FAILED) + err(1, "mmap"); + + close(fd); +} + +int main(int argc, char **argv) +{ + size_t raw_len, stripped_len; + void *raw_addr, *stripped_addr; + FILE *outfile; + char *name, *tmp; + int namelen; + + if (argc != 4) { + printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n"); + return 1; + } + + /* + * Figure out 
the struct name. If we're writing to a .so file, + * generate raw output insted. + */ + name = strdup(argv[3]); + namelen = strlen(name); + if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) { + name = NULL; + } else { + tmp = strrchr(name, '/'); + if (tmp) + name = tmp + 1; + tmp = strchr(name, '.'); + if (tmp) + *tmp = '\0'; + for (tmp = name; *tmp; tmp++) + if (*tmp == '-') + *tmp = '_'; + } + + map_input(argv[1], &raw_addr, &raw_len, PROT_READ); + map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ); + + outfilename = argv[3]; + outfile = fopen(outfilename, "w"); + if (!outfile) + err(1, "%s", argv[2]); + + go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name); + + munmap(raw_addr, raw_len); + munmap(stripped_addr, stripped_len); + fclose(outfile); + + return 0; +} diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h new file mode 100644 index 0000000..0224987 --- /dev/null +++ b/arch/x86/entry/vdso/vdso2c.h @@ -0,0 +1,175 @@ +/* + * This file is included twice from vdso2c.c. It generates code for 32-bit + * and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs + * are built for 32-bit userspace. + */ + +static void BITSFUNC(go)(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, + FILE *outfile, const char *name) +{ + int found_load = 0; + unsigned long load_size = -1; /* Work around bogus warning */ + unsigned long mapping_size; + ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; + int i; + unsigned long j; + ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, + *alt_sec = NULL; + ELF(Dyn) *dyn = 0, *dyn_end = 0; + const char *secstrings; + INT_BITS syms[NSYMS] = {}; + + ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff)); + + /* Walk the segment table. 
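   The vDSO is linked so that a single PT_LOAD segment covers the whole
   image with p_offset == p_vaddr == 0 and filesz == memsz; the checks in
   this loop enforce exactly that, which is what lets the rest of the tool
   treat file offsets and virtual addresses interchangeably.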
*/ + for (i = 0; i < GET_LE(&hdr->e_phnum); i++) { + if (GET_LE(&pt[i].p_type) == PT_LOAD) { + if (found_load) + fail("multiple PT_LOAD segs\n"); + + if (GET_LE(&pt[i].p_offset) != 0 || + GET_LE(&pt[i].p_vaddr) != 0) + fail("PT_LOAD in wrong place\n"); + + if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz)) + fail("cannot handle memsz != filesz\n"); + + load_size = GET_LE(&pt[i].p_memsz); + found_load = 1; + } else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) { + dyn = raw_addr + GET_LE(&pt[i].p_offset); + dyn_end = raw_addr + GET_LE(&pt[i].p_offset) + + GET_LE(&pt[i].p_memsz); + } + } + if (!found_load) + fail("no PT_LOAD seg\n"); + + if (stripped_len < load_size) + fail("stripped input is too short\n"); + + /* Walk the dynamic table */ + for (i = 0; dyn + i < dyn_end && + GET_LE(&dyn[i].d_tag) != DT_NULL; i++) { + typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag); + if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA || + tag == DT_RELENT || tag == DT_TEXTREL) + fail("vdso image contains dynamic relocations\n"); + } + + /* Walk the section table */ + secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx); + secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset); + for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { + ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize) * i; + if (GET_LE(&sh->sh_type) == SHT_SYMTAB) + symtab_hdr = sh; + + if (!strcmp(secstrings + GET_LE(&sh->sh_name), + ".altinstructions")) + alt_sec = sh; + } + + if (!symtab_hdr) + fail("no symbol table\n"); + + strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link); + + /* Walk the symbol table */ + for (i = 0; + i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); + i++) { + int k; + ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + + GET_LE(&symtab_hdr->sh_entsize) * i; + const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) + + GET_LE(&sym->st_name); + + for (k = 0; k < NSYMS; k++) { + if (!strcmp(name, required_syms[k].name)) { + if (syms[k]) { + fail("duplicate symbol %s\n", + required_syms[k].name); + } + + /* + * Careful: we use negative addresses, but + * st_value is unsigned, so we rely + * on syms[k] being a signed type of the + * correct width. + */ + syms[k] = GET_LE(&sym->st_value); + } + } + } + + /* Validate mapping addresses. */ + for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { + INT_BITS symval = syms[special_pages[i]]; + + if (!symval) + continue; /* The mapping isn't used; ignore it. 
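   These per-page symbols are deliberately negative: vvar_page and hpet_page
   are laid out below the vDSO text, so each symbol is a small negative
   offset from the start of the text mapping. The checks that follow verify
   page alignment, that each page does not underrun vvar_start, and that it
   ends at or before offset 0, i.e. stays on the vvar side of the text.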
*/ + + if (symval % 4096) + fail("%s must be a multiple of 4096\n", + required_syms[i].name); + if (symval + 4096 < syms[sym_vvar_start]) + fail("%s underruns vvar_start\n", + required_syms[i].name); + if (symval + 4096 > 0) + fail("%s is on the wrong side of the vdso text\n", + required_syms[i].name); + } + if (syms[sym_vvar_start] % 4096) + fail("vvar_begin must be a multiple of 4096\n"); + + if (!name) { + fwrite(stripped_addr, stripped_len, 1, outfile); + return; + } + + mapping_size = (stripped_len + 4095) / 4096 * 4096; + + fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n"); + fprintf(outfile, "#include \n"); + fprintf(outfile, "#include \n"); + fprintf(outfile, "#include \n"); + fprintf(outfile, "\n"); + fprintf(outfile, + "static unsigned char raw_data[%lu] __page_aligned_data = {", + mapping_size); + for (j = 0; j < stripped_len; j++) { + if (j % 10 == 0) + fprintf(outfile, "\n\t"); + fprintf(outfile, "0x%02X, ", + (int)((unsigned char *)stripped_addr)[j]); + } + fprintf(outfile, "\n};\n\n"); + + fprintf(outfile, "static struct page *pages[%lu];\n\n", + mapping_size / 4096); + + fprintf(outfile, "const struct vdso_image %s = {\n", name); + fprintf(outfile, "\t.data = raw_data,\n"); + fprintf(outfile, "\t.size = %lu,\n", mapping_size); + fprintf(outfile, "\t.text_mapping = {\n"); + fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); + fprintf(outfile, "\t\t.pages = pages,\n"); + fprintf(outfile, "\t},\n"); + if (alt_sec) { + fprintf(outfile, "\t.alt = %lu,\n", + (unsigned long)GET_LE(&alt_sec->sh_offset)); + fprintf(outfile, "\t.alt_len = %lu,\n", + (unsigned long)GET_LE(&alt_sec->sh_size)); + } + for (i = 0; i < NSYMS; i++) { + if (required_syms[i].export && syms[i]) + fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n", + required_syms[i].name, (int64_t)syms[i]); + } + fprintf(outfile, "};\n"); +} diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c new file mode 100644 index 0000000..e904c27 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -0,0 +1,120 @@ +/* + * (C) Copyright 2002 Linus Torvalds + * Portions based on the vdso-randomization code from exec-shield: + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar + * + * This file contains the needed initializations to support sysenter. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_COMPAT_VDSO +#define VDSO_DEFAULT 0 +#else +#define VDSO_DEFAULT 1 +#endif + +/* + * Should the kernel map a VDSO page into processes and pass its + * address down to glibc upon exec()? + */ +unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT; + +static int __init vdso32_setup(char *s) +{ + vdso32_enabled = simple_strtoul(s, NULL, 0); + + if (vdso32_enabled > 1) + pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); + + return 1; +} + +/* + * For consistency, the argument vdso32=[012] affects the 32-bit vDSO + * behavior on both 64-bit and 32-bit kernels. + * On 32-bit kernels, vdso=[012] means the same thing. 
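  For reference, the C file that the vdso2c generator above writes out has
  roughly the following shape (all numbers below are invented for
  illustration; the real values depend on the image being converted, and
  for the 64-bit image the derived name works out to vdso_image_64):

	static unsigned char raw_data[8192] __page_aligned_data = {
		0x7F, 0x45, 0x4C, 0x46,	/* ...then the rest of the stripped image */
	};

	static struct page *pages[2];

	const struct vdso_image vdso_image_64 = {
		.data		= raw_data,
		.size		= 8192,
		.text_mapping	= {
			.name	= "[vdso]",
			.pages	= pages,
		},
		.alt		= 3000,
		.alt_len	= 64,
		.sym_vvar_start	= -8192,
	};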
+ */ +__setup("vdso32=", vdso32_setup); + +#ifdef CONFIG_X86_32 +__setup_param("vdso=", vdso_setup, vdso32_setup, 0); +#endif + +#ifdef CONFIG_X86_64 + +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) + +#else /* CONFIG_X86_32 */ + +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) +#define vdso32_syscall() (0) + +#endif /* CONFIG_X86_64 */ + +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +const struct vdso_image *selected_vdso32; +#endif + +int __init sysenter_setup(void) +{ +#ifdef CONFIG_COMPAT + if (vdso32_syscall()) + selected_vdso32 = &vdso_image_32_syscall; + else +#endif + if (vdso32_sysenter()) + selected_vdso32 = &vdso_image_32_sysenter; + else + selected_vdso32 = &vdso_image_32_int80; + + init_vdso_image(selected_vdso32); + + return 0; +} + +#ifdef CONFIG_X86_64 + +subsys_initcall(sysenter_setup); + +#ifdef CONFIG_SYSCTL +/* Register vsyscall32 into the ABI table */ +#include + +static struct ctl_table abi_table2[] = { + { + .procname = "vsyscall32", + .data = &vdso32_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + {} +}; + +static struct ctl_table abi_root_table2[] = { + { + .procname = "abi", + .mode = 0555, + .child = abi_table2 + }, + {} +}; + +static __init int ia32_binfmt_init(void) +{ + register_sysctl_table(abi_root_table2); + return 0; +} +__initcall(ia32_binfmt_init); +#endif /* CONFIG_SYSCTL */ + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/entry/vdso/vdso32/.gitignore b/arch/x86/entry/vdso/vdso32/.gitignore new file mode 100644 index 0000000..e45fba9 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/.gitignore @@ -0,0 +1 @@ +vdso32.lds diff --git a/arch/x86/entry/vdso/vdso32/int80.S b/arch/x86/entry/vdso/vdso32/int80.S new file mode 100644 index 0000000..b15b7c0 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/int80.S @@ -0,0 +1,56 @@ +/* + * Code for the vDSO. This version uses the old int $0x80 method. + * + * First get the common code for the sigreturn entry points. + * This must come first. + */ +#include "sigreturn.S" + + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: +.LSTART_vsyscall: + int $0x80 + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI: + .long .LENDCIEDLSI-.LSTARTCIEDLSI +.LSTARTCIEDLSI: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIEDLSI: + .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ +.LSTARTFDEDLSI: + .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 + .align 4 +.LENDFDEDLSI: + .previous + + /* + * Pad out the segment to match the size of the sysenter.S version. 
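  For comparison, the ABI this variant wraps can be invoked directly from
  32-bit user code, bypassing the vDSO entirely (a hedged sketch; 20 is
  __NR_getpid in the i386 syscall table later in this series):

	long ret;

	asm volatile("int $0x80"
		     : "=a" (ret)	/* %eax: syscall number in, result out */
		     : "0" (20)		/* 20 == __NR_getpid on i386 */
		     : "memory");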
+ */ +VDSO32_vsyscall_eh_frame_size = 0x40 + .section .data,"aw",@progbits + .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 + .previous diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S new file mode 100644 index 0000000..c83f257 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/note.S @@ -0,0 +1,44 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include +#include + +/* Ideally this would use UTS_NAME, but using a quoted string here + doesn't work. Remember to change this when changing the + kernel's name. */ +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END + +#ifdef CONFIG_XEN +/* + * Add a special note telling glibc's dynamic linker a fake hardware + * flavor that it will use to choose the search path for libraries in the + * same way it uses real hardware capabilities like "mmx". + * We supply "nosegneg" as the fake capability, to indicate that we + * do not like negative offsets in instructions using segment overrides, + * since we implement those inefficiently. This makes it possible to + * install libraries optimized to avoid those access patterns in someplace + * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file + * corresponding to the bits here is needed to make ldconfig work right. + * It should contain: + * hwcap 1 nosegneg + * to match the mapping of bit to name that we give here. + * + * At runtime, the fake hardware feature will be considered to be present + * if its bit is set in the mask word. So, we start with the mask 0, and + * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. + */ + +#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ + +ELFNOTE_START(GNU, 2, "a") + .long 1 /* ncaps */ +VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */ + .long 0 /* mask */ + .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ +ELFNOTE_END +#endif diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S new file mode 100644 index 0000000..d7ec4e2 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S @@ -0,0 +1,145 @@ +/* + * Common code for the sigreturn entry points in vDSO images. + * So far this code is the same for both int80 and sysenter versions. + * This file is #include'd by int80.S et al to define them first thing. + * The kernel assumes that the addresses of these routines are constant + * for all vDSO implementations. + */ + +#include +#include +#include + +#ifndef SYSCALL_ENTER_KERNEL +#define SYSCALL_ENTER_KERNEL int $0x80 +#endif + + .text + .globl __kernel_sigreturn + .type __kernel_sigreturn,@function + nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */ + ALIGN +__kernel_sigreturn: +.LSTART_sigreturn: + popl %eax /* XXX does this mean it needs unwind info? 
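   (Most likely yes, and that is what the DWARF annotations below provide.
   The popl itself discards the signal number that the kernel pushed as the
   handler's argument, leaving %esp pointing at the sigcontext that
   sys_sigreturn expects; the CFI expressions switch from the
   IA32_SIGCONTEXT_*+4 offsets to the unbiased ones after this pop for
   exactly that reason.)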
*/ + movl $__NR_sigreturn, %eax + SYSCALL_ENTER_KERNEL +.LEND_sigreturn: + nop + .size __kernel_sigreturn,.-.LSTART_sigreturn + + .globl __kernel_rt_sigreturn + .type __kernel_rt_sigreturn,@function + ALIGN +__kernel_rt_sigreturn: +.LSTART_rt_sigreturn: + movl $__NR_rt_sigreturn, %eax + SYSCALL_ENTER_KERNEL +.LEND_rt_sigreturn: + nop + .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI1: + .long .LENDCIEDLSI1-.LSTARTCIEDLSI1 +.LSTARTCIEDLSI1: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zRS" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0 /* DW_CFA_nop */ + .align 4 +.LENDCIEDLSI1: + .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */ +.LSTARTFDEDLSI1: + .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */ + /* HACK: The dwarf2 unwind routines will subtract 1 from the + return address to get an address in the middle of the + presumed call instruction. Since we didn't get here via + a call, we need to include the nop before the real start + to make up for it. */ + .long .LSTART_sigreturn-1-. /* PC-relative start address */ + .long .LEND_sigreturn-.LSTART_sigreturn+1 + .uleb128 0 /* Augmentation */ + /* What follows are the instructions for the table generation. + We record the locations of each register saved. This is + complicated by the fact that the "CFA" is always assumed to + be the value of the stack pointer in the caller. This means + that we must define the CFA of this body of code to be the + saved value of the stack pointer in the sigcontext. Which + also means that there is no fixed relation to the other + saved registers, which means that we must use DW_CFA_expression + to compute their addresses. It also means that when we + adjust the stack with the popl, we have to do it all over again. */ + +#define do_cfa_expr(offset) \ + .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ + .uleb128 1f-0f; /* length */ \ +0: .byte 0x74; /* DW_OP_breg4 */ \ + .sleb128 offset; /* offset */ \ + .byte 0x06; /* DW_OP_deref */ \ +1: + +#define do_expr(regno, offset) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno; /* regno */ \ + .uleb128 1f-0f; /* length */ \ +0: .byte 0x74; /* DW_OP_breg4 */ \ + .sleb128 offset; /* offset */ \ +1: + + do_cfa_expr(IA32_SIGCONTEXT_sp+4) + do_expr(0, IA32_SIGCONTEXT_ax+4) + do_expr(1, IA32_SIGCONTEXT_cx+4) + do_expr(2, IA32_SIGCONTEXT_dx+4) + do_expr(3, IA32_SIGCONTEXT_bx+4) + do_expr(5, IA32_SIGCONTEXT_bp+4) + do_expr(6, IA32_SIGCONTEXT_si+4) + do_expr(7, IA32_SIGCONTEXT_di+4) + do_expr(8, IA32_SIGCONTEXT_ip+4) + + .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ + + do_cfa_expr(IA32_SIGCONTEXT_sp) + do_expr(0, IA32_SIGCONTEXT_ax) + do_expr(1, IA32_SIGCONTEXT_cx) + do_expr(2, IA32_SIGCONTEXT_dx) + do_expr(3, IA32_SIGCONTEXT_bx) + do_expr(5, IA32_SIGCONTEXT_bp) + do_expr(6, IA32_SIGCONTEXT_si) + do_expr(7, IA32_SIGCONTEXT_di) + do_expr(8, IA32_SIGCONTEXT_ip) + + .align 4 +.LENDFDEDLSI1: + + .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */ +.LSTARTFDEDLSI2: + .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */ + /* HACK: See above wrt unwind library assumptions. */ + .long .LSTART_rt_sigreturn-1-. 
/* PC-relative start address */ + .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 + .uleb128 0 /* Augmentation */ + /* What follows are the instructions for the table generation. + We record the locations of each register saved. This is + slightly less complicated than the above, since we don't + modify the stack pointer in the process. */ + + do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp) + do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax) + do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx) + do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx) + do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx) + do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp) + do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si) + do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di) + do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip) + + .align 4 +.LENDFDEDLSI2: + .previous diff --git a/arch/x86/entry/vdso/vdso32/syscall.S b/arch/x86/entry/vdso/vdso32/syscall.S new file mode 100644 index 0000000..6b286bb --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/syscall.S @@ -0,0 +1,75 @@ +/* + * Code for the vDSO. This version uses the syscall instruction. + * + * First get the common code for the sigreturn entry points. + * This must come first. + */ +#define SYSCALL_ENTER_KERNEL syscall +#include "sigreturn.S" + +#include + + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: +.LSTART_vsyscall: + push %ebp +.Lpush_ebp: + movl %ecx, %ebp + syscall + movl %ebp, %ecx + popl %ebp +.Lpop_ebp: + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + + .section .eh_frame,"a",@progbits +.LSTARTFRAME: + .long .LENDCIE-.LSTARTCIE +.LSTARTCIE: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIE: + + .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ +.LSTARTFDE1: + .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 /* Augmentation length */ + /* What follows are the instructions for the table generation. + We have to record all changes of the stack pointer. */ + .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .uleb128 8 + .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */ + .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */ + .byte 0xc5 /* DW_CFA_restore %ebp */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .uleb128 4 + .align 4 +.LENDFDE1: + .previous + + /* + * Pad out the segment to match the size of the sysenter.S version. + */ +VDSO32_vsyscall_eh_frame_size = 0x40 + .section .data,"aw",@progbits + .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0 + .previous diff --git a/arch/x86/entry/vdso/vdso32/sysenter.S b/arch/x86/entry/vdso/vdso32/sysenter.S new file mode 100644 index 0000000..e354bce --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/sysenter.S @@ -0,0 +1,116 @@ +/* + * Code for the vDSO. This version uses the sysenter instruction. 
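  Whichever flavor ends up in the image, user space reaches it the same
  way: the kernel publishes the address of __kernel_vsyscall in the ELF
  auxiliary vector. A hedged sketch (assumes a 32-bit process, a glibc
  providing getauxval(), and that a vDSO is actually mapped; 20 is
  __NR_getpid in the i386 table later in this series):

	#include <sys/auxv.h>	/* getauxval(), AT_SYSINFO via <elf.h> */

	static long vdso_getpid(void)
	{
		long ret;
		void *entry = (void *)getauxval(AT_SYSINFO);	/* __kernel_vsyscall */

		asm volatile("call *%[entry]"
			     : "=a" (ret)			/* nr in, result out */
			     : "0" (20), [entry] "r" (entry)	/* 20 == __NR_getpid */
			     : "ecx", "edx", "memory");
		return ret;
	}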
+ * + * First get the common code for the sigreturn entry points. + * This must come first. + */ +#include "sigreturn.S" + +/* + * The caller puts arg2 in %ecx, which gets pushed. The kernel will use + * %ecx itself for arg2. The pushing is because the sysexit instruction + * (found in entry.S) requires that we clobber %ecx with the desired %esp. + * User code might expect that %ecx is unclobbered though, as it would be + * for returning via the iret instruction, so we must push and pop. + * + * The caller puts arg3 in %edx, which the sysexit instruction requires + * for %eip. Thus, exactly as for arg2, we must push and pop. + * + * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter + * instruction clobbers %esp, the user's %esp won't even survive entry + * into the kernel. We store %esp in %ebp. Code in entry.S must fetch + * arg6 from the stack. + * + * You can not use this vsyscall for the clone() syscall because the + * three words on the parent stack do not get copied to the child. + */ + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: +.LSTART_vsyscall: + push %ecx +.Lpush_ecx: + push %edx +.Lpush_edx: + push %ebp +.Lenter_kernel: + movl %esp,%ebp + sysenter + + /* 7: align return point with nop's to make disassembly easier */ + .space 7,0x90 + + /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ + int $0x80 + /* 16: System call normal return point is here! */ +VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ + pop %ebp +.Lpop_ebp: + pop %edx +.Lpop_edx: + pop %ecx +.Lpop_ecx: + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI: + .long .LENDCIEDLSI-.LSTARTCIEDLSI +.LSTARTCIEDLSI: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIEDLSI: + .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ +.LSTARTFDEDLSI: + .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 + /* What follows are the instructions for the table generation. + We have to record all changes of the stack pointer. */ + .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x08 /* RA at offset 8 now */ + .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x0c /* RA at offset 12 now */ + .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x10 /* RA at offset 16 now */ + .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ + /* Finally the epilogue. 
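   (A decoding note for the raw CFI bytes used throughout these tables:
   a byte of the form 0x40 + delta is DW_CFA_advance_loc, with the primary
   opcode in the top two bits and the code offset delta, scaled by the
   CIE's code alignment factor of 1, in the low six bits; 0x0e is
   DW_CFA_def_cfa_offset followed by a ULEB128 operand; and 0x85, 0x04 is
   DW_CFA_offset for %ebp with factored offset 4, i.e. -16 after applying
   the data alignment factor of -4.)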
*/ + .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x0c /* RA at offset 12 now */ + .byte 0xc5 /* DW_CFA_restore %ebp */ + .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x08 /* RA at offset 8 now */ + .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x04 /* RA at offset 4 now */ + .align 4 +.LENDFDEDLSI: + .previous + + /* + * Emit a symbol with the size of this .eh_frame data, + * to verify it matches the other versions. + */ +VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI) diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c new file mode 100644 index 0000000..175cc72 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -0,0 +1,30 @@ +#define BUILD_VDSO32 + +#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE +#undef CONFIG_OPTIMIZE_INLINING +#endif + +#undef CONFIG_X86_PPRO_FENCE + +#ifdef CONFIG_X86_64 + +/* + * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel + * configuration + */ +#undef CONFIG_64BIT +#undef CONFIG_X86_64 +#undef CONFIG_ILLEGAL_POINTER_VALUE +#undef CONFIG_SPARSEMEM_VMEMMAP +#undef CONFIG_NR_CPUS + +#define CONFIG_X86_32 1 +#define CONFIG_PAGE_OFFSET 0 +#define CONFIG_ILLEGAL_POINTER_VALUE 0 +#define CONFIG_NR_CPUS 1 + +#define BUILD_VDSO32_64 + +#endif + +#include "../vclock_gettime.c" diff --git a/arch/x86/entry/vdso/vdso32/vdso-fakesections.c b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c new file mode 100644 index 0000000..541468e --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c @@ -0,0 +1 @@ +#include "../vdso-fakesections.c" diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S new file mode 100644 index 0000000..31056cf --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S @@ -0,0 +1,37 @@ +/* + * Linker script for 32-bit vDSO. + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. + */ + +#include + +#define BUILD_VDSO32 + +#include "../vdso-layout.lds.S" + +/* The ELF entry point can be used to set the AT_SYSINFO value. */ +ENTRY(__kernel_vsyscall); + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION +{ + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_time; + }; + + LINUX_2.5 { + global: + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + local: *; + }; +} diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S new file mode 100644 index 0000000..697c11e --- /dev/null +++ b/arch/x86/entry/vdso/vdsox32.lds.S @@ -0,0 +1,25 @@ +/* + * Linker script for x32 vDSO. + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. + */ + +#define BUILD_VDSOX32 + +#include "vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_getcpu; + __vdso_time; + local: *; + }; +} diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c new file mode 100644 index 0000000..8ec3d1f --- /dev/null +++ b/arch/x86/entry/vdso/vgetcpu.c @@ -0,0 +1,28 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. 
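  What this file provides to userland, as a hedged usage sketch (glibc's
  sched_getcpu() is expected to resolve to the __vdso_getcpu defined below
  on x86-64, so no kernel entry is needed):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sched.h>

	int main(void)
	{
		printf("running on CPU %d\n", sched_getcpu());
		return 0;
	}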
+ * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of getcpu() + */ + +#include +#include +#include +#include + +notrace long +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +{ + unsigned int p; + + p = __getcpu(); + + if (cpu) + *cpu = p & VGETCPU_CPU_MASK; + if (node) + *node = p >> 12; + return 0; +} + +long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) + __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c new file mode 100644 index 0000000..1c9f750 --- /dev/null +++ b/arch/x86/entry/vdso/vma.c @@ -0,0 +1,300 @@ +/* + * Copyright 2007 Andi Kleen, SUSE Labs. + * Subject to the GPL, v.2 + * + * This contains most of the x86 vDSO kernel-side code. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_X86_64) +unsigned int __read_mostly vdso64_enabled = 1; +#endif + +void __init init_vdso_image(const struct vdso_image *image) +{ + int i; + int npages = (image->size) / PAGE_SIZE; + + BUG_ON(image->size % PAGE_SIZE != 0); + for (i = 0; i < npages; i++) + image->text_mapping.pages[i] = + virt_to_page(image->data + i*PAGE_SIZE); + + apply_alternatives((struct alt_instr *)(image->data + image->alt), + (struct alt_instr *)(image->data + image->alt + + image->alt_len)); +} + +struct linux_binprm; + +/* + * Put the vdso above the (randomized) stack with another randomized + * offset. This way there is no hole in the middle of address space. + * To save memory make sure it is still in the same PTE as the stack + * top. This doesn't give that many random bits. + * + * Note that this algorithm is imperfect: the distribution of the vdso + * start address within a PMD is biased toward the end. + * + * Only used for the 64-bit and x32 vdsos. + */ +static unsigned long vdso_addr(unsigned long start, unsigned len) +{ +#ifdef CONFIG_X86_32 + return 0; +#else + unsigned long addr, end; + unsigned offset; + + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. */ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; + if (end >= TASK_SIZE_MAX) + end = TASK_SIZE_MAX; + end -= len; + + if (end > start) { + offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } + + /* + * Forcibly align the final address in case we have a hardware + * issue that requires alignment for performance reasons. 
+ */ + addr = align_vdso_addr(addr); + + return addr; +#endif +} + +static int map_vdso(const struct vdso_image *image, bool calculate_addr) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long addr, text_start; + int ret = 0; + static struct page *no_pages[] = {NULL}; + static struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + .pages = no_pages, + }; + + if (calculate_addr) { + addr = vdso_addr(current->mm->start_stack, + image->size - image->sym_vvar_start); + } else { + addr = 0; + } + + down_write(&mm->mmap_sem); + + addr = get_unmapped_area(NULL, addr, + image->size - image->sym_vvar_start, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + text_start = addr - image->sym_vvar_start; + current->mm->context.vdso = (void __user *)text_start; + + /* + * MAYWRITE to allow gdb to COW and set breakpoints + */ + vma = _install_special_mapping(mm, + text_start, + image->size, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + &image->text_mapping); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto up_fail; + } + + vma = _install_special_mapping(mm, + addr, + -image->sym_vvar_start, + VM_READ|VM_MAYREAD, + &vvar_mapping); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto up_fail; + } + + if (image->sym_vvar_page) + ret = remap_pfn_range(vma, + text_start + image->sym_vvar_page, + __pa_symbol(&__vvar_page) >> PAGE_SHIFT, + PAGE_SIZE, + PAGE_READONLY); + + if (ret) + goto up_fail; + +#ifdef CONFIG_HPET_TIMER + if (hpet_address && image->sym_hpet_page) { + ret = io_remap_pfn_range(vma, + text_start + image->sym_hpet_page, + hpet_address >> PAGE_SHIFT, + PAGE_SIZE, + pgprot_noncached(PAGE_READONLY)); + + if (ret) + goto up_fail; + } +#endif + +up_fail: + if (ret) + current->mm->context.vdso = NULL; + + up_write(&mm->mmap_sem); + return ret; +} + +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +static int load_vdso32(void) +{ + int ret; + + if (vdso32_enabled != 1) /* Other values all mean "disabled" */ + return 0; + + ret = map_vdso(selected_vdso32, false); + if (ret) + return ret; + + if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) + current_thread_info()->sysenter_return = + current->mm->context.vdso + + selected_vdso32->sym_VDSO32_SYSENTER_RETURN; + + return 0; +} +#endif + +#ifdef CONFIG_X86_64 +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + if (!vdso64_enabled) + return 0; + + return map_vdso(&vdso_image_64, true); +} + +#ifdef CONFIG_COMPAT +int compat_arch_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp) +{ +#ifdef CONFIG_X86_X32_ABI + if (test_thread_flag(TIF_X32)) { + if (!vdso64_enabled) + return 0; + + return map_vdso(&vdso_image_x32, true); + } +#endif + + return load_vdso32(); +} +#endif +#else +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + return load_vdso32(); +} +#endif + +#ifdef CONFIG_X86_64 +static __init int vdso_setup(char *s) +{ + vdso64_enabled = simple_strtoul(s, NULL, 0); + return 0; +} +__setup("vdso=", vdso_setup); +#endif + +#ifdef CONFIG_X86_64 +static void vgetcpu_cpu_init(void *arg) +{ + int cpu = smp_processor_id(); + struct desc_struct d = { }; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node(cpu); +#endif + if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) + write_rdtscp_aux((node << 12) | cpu); + + /* + * Store cpu number in limit so that it can be loaded + * quickly in user space in vgetcpu. 
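  A hedged sketch of the user-space side, for the RDTSCP path that the
  write_rdtscp_aux() call below sets up (the segment-limit path recovers
  the same value via LSL on the descriptor programmed here):

	unsigned int lo, hi, aux, cpu, node;

	asm volatile("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux));
	cpu  = aux & 0xfff;		/* low 12 bits */
	node = aux >> 12;		/* remaining bits */

  Both encodings pack the fields the same way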
(12 bits for the CPU + * and 8 bits for the node) + */ + d.limit0 = cpu | ((node & 0xf) << 12); + d.limit = node >> 4; + d.type = 5; /* RO data, expand down, accessed */ + d.dpl = 3; /* Visible to user code */ + d.s = 1; /* Not a system segment */ + d.p = 1; /* Present */ + d.d = 1; /* 32-bit */ + + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); +} + +static int +vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg) +{ + long cpu = (long)arg; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) + smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); + + return NOTIFY_DONE; +} + +static int __init init_vdso(void) +{ + init_vdso_image(&vdso_image_64); + +#ifdef CONFIG_X86_X32_ABI + init_vdso_image(&vdso_image_x32); +#endif + + cpu_notifier_register_begin(); + + on_each_cpu(vgetcpu_cpu_init, NULL, 1); + /* notifier priority > KVM */ + __hotcpu_notifier(vgetcpu_cpu_notifier, 30); + + cpu_notifier_register_done(); + + return 0; +} +subsys_initcall(init_vdso); +#endif /* CONFIG_X86_64 */ -- cgit v1.1 From e6b93f4e48af8fe8373ec1132b8c7787890e7578 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:10:43 +0200 Subject: x86/asm/entry: Move the 'thunk' functions to arch/x86/entry/ These are all calling x86 entry code functions, so move them close to other entry code. Change lib-y to obj-y: there's no real difference between the two as we don't really drop any of them during the linking stage, and obj-y is the more common approach for core kernel object code. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 3 +-- arch/x86/entry/thunk_32.S | 42 +++++++++++++++++++++++++++++ arch/x86/entry/thunk_64.S | 69 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 arch/x86/entry/thunk_32.S create mode 100644 arch/x86/entry/thunk_64.S (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index c975324..9df72c8 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,8 +1,7 @@ # # Makefile for the x86 low level entry code # -obj-y := entry_$(BITS).o - +obj-y := entry_$(BITS).o thunk_$(BITS).o obj-y += vdso/ obj-$(CONFIG_IA32_EMULATION) += ia32entry.o diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S new file mode 100644 index 0000000..e9acf5f --- /dev/null +++ b/arch/x86/entry/thunk_32.S @@ -0,0 +1,42 @@ +/* + * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) + * Copyright 2008 by Steven Rostedt, Red Hat, Inc + * (inspired by Andi Kleen's thunk_64.S) + * Subject to the GNU public license, v.2. No warranty of any kind. 
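   In short: each THUNK below expands to a tiny stub that saves the
   caller-clobbered registers (%eax/%ecx/%edx), optionally passes the
   caller's return address as the first argument, calls the named C
   function, and restores the registers, so entry and irq-flags tracing
   code can call into C at points where the normal calling convention must
   not be disturbed. (Entry code reaches these stubs through macros such
   as TRACE_IRQS_ON/OFF in the irqflags headers.)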
+ */ + #include + #include + + /* put return address in eax (arg1) */ + .macro THUNK name, func, put_ret_addr_in_eax=0 + .globl \name +\name: + pushl %eax + pushl %ecx + pushl %edx + + .if \put_ret_addr_in_eax + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + .endif + + call \func + popl %edx + popl %ecx + popl %eax + ret + _ASM_NOKPROBE(\name) + .endm + +#ifdef CONFIG_TRACE_IRQFLAGS + THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 + THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 +#endif + +#ifdef CONFIG_PREEMPT + THUNK ___preempt_schedule, preempt_schedule +#ifdef CONFIG_CONTEXT_TRACKING + THUNK ___preempt_schedule_context, preempt_schedule_context +#endif +#endif + diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S new file mode 100644 index 0000000..10f555e --- /dev/null +++ b/arch/x86/entry/thunk_64.S @@ -0,0 +1,69 @@ +/* + * Save registers before calling assembly functions. This avoids + * disturbance of register allocation in some inline assembly constructs. + * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. + * Subject to the GNU public license, v.2. No warranty of any kind. + */ +#include +#include +#include + + /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ + .macro THUNK name, func, put_ret_addr_in_rdi=0 + .globl \name +\name: + + /* this one pushes 9 elems, the next one would be %rIP */ + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 + + .if \put_ret_addr_in_rdi + /* 9*8(%rsp) is return addr on stack */ + movq 9*8(%rsp), %rdi + .endif + + call \func + jmp restore + _ASM_NOKPROBE(\name) + .endm + +#ifdef CONFIG_TRACE_IRQFLAGS + THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 + THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + THUNK lockdep_sys_exit_thunk,lockdep_sys_exit +#endif + +#ifdef CONFIG_PREEMPT + THUNK ___preempt_schedule, preempt_schedule +#ifdef CONFIG_CONTEXT_TRACKING + THUNK ___preempt_schedule_context, preempt_schedule_context +#endif +#endif + +#if defined(CONFIG_TRACE_IRQFLAGS) \ + || defined(CONFIG_DEBUG_LOCK_ALLOC) \ + || defined(CONFIG_PREEMPT) +restore: + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + ret + _ASM_NOKPROBE(restore) +#endif -- cgit v1.1 From d36f947904dba0935a0e74dc9df2ef57caeb7d03 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:29:26 +0200 Subject: x86/asm/entry: Move arch/x86/include/asm/calling.h to arch/x86/entry/ asm/calling.h is private to the entry code, make this more apparent by moving it to the new arch/x86/entry/ directory. Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 243 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/entry/entry_64.S | 2 +- arch/x86/entry/ia32entry.S | 2 +- arch/x86/entry/thunk_64.S | 2 +- 4 files changed, 246 insertions(+), 3 deletions(-) create mode 100644 arch/x86/entry/calling.h (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h new file mode 100644 index 0000000..f4e6308 --- /dev/null +++ b/arch/x86/entry/calling.h @@ -0,0 +1,243 @@ +/* + + x86 function call convention, 64-bit: + ------------------------------------- + arguments | callee-saved | extra caller-saved | return + [callee-clobbered] | | [callee-clobbered] | + --------------------------------------------------------------------------- + rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11 | rax, rdx [**] + + ( rsp is obviously invariant across normal function calls. (gcc can 'merge' + functions when it sees tail-call optimization possibilities) rflags is + clobbered. Leftover arguments are passed over the stack frame.) + + [*] In the frame-pointers case rbp is fixed to the stack frame. + + [**] for struct return values wider than 64 bits the return convention is a + bit more complex: up to 128 bits width we return small structures + straight in rax, rdx. For structures larger than that (3 words or + larger) the caller puts a pointer to an on-stack return struct + [allocated in the caller's stack frame] into the first argument - i.e. + into rdi. All other arguments shift up by one in this case. + Fortunately this case is rare in the kernel. + +For 32-bit we have the following conventions - kernel is built with +-mregparm=3 and -freg-struct-return: + + x86 function calling convention, 32-bit: + ---------------------------------------- + arguments | callee-saved | extra caller-saved | return + [callee-clobbered] | | [callee-clobbered] | + ------------------------------------------------------------------------- + eax edx ecx | ebx edi esi ebp [*] | | eax, edx [**] + + ( here too esp is obviously invariant across normal function calls. eflags + is clobbered. Leftover arguments are passed over the stack frame. ) + + [*] In the frame-pointers case ebp is fixed to the stack frame. + + [**] We build with -freg-struct-return, which on 32-bit means similar + semantics as on 64-bit: edx can be used for a second return value + (i.e. covering integer and structure sizes up to 64 bits) - after that + it gets more complex and more expensive: 3-word or larger struct returns + get done in the caller's frame and the pointer to the return struct goes + into regparm0, i.e. eax - the other arguments shift up and the + function's register parameters degenerate to regparm=2 in essence. + +*/ + +#ifdef CONFIG_X86_64 + +/* + * 64-bit system call stack frame layout defines and helpers, + * for assembly code: + */ + +/* The layout forms the "struct pt_regs" on the stack: */ +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ +#define R15 0*8 +#define R14 1*8 +#define R13 2*8 +#define R12 3*8 +#define RBP 4*8 +#define RBX 5*8 +/* These regs are callee-clobbered. Always saved on kernel entry. 
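  As a hedged cross-check (purely illustrative; it would only build inside
  the kernel tree), the offsets defined here are meant to line up with the
  64-bit struct pt_regs from ptrace.h:

	#include <linux/bug.h>
	#include <linux/compiler.h>
	#include <linux/stddef.h>
	#include <asm/ptrace.h>

	static void __maybe_unused check_pt_regs_layout(void)
	{
		BUILD_BUG_ON(offsetof(struct pt_regs, r15) !=  0 * 8);
		BUILD_BUG_ON(offsetof(struct pt_regs, bx)  !=  5 * 8);
		BUILD_BUG_ON(offsetof(struct pt_regs, di)  != 14 * 8);
		BUILD_BUG_ON(offsetof(struct pt_regs, ss)  != 20 * 8);
	}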
*/ +#define R11 6*8 +#define R10 7*8 +#define R9 8*8 +#define R8 9*8 +#define RAX 10*8 +#define RCX 11*8 +#define RDX 12*8 +#define RSI 13*8 +#define RDI 14*8 +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ +#define ORIG_RAX 15*8 +/* Return frame for iretq */ +#define RIP 16*8 +#define CS 17*8 +#define EFLAGS 18*8 +#define RSP 19*8 +#define SS 20*8 + +#define SIZEOF_PTREGS 21*8 + + .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 + addq $-(15*8+\addskip), %rsp + .endm + + .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 + .if \r11 + movq %r11, 6*8+\offset(%rsp) + .endif + .if \r8910 + movq %r10, 7*8+\offset(%rsp) + movq %r9, 8*8+\offset(%rsp) + movq %r8, 9*8+\offset(%rsp) + .endif + .if \rax + movq %rax, 10*8+\offset(%rsp) + .endif + .if \rcx + movq %rcx, 11*8+\offset(%rsp) + .endif + movq %rdx, 12*8+\offset(%rsp) + movq %rsi, 13*8+\offset(%rsp) + movq %rdi, 14*8+\offset(%rsp) + .endm + .macro SAVE_C_REGS offset=0 + SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0 + SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_R891011 + SAVE_C_REGS_HELPER 0, 1, 1, 0, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RCX_R891011 + SAVE_C_REGS_HELPER 0, 1, 0, 0, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11 + SAVE_C_REGS_HELPER 0, 0, 0, 1, 0 + .endm + + .macro SAVE_EXTRA_REGS offset=0 + movq %r15, 0*8+\offset(%rsp) + movq %r14, 1*8+\offset(%rsp) + movq %r13, 2*8+\offset(%rsp) + movq %r12, 3*8+\offset(%rsp) + movq %rbp, 4*8+\offset(%rsp) + movq %rbx, 5*8+\offset(%rsp) + .endm + .macro SAVE_EXTRA_REGS_RBP offset=0 + movq %rbp, 4*8+\offset(%rsp) + .endm + + .macro RESTORE_EXTRA_REGS offset=0 + movq 0*8+\offset(%rsp), %r15 + movq 1*8+\offset(%rsp), %r14 + movq 2*8+\offset(%rsp), %r13 + movq 3*8+\offset(%rsp), %r12 + movq 4*8+\offset(%rsp), %rbp + movq 5*8+\offset(%rsp), %rbx + .endm + + .macro ZERO_EXTRA_REGS + xorl %r15d, %r15d + xorl %r14d, %r14d + xorl %r13d, %r13d + xorl %r12d, %r12d + xorl %ebp, %ebp + xorl %ebx, %ebx + .endm + + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 + .if \rstor_r11 + movq 6*8(%rsp), %r11 + .endif + .if \rstor_r8910 + movq 7*8(%rsp), %r10 + movq 8*8(%rsp), %r9 + movq 9*8(%rsp), %r8 + .endif + .if \rstor_rax + movq 10*8(%rsp), %rax + .endif + .if \rstor_rcx + movq 11*8(%rsp), %rcx + .endif + .if \rstor_rdx + movq 12*8(%rsp), %rdx + .endif + movq 13*8(%rsp), %rsi + movq 14*8(%rsp), %rdi + .endm + .macro RESTORE_C_REGS + RESTORE_C_REGS_HELPER 1,1,1,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_RAX + RESTORE_C_REGS_HELPER 0,1,1,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_RCX + RESTORE_C_REGS_HELPER 1,0,1,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_R11 + RESTORE_C_REGS_HELPER 1,1,0,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_RCX_R11 + RESTORE_C_REGS_HELPER 1,0,0,1,1 + .endm + .macro RESTORE_RSI_RDI + RESTORE_C_REGS_HELPER 0,0,0,0,0 + .endm + .macro RESTORE_RSI_RDI_RDX + RESTORE_C_REGS_HELPER 0,0,0,0,1 + .endm + + .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 + subq $-(15*8+\addskip), %rsp + .endm + + .macro icebp + .byte 0xf1 + .endm + +#else /* CONFIG_X86_64 */ + +/* + * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These + * are different from the entry_32.S versions in not changing the segment + * registers. So only suitable for in kernel use, not when transitioning + * from or to user space. The resulting stack frame is not a standard + * pt_regs frame. 
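  A hedged usage sketch for the 64-bit helpers above (do_thing() is a
  made-up C function; entry code in entry_64.S follows this pattern when
  it needs a full pt_regs frame before calling into C):

	ALLOC_PT_GPREGS_ON_STACK
	SAVE_C_REGS
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi		/* arg1: pointer to the pt_regs frame */
	call	do_thing
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK

  The 32-bit variants that follow are intentionally much simpler.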
The main use case is calling C code from assembler + * when all the registers need to be preserved. + */ + + .macro SAVE_ALL + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + .endm + + .macro RESTORE_ALL + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + .endm + +#endif /* CONFIG_X86_64 */ + diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4ad79e9..f7380ea 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -28,7 +28,7 @@ #include #include #include -#include +#include "calling.h" #include #include #include diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index 2be23c7..f167674 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -4,7 +4,7 @@ * Copyright 2000-2002 Andi Kleen, SuSE Labs. */ -#include +#include "calling.h" #include #include #include diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index 10f555e..3e95681 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -6,7 +6,7 @@ * Subject to the GNU public license, v.2. No warranty of any kind. */ #include -#include +#include "calling.h" #include /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ -- cgit v1.1 From 1f57d5d85ba7f1f467173ff33f51d01a91f9aaf1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:36:41 +0200 Subject: x86/asm/entry: Move the arch/x86/syscalls/ definitions to arch/x86/entry/syscalls/ The build time generated syscall definitions are entry code related, move them into the arch/x86/entry/ directory. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscalls/Makefile | 69 ++++++ arch/x86/entry/syscalls/syscall_32.tbl | 367 ++++++++++++++++++++++++++++++++ arch/x86/entry/syscalls/syscall_64.tbl | 370 +++++++++++++++++++++++++++++++++ arch/x86/entry/syscalls/syscallhdr.sh | 27 +++ arch/x86/entry/syscalls/syscalltbl.sh | 15 ++ 5 files changed, 848 insertions(+) create mode 100644 arch/x86/entry/syscalls/Makefile create mode 100644 arch/x86/entry/syscalls/syscall_32.tbl create mode 100644 arch/x86/entry/syscalls/syscall_64.tbl create mode 100644 arch/x86/entry/syscalls/syscallhdr.sh create mode 100644 arch/x86/entry/syscalls/syscalltbl.sh (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile new file mode 100644 index 0000000..57aa59f --- /dev/null +++ b/arch/x86/entry/syscalls/Makefile @@ -0,0 +1,69 @@ +out := $(obj)/../../include/generated/asm +uapi := $(obj)/../../include/generated/uapi/asm + +# Create output directory if not already present +_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ + $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') + +syscall32 := $(srctree)/$(src)/syscall_32.tbl +syscall64 := $(srctree)/$(src)/syscall_64.tbl + +syshdr := $(srctree)/$(src)/syscallhdr.sh +systbl := $(srctree)/$(src)/syscalltbl.sh + +quiet_cmd_syshdr = SYSHDR $@ + cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ + '$(syshdr_abi_$(basetarget))' \ + '$(syshdr_pfx_$(basetarget))' \ + '$(syshdr_offset_$(basetarget))' +quiet_cmd_systbl = SYSTBL $@ + cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ + +quiet_cmd_hypercalls = HYPERCALLS $@ + cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^) + +syshdr_abi_unistd_32 := i386 +$(uapi)/unistd_32.h: 
$(syscall32) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_32_ia32 := i386 +syshdr_pfx_unistd_32_ia32 := ia32_ +$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_x32 := common,x32 +syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT +$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_64 := common,64 +$(uapi)/unistd_64.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_64_x32 := x32 +syshdr_pfx_unistd_64_x32 := x32_ +$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + +$(out)/syscalls_32.h: $(syscall32) $(systbl) + $(call if_changed,systbl) +$(out)/syscalls_64.h: $(syscall64) $(systbl) + $(call if_changed,systbl) + +$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh + $(call if_changed,hypercalls) + +$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h + +uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h +syshdr-y += syscalls_32.h +syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h +syshdr-$(CONFIG_X86_64) += syscalls_64.h +syshdr-$(CONFIG_XEN) += xen-hypercalls.h + +targets += $(uapisyshdr-y) $(syshdr-y) + +PHONY += all +all: $(addprefix $(uapi)/,$(uapisyshdr-y)) +all: $(addprefix $(out)/,$(syshdr-y)) + @: diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl new file mode 100644 index 0000000..ef8187f --- /dev/null +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -0,0 +1,367 @@ +# +# 32-bit system call numbers and entry vectors +# +# The format is: +# +# +# The abi is always "i386" for this file. +# +0 i386 restart_syscall sys_restart_syscall +1 i386 exit sys_exit +2 i386 fork sys_fork stub32_fork +3 i386 read sys_read +4 i386 write sys_write +5 i386 open sys_open compat_sys_open +6 i386 close sys_close +7 i386 waitpid sys_waitpid sys32_waitpid +8 i386 creat sys_creat +9 i386 link sys_link +10 i386 unlink sys_unlink +11 i386 execve sys_execve stub32_execve +12 i386 chdir sys_chdir +13 i386 time sys_time compat_sys_time +14 i386 mknod sys_mknod +15 i386 chmod sys_chmod +16 i386 lchown sys_lchown16 +17 i386 break +18 i386 oldstat sys_stat +19 i386 lseek sys_lseek compat_sys_lseek +20 i386 getpid sys_getpid +21 i386 mount sys_mount compat_sys_mount +22 i386 umount sys_oldumount +23 i386 setuid sys_setuid16 +24 i386 getuid sys_getuid16 +25 i386 stime sys_stime compat_sys_stime +26 i386 ptrace sys_ptrace compat_sys_ptrace +27 i386 alarm sys_alarm +28 i386 oldfstat sys_fstat +29 i386 pause sys_pause +30 i386 utime sys_utime compat_sys_utime +31 i386 stty +32 i386 gtty +33 i386 access sys_access +34 i386 nice sys_nice +35 i386 ftime +36 i386 sync sys_sync +37 i386 kill sys_kill +38 i386 rename sys_rename +39 i386 mkdir sys_mkdir +40 i386 rmdir sys_rmdir +41 i386 dup sys_dup +42 i386 pipe sys_pipe +43 i386 times sys_times compat_sys_times +44 i386 prof +45 i386 brk sys_brk +46 i386 setgid sys_setgid16 +47 i386 getgid sys_getgid16 +48 i386 signal sys_signal +49 i386 geteuid sys_geteuid16 +50 i386 getegid sys_getegid16 +51 i386 acct sys_acct +52 i386 umount2 sys_umount +53 i386 lock +54 i386 ioctl sys_ioctl compat_sys_ioctl +55 i386 fcntl sys_fcntl compat_sys_fcntl64 +56 i386 mpx +57 i386 setpgid sys_setpgid +58 i386 ulimit +59 i386 oldolduname sys_olduname +60 i386 umask sys_umask +61 i386 chroot sys_chroot +62 i386 ustat sys_ustat compat_sys_ustat +63 i386 dup2 sys_dup2 +64 i386 getppid sys_getppid +65 i386 getpgrp sys_getpgrp +66 i386 setsid sys_setsid +67 i386 sigaction 
sys_sigaction compat_sys_sigaction +68 i386 sgetmask sys_sgetmask +69 i386 ssetmask sys_ssetmask +70 i386 setreuid sys_setreuid16 +71 i386 setregid sys_setregid16 +72 i386 sigsuspend sys_sigsuspend sys_sigsuspend +73 i386 sigpending sys_sigpending compat_sys_sigpending +74 i386 sethostname sys_sethostname +75 i386 setrlimit sys_setrlimit compat_sys_setrlimit +76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit +77 i386 getrusage sys_getrusage compat_sys_getrusage +78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday +79 i386 settimeofday sys_settimeofday compat_sys_settimeofday +80 i386 getgroups sys_getgroups16 +81 i386 setgroups sys_setgroups16 +82 i386 select sys_old_select compat_sys_old_select +83 i386 symlink sys_symlink +84 i386 oldlstat sys_lstat +85 i386 readlink sys_readlink +86 i386 uselib sys_uselib +87 i386 swapon sys_swapon +88 i386 reboot sys_reboot +89 i386 readdir sys_old_readdir compat_sys_old_readdir +90 i386 mmap sys_old_mmap sys32_mmap +91 i386 munmap sys_munmap +92 i386 truncate sys_truncate compat_sys_truncate +93 i386 ftruncate sys_ftruncate compat_sys_ftruncate +94 i386 fchmod sys_fchmod +95 i386 fchown sys_fchown16 +96 i386 getpriority sys_getpriority +97 i386 setpriority sys_setpriority +98 i386 profil +99 i386 statfs sys_statfs compat_sys_statfs +100 i386 fstatfs sys_fstatfs compat_sys_fstatfs +101 i386 ioperm sys_ioperm +102 i386 socketcall sys_socketcall compat_sys_socketcall +103 i386 syslog sys_syslog +104 i386 setitimer sys_setitimer compat_sys_setitimer +105 i386 getitimer sys_getitimer compat_sys_getitimer +106 i386 stat sys_newstat compat_sys_newstat +107 i386 lstat sys_newlstat compat_sys_newlstat +108 i386 fstat sys_newfstat compat_sys_newfstat +109 i386 olduname sys_uname +110 i386 iopl sys_iopl +111 i386 vhangup sys_vhangup +112 i386 idle +113 i386 vm86old sys_vm86old sys_ni_syscall +114 i386 wait4 sys_wait4 compat_sys_wait4 +115 i386 swapoff sys_swapoff +116 i386 sysinfo sys_sysinfo compat_sys_sysinfo +117 i386 ipc sys_ipc compat_sys_ipc +118 i386 fsync sys_fsync +119 i386 sigreturn sys_sigreturn stub32_sigreturn +120 i386 clone sys_clone stub32_clone +121 i386 setdomainname sys_setdomainname +122 i386 uname sys_newuname +123 i386 modify_ldt sys_modify_ldt +124 i386 adjtimex sys_adjtimex compat_sys_adjtimex +125 i386 mprotect sys_mprotect +126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask +127 i386 create_module +128 i386 init_module sys_init_module +129 i386 delete_module sys_delete_module +130 i386 get_kernel_syms +131 i386 quotactl sys_quotactl sys32_quotactl +132 i386 getpgid sys_getpgid +133 i386 fchdir sys_fchdir +134 i386 bdflush sys_bdflush +135 i386 sysfs sys_sysfs +136 i386 personality sys_personality +137 i386 afs_syscall +138 i386 setfsuid sys_setfsuid16 +139 i386 setfsgid sys_setfsgid16 +140 i386 _llseek sys_llseek +141 i386 getdents sys_getdents compat_sys_getdents +142 i386 _newselect sys_select compat_sys_select +143 i386 flock sys_flock +144 i386 msync sys_msync +145 i386 readv sys_readv compat_sys_readv +146 i386 writev sys_writev compat_sys_writev +147 i386 getsid sys_getsid +148 i386 fdatasync sys_fdatasync +149 i386 _sysctl sys_sysctl compat_sys_sysctl +150 i386 mlock sys_mlock +151 i386 munlock sys_munlock +152 i386 mlockall sys_mlockall +153 i386 munlockall sys_munlockall +154 i386 sched_setparam sys_sched_setparam +155 i386 sched_getparam sys_sched_getparam +156 i386 sched_setscheduler sys_sched_setscheduler +157 i386 sched_getscheduler sys_sched_getscheduler +158 i386 sched_yield sys_sched_yield 
+159 i386 sched_get_priority_max sys_sched_get_priority_max +160 i386 sched_get_priority_min sys_sched_get_priority_min +161 i386 sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval +162 i386 nanosleep sys_nanosleep compat_sys_nanosleep +163 i386 mremap sys_mremap +164 i386 setresuid sys_setresuid16 +165 i386 getresuid sys_getresuid16 +166 i386 vm86 sys_vm86 sys_ni_syscall +167 i386 query_module +168 i386 poll sys_poll +169 i386 nfsservctl +170 i386 setresgid sys_setresgid16 +171 i386 getresgid sys_getresgid16 +172 i386 prctl sys_prctl +173 i386 rt_sigreturn sys_rt_sigreturn stub32_rt_sigreturn +174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction +175 i386 rt_sigprocmask sys_rt_sigprocmask +176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending +177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo +179 i386 rt_sigsuspend sys_rt_sigsuspend +180 i386 pread64 sys_pread64 sys32_pread +181 i386 pwrite64 sys_pwrite64 sys32_pwrite +182 i386 chown sys_chown16 +183 i386 getcwd sys_getcwd +184 i386 capget sys_capget +185 i386 capset sys_capset +186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack +187 i386 sendfile sys_sendfile compat_sys_sendfile +188 i386 getpmsg +189 i386 putpmsg +190 i386 vfork sys_vfork stub32_vfork +191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit +192 i386 mmap2 sys_mmap_pgoff +193 i386 truncate64 sys_truncate64 sys32_truncate64 +194 i386 ftruncate64 sys_ftruncate64 sys32_ftruncate64 +195 i386 stat64 sys_stat64 sys32_stat64 +196 i386 lstat64 sys_lstat64 sys32_lstat64 +197 i386 fstat64 sys_fstat64 sys32_fstat64 +198 i386 lchown32 sys_lchown +199 i386 getuid32 sys_getuid +200 i386 getgid32 sys_getgid +201 i386 geteuid32 sys_geteuid +202 i386 getegid32 sys_getegid +203 i386 setreuid32 sys_setreuid +204 i386 setregid32 sys_setregid +205 i386 getgroups32 sys_getgroups +206 i386 setgroups32 sys_setgroups +207 i386 fchown32 sys_fchown +208 i386 setresuid32 sys_setresuid +209 i386 getresuid32 sys_getresuid +210 i386 setresgid32 sys_setresgid +211 i386 getresgid32 sys_getresgid +212 i386 chown32 sys_chown +213 i386 setuid32 sys_setuid +214 i386 setgid32 sys_setgid +215 i386 setfsuid32 sys_setfsuid +216 i386 setfsgid32 sys_setfsgid +217 i386 pivot_root sys_pivot_root +218 i386 mincore sys_mincore +219 i386 madvise sys_madvise +220 i386 getdents64 sys_getdents64 compat_sys_getdents64 +221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 +# 222 is unused +# 223 is unused +224 i386 gettid sys_gettid +225 i386 readahead sys_readahead sys32_readahead +226 i386 setxattr sys_setxattr +227 i386 lsetxattr sys_lsetxattr +228 i386 fsetxattr sys_fsetxattr +229 i386 getxattr sys_getxattr +230 i386 lgetxattr sys_lgetxattr +231 i386 fgetxattr sys_fgetxattr +232 i386 listxattr sys_listxattr +233 i386 llistxattr sys_llistxattr +234 i386 flistxattr sys_flistxattr +235 i386 removexattr sys_removexattr +236 i386 lremovexattr sys_lremovexattr +237 i386 fremovexattr sys_fremovexattr +238 i386 tkill sys_tkill +239 i386 sendfile64 sys_sendfile64 +240 i386 futex sys_futex compat_sys_futex +241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity +242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity +243 i386 set_thread_area sys_set_thread_area +244 i386 get_thread_area sys_get_thread_area +245 i386 io_setup sys_io_setup compat_sys_io_setup +246 i386 io_destroy sys_io_destroy +247 i386 io_getevents sys_io_getevents 
compat_sys_io_getevents +248 i386 io_submit sys_io_submit compat_sys_io_submit +249 i386 io_cancel sys_io_cancel +250 i386 fadvise64 sys_fadvise64 sys32_fadvise64 +# 251 is available for reuse (was briefly sys_set_zone_reclaim) +252 i386 exit_group sys_exit_group +253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie +254 i386 epoll_create sys_epoll_create +255 i386 epoll_ctl sys_epoll_ctl +256 i386 epoll_wait sys_epoll_wait +257 i386 remap_file_pages sys_remap_file_pages +258 i386 set_tid_address sys_set_tid_address +259 i386 timer_create sys_timer_create compat_sys_timer_create +260 i386 timer_settime sys_timer_settime compat_sys_timer_settime +261 i386 timer_gettime sys_timer_gettime compat_sys_timer_gettime +262 i386 timer_getoverrun sys_timer_getoverrun +263 i386 timer_delete sys_timer_delete +264 i386 clock_settime sys_clock_settime compat_sys_clock_settime +265 i386 clock_gettime sys_clock_gettime compat_sys_clock_gettime +266 i386 clock_getres sys_clock_getres compat_sys_clock_getres +267 i386 clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +268 i386 statfs64 sys_statfs64 compat_sys_statfs64 +269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 +270 i386 tgkill sys_tgkill +271 i386 utimes sys_utimes compat_sys_utimes +272 i386 fadvise64_64 sys_fadvise64_64 sys32_fadvise64_64 +273 i386 vserver +274 i386 mbind sys_mbind +275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +276 i386 set_mempolicy sys_set_mempolicy +277 i386 mq_open sys_mq_open compat_sys_mq_open +278 i386 mq_unlink sys_mq_unlink +279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend +280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +281 i386 mq_notify sys_mq_notify compat_sys_mq_notify +282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr +283 i386 kexec_load sys_kexec_load compat_sys_kexec_load +284 i386 waitid sys_waitid compat_sys_waitid +# 285 sys_setaltroot +286 i386 add_key sys_add_key +287 i386 request_key sys_request_key +288 i386 keyctl sys_keyctl +289 i386 ioprio_set sys_ioprio_set +290 i386 ioprio_get sys_ioprio_get +291 i386 inotify_init sys_inotify_init +292 i386 inotify_add_watch sys_inotify_add_watch +293 i386 inotify_rm_watch sys_inotify_rm_watch +294 i386 migrate_pages sys_migrate_pages +295 i386 openat sys_openat compat_sys_openat +296 i386 mkdirat sys_mkdirat +297 i386 mknodat sys_mknodat +298 i386 fchownat sys_fchownat +299 i386 futimesat sys_futimesat compat_sys_futimesat +300 i386 fstatat64 sys_fstatat64 sys32_fstatat +301 i386 unlinkat sys_unlinkat +302 i386 renameat sys_renameat +303 i386 linkat sys_linkat +304 i386 symlinkat sys_symlinkat +305 i386 readlinkat sys_readlinkat +306 i386 fchmodat sys_fchmodat +307 i386 faccessat sys_faccessat +308 i386 pselect6 sys_pselect6 compat_sys_pselect6 +309 i386 ppoll sys_ppoll compat_sys_ppoll +310 i386 unshare sys_unshare +311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list +312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list +313 i386 splice sys_splice +314 i386 sync_file_range sys_sync_file_range sys32_sync_file_range +315 i386 tee sys_tee +316 i386 vmsplice sys_vmsplice compat_sys_vmsplice +317 i386 move_pages sys_move_pages compat_sys_move_pages +318 i386 getcpu sys_getcpu +319 i386 epoll_pwait sys_epoll_pwait +320 i386 utimensat sys_utimensat compat_sys_utimensat +321 i386 signalfd sys_signalfd compat_sys_signalfd +322 i386 timerfd_create sys_timerfd_create +323 i386 eventfd sys_eventfd +324 i386 fallocate sys_fallocate 
sys32_fallocate +325 i386 timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime +326 i386 timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime +327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4 +328 i386 eventfd2 sys_eventfd2 +329 i386 epoll_create1 sys_epoll_create1 +330 i386 dup3 sys_dup3 +331 i386 pipe2 sys_pipe2 +332 i386 inotify_init1 sys_inotify_init1 +333 i386 preadv sys_preadv compat_sys_preadv +334 i386 pwritev sys_pwritev compat_sys_pwritev +335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +336 i386 perf_event_open sys_perf_event_open +337 i386 recvmmsg sys_recvmmsg compat_sys_recvmmsg +338 i386 fanotify_init sys_fanotify_init +339 i386 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark +340 i386 prlimit64 sys_prlimit64 +341 i386 name_to_handle_at sys_name_to_handle_at +342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at +343 i386 clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +344 i386 syncfs sys_syncfs +345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg +346 i386 setns sys_setns +347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv +348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +349 i386 kcmp sys_kcmp +350 i386 finit_module sys_finit_module +351 i386 sched_setattr sys_sched_setattr +352 i386 sched_getattr sys_sched_getattr +353 i386 renameat2 sys_renameat2 +354 i386 seccomp sys_seccomp +355 i386 getrandom sys_getrandom +356 i386 memfd_create sys_memfd_create +357 i386 bpf sys_bpf +358 i386 execveat sys_execveat stub32_execveat diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl new file mode 100644 index 0000000..9ef32d5 --- /dev/null +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -0,0 +1,370 @@ +# +# 64-bit system call numbers and entry vectors +# +# The format is: +# +# +# The abi is "common", "64" or "x32" for this file. 
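Each row in these tables follows the pattern "<number> <abi> <name> <entry point> [<compat entry point>]"; the syscallhdr.sh and syscalltbl.sh helpers added at the end of this patch read exactly those columns ("while read nr abi name entry compat"). As a rough sketch of the pipeline: for the row "0 common read sys_read", syscallhdr.sh emits "#define __NR_read 0" (with an empty prefix and offset), syscalltbl.sh emits "__SYSCALL_COMMON(0, sys_read, sys_read)", and syscall_64.c (added in the following patch) expands that invocation twice -- once to declare the entry point, once to produce a "[0] = sys_read," initializer. The stand-alone C sketch below imitates the same two-pass macro trick, but with a local list macro in place of the generated include; the demo_* names are made up for illustration only.

#include <stdio.h>

/* Stand-in for the generated syscall list: one X() invocation per table
 * row, mirroring what syscalltbl.sh emits (demo_* names are hypothetical). */
#define DEMO_SYSCALL_LIST(X) \
	X(0, demo_read) \
	X(1, demo_write)

/* Pass 1: define the entry points (the kernel declares them as
 * "extern asmlinkage void sym(void);" at this step). */
#define X_DECL(nr, sym) static void sym(void) { puts(#sym); }
DEMO_SYSCALL_LIST(X_DECL)
#undef X_DECL

typedef void (*sys_call_ptr_t)(void);

static void demo_ni_syscall(void) { puts("ENOSYS"); }

/* Pass 2: build the dispatch table, defaulting every slot to the
 * "not implemented" handler via a GCC range initializer, as syscall_64.c does. */
#define X_ENTRY(nr, sym) [nr] = sym,
static const sys_call_ptr_t demo_sys_call_table[] = {
	[0 ... 1] = demo_ni_syscall,
	DEMO_SYSCALL_LIST(X_ENTRY)
};
#undef X_ENTRY

int main(void)
{
	demo_sys_call_table[0]();	/* prints "demo_read" */
	demo_sys_call_table[1]();	/* prints "demo_write" */
	return 0;
}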
+# +0 common read sys_read +1 common write sys_write +2 common open sys_open +3 common close sys_close +4 common stat sys_newstat +5 common fstat sys_newfstat +6 common lstat sys_newlstat +7 common poll sys_poll +8 common lseek sys_lseek +9 common mmap sys_mmap +10 common mprotect sys_mprotect +11 common munmap sys_munmap +12 common brk sys_brk +13 64 rt_sigaction sys_rt_sigaction +14 common rt_sigprocmask sys_rt_sigprocmask +15 64 rt_sigreturn stub_rt_sigreturn +16 64 ioctl sys_ioctl +17 common pread64 sys_pread64 +18 common pwrite64 sys_pwrite64 +19 64 readv sys_readv +20 64 writev sys_writev +21 common access sys_access +22 common pipe sys_pipe +23 common select sys_select +24 common sched_yield sys_sched_yield +25 common mremap sys_mremap +26 common msync sys_msync +27 common mincore sys_mincore +28 common madvise sys_madvise +29 common shmget sys_shmget +30 common shmat sys_shmat +31 common shmctl sys_shmctl +32 common dup sys_dup +33 common dup2 sys_dup2 +34 common pause sys_pause +35 common nanosleep sys_nanosleep +36 common getitimer sys_getitimer +37 common alarm sys_alarm +38 common setitimer sys_setitimer +39 common getpid sys_getpid +40 common sendfile sys_sendfile64 +41 common socket sys_socket +42 common connect sys_connect +43 common accept sys_accept +44 common sendto sys_sendto +45 64 recvfrom sys_recvfrom +46 64 sendmsg sys_sendmsg +47 64 recvmsg sys_recvmsg +48 common shutdown sys_shutdown +49 common bind sys_bind +50 common listen sys_listen +51 common getsockname sys_getsockname +52 common getpeername sys_getpeername +53 common socketpair sys_socketpair +54 64 setsockopt sys_setsockopt +55 64 getsockopt sys_getsockopt +56 common clone stub_clone +57 common fork stub_fork +58 common vfork stub_vfork +59 64 execve stub_execve +60 common exit sys_exit +61 common wait4 sys_wait4 +62 common kill sys_kill +63 common uname sys_newuname +64 common semget sys_semget +65 common semop sys_semop +66 common semctl sys_semctl +67 common shmdt sys_shmdt +68 common msgget sys_msgget +69 common msgsnd sys_msgsnd +70 common msgrcv sys_msgrcv +71 common msgctl sys_msgctl +72 common fcntl sys_fcntl +73 common flock sys_flock +74 common fsync sys_fsync +75 common fdatasync sys_fdatasync +76 common truncate sys_truncate +77 common ftruncate sys_ftruncate +78 common getdents sys_getdents +79 common getcwd sys_getcwd +80 common chdir sys_chdir +81 common fchdir sys_fchdir +82 common rename sys_rename +83 common mkdir sys_mkdir +84 common rmdir sys_rmdir +85 common creat sys_creat +86 common link sys_link +87 common unlink sys_unlink +88 common symlink sys_symlink +89 common readlink sys_readlink +90 common chmod sys_chmod +91 common fchmod sys_fchmod +92 common chown sys_chown +93 common fchown sys_fchown +94 common lchown sys_lchown +95 common umask sys_umask +96 common gettimeofday sys_gettimeofday +97 common getrlimit sys_getrlimit +98 common getrusage sys_getrusage +99 common sysinfo sys_sysinfo +100 common times sys_times +101 64 ptrace sys_ptrace +102 common getuid sys_getuid +103 common syslog sys_syslog +104 common getgid sys_getgid +105 common setuid sys_setuid +106 common setgid sys_setgid +107 common geteuid sys_geteuid +108 common getegid sys_getegid +109 common setpgid sys_setpgid +110 common getppid sys_getppid +111 common getpgrp sys_getpgrp +112 common setsid sys_setsid +113 common setreuid sys_setreuid +114 common setregid sys_setregid +115 common getgroups sys_getgroups +116 common setgroups sys_setgroups +117 common setresuid sys_setresuid +118 common getresuid 
sys_getresuid +119 common setresgid sys_setresgid +120 common getresgid sys_getresgid +121 common getpgid sys_getpgid +122 common setfsuid sys_setfsuid +123 common setfsgid sys_setfsgid +124 common getsid sys_getsid +125 common capget sys_capget +126 common capset sys_capset +127 64 rt_sigpending sys_rt_sigpending +128 64 rt_sigtimedwait sys_rt_sigtimedwait +129 64 rt_sigqueueinfo sys_rt_sigqueueinfo +130 common rt_sigsuspend sys_rt_sigsuspend +131 64 sigaltstack sys_sigaltstack +132 common utime sys_utime +133 common mknod sys_mknod +134 64 uselib +135 common personality sys_personality +136 common ustat sys_ustat +137 common statfs sys_statfs +138 common fstatfs sys_fstatfs +139 common sysfs sys_sysfs +140 common getpriority sys_getpriority +141 common setpriority sys_setpriority +142 common sched_setparam sys_sched_setparam +143 common sched_getparam sys_sched_getparam +144 common sched_setscheduler sys_sched_setscheduler +145 common sched_getscheduler sys_sched_getscheduler +146 common sched_get_priority_max sys_sched_get_priority_max +147 common sched_get_priority_min sys_sched_get_priority_min +148 common sched_rr_get_interval sys_sched_rr_get_interval +149 common mlock sys_mlock +150 common munlock sys_munlock +151 common mlockall sys_mlockall +152 common munlockall sys_munlockall +153 common vhangup sys_vhangup +154 common modify_ldt sys_modify_ldt +155 common pivot_root sys_pivot_root +156 64 _sysctl sys_sysctl +157 common prctl sys_prctl +158 common arch_prctl sys_arch_prctl +159 common adjtimex sys_adjtimex +160 common setrlimit sys_setrlimit +161 common chroot sys_chroot +162 common sync sys_sync +163 common acct sys_acct +164 common settimeofday sys_settimeofday +165 common mount sys_mount +166 common umount2 sys_umount +167 common swapon sys_swapon +168 common swapoff sys_swapoff +169 common reboot sys_reboot +170 common sethostname sys_sethostname +171 common setdomainname sys_setdomainname +172 common iopl sys_iopl +173 common ioperm sys_ioperm +174 64 create_module +175 common init_module sys_init_module +176 common delete_module sys_delete_module +177 64 get_kernel_syms +178 64 query_module +179 common quotactl sys_quotactl +180 64 nfsservctl +181 common getpmsg +182 common putpmsg +183 common afs_syscall +184 common tuxcall +185 common security +186 common gettid sys_gettid +187 common readahead sys_readahead +188 common setxattr sys_setxattr +189 common lsetxattr sys_lsetxattr +190 common fsetxattr sys_fsetxattr +191 common getxattr sys_getxattr +192 common lgetxattr sys_lgetxattr +193 common fgetxattr sys_fgetxattr +194 common listxattr sys_listxattr +195 common llistxattr sys_llistxattr +196 common flistxattr sys_flistxattr +197 common removexattr sys_removexattr +198 common lremovexattr sys_lremovexattr +199 common fremovexattr sys_fremovexattr +200 common tkill sys_tkill +201 common time sys_time +202 common futex sys_futex +203 common sched_setaffinity sys_sched_setaffinity +204 common sched_getaffinity sys_sched_getaffinity +205 64 set_thread_area +206 64 io_setup sys_io_setup +207 common io_destroy sys_io_destroy +208 common io_getevents sys_io_getevents +209 64 io_submit sys_io_submit +210 common io_cancel sys_io_cancel +211 64 get_thread_area +212 common lookup_dcookie sys_lookup_dcookie +213 common epoll_create sys_epoll_create +214 64 epoll_ctl_old +215 64 epoll_wait_old +216 common remap_file_pages sys_remap_file_pages +217 common getdents64 sys_getdents64 +218 common set_tid_address sys_set_tid_address +219 common restart_syscall sys_restart_syscall +220 
common semtimedop sys_semtimedop +221 common fadvise64 sys_fadvise64 +222 64 timer_create sys_timer_create +223 common timer_settime sys_timer_settime +224 common timer_gettime sys_timer_gettime +225 common timer_getoverrun sys_timer_getoverrun +226 common timer_delete sys_timer_delete +227 common clock_settime sys_clock_settime +228 common clock_gettime sys_clock_gettime +229 common clock_getres sys_clock_getres +230 common clock_nanosleep sys_clock_nanosleep +231 common exit_group sys_exit_group +232 common epoll_wait sys_epoll_wait +233 common epoll_ctl sys_epoll_ctl +234 common tgkill sys_tgkill +235 common utimes sys_utimes +236 64 vserver +237 common mbind sys_mbind +238 common set_mempolicy sys_set_mempolicy +239 common get_mempolicy sys_get_mempolicy +240 common mq_open sys_mq_open +241 common mq_unlink sys_mq_unlink +242 common mq_timedsend sys_mq_timedsend +243 common mq_timedreceive sys_mq_timedreceive +244 64 mq_notify sys_mq_notify +245 common mq_getsetattr sys_mq_getsetattr +246 64 kexec_load sys_kexec_load +247 64 waitid sys_waitid +248 common add_key sys_add_key +249 common request_key sys_request_key +250 common keyctl sys_keyctl +251 common ioprio_set sys_ioprio_set +252 common ioprio_get sys_ioprio_get +253 common inotify_init sys_inotify_init +254 common inotify_add_watch sys_inotify_add_watch +255 common inotify_rm_watch sys_inotify_rm_watch +256 common migrate_pages sys_migrate_pages +257 common openat sys_openat +258 common mkdirat sys_mkdirat +259 common mknodat sys_mknodat +260 common fchownat sys_fchownat +261 common futimesat sys_futimesat +262 common newfstatat sys_newfstatat +263 common unlinkat sys_unlinkat +264 common renameat sys_renameat +265 common linkat sys_linkat +266 common symlinkat sys_symlinkat +267 common readlinkat sys_readlinkat +268 common fchmodat sys_fchmodat +269 common faccessat sys_faccessat +270 common pselect6 sys_pselect6 +271 common ppoll sys_ppoll +272 common unshare sys_unshare +273 64 set_robust_list sys_set_robust_list +274 64 get_robust_list sys_get_robust_list +275 common splice sys_splice +276 common tee sys_tee +277 common sync_file_range sys_sync_file_range +278 64 vmsplice sys_vmsplice +279 64 move_pages sys_move_pages +280 common utimensat sys_utimensat +281 common epoll_pwait sys_epoll_pwait +282 common signalfd sys_signalfd +283 common timerfd_create sys_timerfd_create +284 common eventfd sys_eventfd +285 common fallocate sys_fallocate +286 common timerfd_settime sys_timerfd_settime +287 common timerfd_gettime sys_timerfd_gettime +288 common accept4 sys_accept4 +289 common signalfd4 sys_signalfd4 +290 common eventfd2 sys_eventfd2 +291 common epoll_create1 sys_epoll_create1 +292 common dup3 sys_dup3 +293 common pipe2 sys_pipe2 +294 common inotify_init1 sys_inotify_init1 +295 64 preadv sys_preadv +296 64 pwritev sys_pwritev +297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo +298 common perf_event_open sys_perf_event_open +299 64 recvmmsg sys_recvmmsg +300 common fanotify_init sys_fanotify_init +301 common fanotify_mark sys_fanotify_mark +302 common prlimit64 sys_prlimit64 +303 common name_to_handle_at sys_name_to_handle_at +304 common open_by_handle_at sys_open_by_handle_at +305 common clock_adjtime sys_clock_adjtime +306 common syncfs sys_syncfs +307 64 sendmmsg sys_sendmmsg +308 common setns sys_setns +309 common getcpu sys_getcpu +310 64 process_vm_readv sys_process_vm_readv +311 64 process_vm_writev sys_process_vm_writev +312 common kcmp sys_kcmp +313 common finit_module sys_finit_module +314 common sched_setattr 
sys_sched_setattr +315 common sched_getattr sys_sched_getattr +316 common renameat2 sys_renameat2 +317 common seccomp sys_seccomp +318 common getrandom sys_getrandom +319 common memfd_create sys_memfd_create +320 common kexec_file_load sys_kexec_file_load +321 common bpf sys_bpf +322 64 execveat stub_execveat + +# +# x32-specific system call numbers start at 512 to avoid cache impact +# for native 64-bit operation. +# +512 x32 rt_sigaction compat_sys_rt_sigaction +513 x32 rt_sigreturn stub_x32_rt_sigreturn +514 x32 ioctl compat_sys_ioctl +515 x32 readv compat_sys_readv +516 x32 writev compat_sys_writev +517 x32 recvfrom compat_sys_recvfrom +518 x32 sendmsg compat_sys_sendmsg +519 x32 recvmsg compat_sys_recvmsg +520 x32 execve stub_x32_execve +521 x32 ptrace compat_sys_ptrace +522 x32 rt_sigpending compat_sys_rt_sigpending +523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait +524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo +525 x32 sigaltstack compat_sys_sigaltstack +526 x32 timer_create compat_sys_timer_create +527 x32 mq_notify compat_sys_mq_notify +528 x32 kexec_load compat_sys_kexec_load +529 x32 waitid compat_sys_waitid +530 x32 set_robust_list compat_sys_set_robust_list +531 x32 get_robust_list compat_sys_get_robust_list +532 x32 vmsplice compat_sys_vmsplice +533 x32 move_pages compat_sys_move_pages +534 x32 preadv compat_sys_preadv64 +535 x32 pwritev compat_sys_pwritev64 +536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +537 x32 recvmmsg compat_sys_recvmmsg +538 x32 sendmmsg compat_sys_sendmmsg +539 x32 process_vm_readv compat_sys_process_vm_readv +540 x32 process_vm_writev compat_sys_process_vm_writev +541 x32 setsockopt compat_sys_setsockopt +542 x32 getsockopt compat_sys_getsockopt +543 x32 io_setup compat_sys_io_setup +544 x32 io_submit compat_sys_io_submit +545 x32 execveat stub_x32_execveat diff --git a/arch/x86/entry/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh new file mode 100644 index 0000000..31fd5f1 --- /dev/null +++ b/arch/x86/entry/syscalls/syscallhdr.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +in="$1" +out="$2" +my_abis=`echo "($3)" | tr ',' '|'` +prefix="$4" +offset="$5" + +fileguard=_ASM_X86_`basename "$out" | sed \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ + -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` +grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( + echo "#ifndef ${fileguard}" + echo "#define ${fileguard} 1" + echo "" + + while read nr abi name entry ; do + if [ -z "$offset" ]; then + echo "#define __NR_${prefix}${name} $nr" + else + echo "#define __NR_${prefix}${name} ($offset + $nr)" + fi + done + + echo "" + echo "#endif /* ${fileguard} */" +) > "$out" diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh new file mode 100644 index 0000000..0e7f8ec --- /dev/null +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +in="$1" +out="$2" + +grep '^[0-9]' "$in" | sort -n | ( + while read nr abi name entry compat; do + abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` + if [ -n "$compat" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $compat)" + elif [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $entry)" + fi + done +) > "$out" -- cgit v1.1 From 00398a0018d1334fedabfeaabd0fa563121de612 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:41:06 +0200 Subject: x86/asm/entry: Move the vsyscall code to arch/x86/entry/vsyscall/ The vsyscall code is entry code too, so move it to arch/x86/entry/vsyscall/. Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 6 +- arch/x86/entry/syscall_32.c | 33 +++ arch/x86/entry/syscall_64.c | 32 +++ arch/x86/entry/vsyscall/Makefile | 7 + arch/x86/entry/vsyscall/vsyscall_64.c | 335 ++++++++++++++++++++++++++++++ arch/x86/entry/vsyscall/vsyscall_emu_64.S | 37 ++++ arch/x86/entry/vsyscall/vsyscall_gtod.c | 70 +++++++ arch/x86/entry/vsyscall/vsyscall_trace.h | 29 +++ 8 files changed, 547 insertions(+), 2 deletions(-) create mode 100644 arch/x86/entry/syscall_32.c create mode 100644 arch/x86/entry/syscall_64.c create mode 100644 arch/x86/entry/vsyscall/Makefile create mode 100644 arch/x86/entry/vsyscall/vsyscall_64.c create mode 100644 arch/x86/entry/vsyscall/vsyscall_emu_64.S create mode 100644 arch/x86/entry/vsyscall/vsyscall_gtod.c create mode 100644 arch/x86/entry/vsyscall/vsyscall_trace.h (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 9df72c8..b93cce1 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,8 +1,10 @@ # # Makefile for the x86 low level entry code # -obj-y := entry_$(BITS).o thunk_$(BITS).o +obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o + obj-y += vdso/ +obj-y += vsyscall/ -obj-$(CONFIG_IA32_EMULATION) += ia32entry.o +obj-$(CONFIG_IA32_EMULATION) += ia32entry.o syscall_32.o diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c new file mode 100644 index 0000000..3777189 --- /dev/null +++ b/arch/x86/entry/syscall_32.c @@ -0,0 +1,33 @@ +/* System call table for i386. */ + +#include +#include +#include +#include + +#ifdef CONFIG_IA32_EMULATION +#define SYM(sym, compat) compat +#else +#define SYM(sym, compat) sym +#define ia32_sys_call_table sys_call_table +#define __NR_ia32_syscall_max __NR_syscall_max +#endif + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; +#include +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), + +typedef asmlinkage void (*sys_call_ptr_t)(void); + +extern asmlinkage void sys_ni_syscall(void); + +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, +#include +}; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c new file mode 100644 index 0000000..4ac730b --- /dev/null +++ b/arch/x86/entry/syscall_64.c @@ -0,0 +1,32 @@ +/* System call table for x86-64. */ + +#include +#include +#include +#include +#include + +#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) + +#ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#else +# define __SYSCALL_X32(nr, sym, compat) /* nothing */ +#endif + +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include +#undef __SYSCALL_64 + +#define __SYSCALL_64(nr, sym, compat) [nr] = sym, + +extern void sys_ni_syscall(void); + +asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... 
__NR_syscall_max] = &sys_ni_syscall, +#include +}; diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile new file mode 100644 index 0000000..a9f4856 --- /dev/null +++ b/arch/x86/entry/vsyscall/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the x86 low level vsyscall code +# +obj-y := vsyscall_gtod.o + +obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o + diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c new file mode 100644 index 0000000..2dcc6ff --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2012-2014 Andy Lutomirski + * + * Based on the original implementation which is: + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Parts of the original code have been moved to arch/x86/vdso/vma.c + * + * This file implements vsyscall emulation. vsyscalls are a legacy ABI: + * Userspace can request certain kernel services by calling fixed + * addresses. This concept is problematic: + * + * - It interferes with ASLR. + * - It's awkward to write code that lives in kernel addresses but is + * callable by userspace at fixed addresses. + * - The whole concept is impossible for 32-bit compat userspace. + * - UML cannot easily virtualize a vsyscall. + * + * As of mid-2014, I believe that there is no new userspace code that + * will use a vsyscall if the vDSO is present. I hope that there will + * soon be no new userspace code that will ever use a vsyscall. + * + * The code in this file emulates vsyscalls when notified of a page + * fault to a vsyscall address. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "vsyscall_trace.h" + +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; + +static int __init vsyscall_setup(char *str) +{ + if (str) { + if (!strcmp("emulate", str)) + vsyscall_mode = EMULATE; + else if (!strcmp("native", str)) + vsyscall_mode = NATIVE; + else if (!strcmp("none", str)) + vsyscall_mode = NONE; + else + return -EINVAL; + + return 0; + } + + return -EINVAL; +} +early_param("vsyscall", vsyscall_setup); + +static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, + const char *message) +{ + if (!show_unhandled_signals) + return; + + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); +} + +static int addr_to_vsyscall_nr(unsigned long addr) +{ + int nr; + + if ((addr & ~0xC00UL) != VSYSCALL_ADDR) + return -EINVAL; + + nr = (addr & 0xC00UL) >> 10; + if (nr >= 3) + return -EINVAL; + + return nr; +} + +static bool write_ok_or_segv(unsigned long ptr, size_t size) +{ + /* + * XXX: if access_ok, get_user, and put_user handled + * sig_on_uaccess_error, this could go away. 
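To make the address arithmetic in addr_to_vsyscall_nr() above concrete: the single legacy vsyscall page holds three 1024-byte slots (see the .balign 1024 padding in vsyscall_emu_64.S later in this patch), so bits 10-11 of the faulting address select the emulated call. Below is a minimal userspace restatement of that decoding, a sketch only: the demo_* names are hypothetical and the address constant is assumed to match the usual x86-64 VSYSCALL_ADDR.

#include <stdio.h>
#include <errno.h>

#define DEMO_VSYSCALL_ADDR	0xffffffffff600000UL

/* Same checks as addr_to_vsyscall_nr(): the address must lie in the
 * vsyscall page and sit exactly on one of the three 1024-byte slots. */
static int demo_addr_to_vsyscall_nr(unsigned long addr)
{
	int nr;

	if ((addr & ~0xC00UL) != DEMO_VSYSCALL_ADDR)
		return -EINVAL;		/* outside the page, or not slot-aligned */

	nr = (addr & 0xC00UL) >> 10;	/* 0x000 -> 0, 0x400 -> 1, 0x800 -> 2 */
	if (nr >= 3)
		return -EINVAL;		/* slot 3 (+0xC00) is unused */

	return nr;
}

int main(void)
{
	printf("%d\n", demo_addr_to_vsyscall_nr(DEMO_VSYSCALL_ADDR + 0x000));	/* 0 */
	printf("%d\n", demo_addr_to_vsyscall_nr(DEMO_VSYSCALL_ADDR + 0x400));	/* 1 */
	printf("%d\n", demo_addr_to_vsyscall_nr(DEMO_VSYSCALL_ADDR + 0x800));	/* 2 */
	printf("%d\n", demo_addr_to_vsyscall_nr(DEMO_VSYSCALL_ADDR + 0x123));	/* -EINVAL: misaligned */
	return 0;
}

emulate_vsyscall() then maps slot 0, 1 and 2 to __NR_gettimeofday, __NR_time and __NR_getcpu respectively, as the switch statement further down shows.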
+ */ + + if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { + siginfo_t info; + struct thread_struct *thread = &current->thread; + + thread->error_code = 6; /* user fault, no page, write */ + thread->cr2 = ptr; + thread->trap_nr = X86_TRAP_PF; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void __user *)ptr; + + force_sig_info(SIGSEGV, &info, current); + return false; + } else { + return true; + } +} + +bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +{ + struct task_struct *tsk; + unsigned long caller; + int vsyscall_nr, syscall_nr, tmp; + int prev_sig_on_uaccess_error; + long ret; + + /* + * No point in checking CS -- the only way to get here is a user mode + * trap to a high address, which means that we're in 64-bit user code. + */ + + WARN_ON_ONCE(address != regs->ip); + + if (vsyscall_mode == NONE) { + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall attempted with vsyscall=none"); + return false; + } + + vsyscall_nr = addr_to_vsyscall_nr(address); + + trace_emulate_vsyscall(vsyscall_nr); + + if (vsyscall_nr < 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); + goto sigsegv; + } + + if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "vsyscall with bad stack (exploit attempt?)"); + goto sigsegv; + } + + tsk = current; + + /* + * Check for access_ok violations and find the syscall nr. + * + * NULL is a valid user pointer (in the access_ok sense) on 32-bit and + * 64-bit, so we don't need to special-case it here. For all the + * vsyscalls, NULL means "don't write anything" not "write it at + * address 0". + */ + switch (vsyscall_nr) { + case 0: + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || + !write_ok_or_segv(regs->si, sizeof(struct timezone))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_gettimeofday; + break; + + case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_time; + break; + + case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_getcpu; + break; + } + + /* + * Handle seccomp. regs->ip must be the original value. + * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. + * + * We could optimize the seccomp disabled case, but performance + * here doesn't matter. + */ + regs->orig_ax = syscall_nr; + regs->ax = -ENOSYS; + tmp = secure_computing(); + if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { + warn_bad_vsyscall(KERN_DEBUG, regs, + "seccomp tried to change syscall nr or ip"); + do_exit(SIGSYS); + } + regs->orig_ax = -1; + if (tmp) + goto do_ret; /* skip requested */ + + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. 
+ */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; + + ret = -EFAULT; + switch (vsyscall_nr) { + case 0: + ret = sys_gettimeofday( + (struct timeval __user *)regs->di, + (struct timezone __user *)regs->si); + break; + + case 1: + ret = sys_time((time_t __user *)regs->di); + break; + + case 2: + ret = sys_getcpu((unsigned __user *)regs->di, + (unsigned __user *)regs->si, + NULL); + break; + } + + current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + +check_fault: + if (ret == -EFAULT) { + /* Bad news -- userspace fed a bad pointer to a vsyscall. */ + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall fault (exploit attempt?)"); + + /* + * If we failed to generate a signal for any reason, + * generate one here. (This should be impossible.) + */ + if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && + !sigismember(&tsk->pending.signal, SIGSEGV))) + goto sigsegv; + + return true; /* Don't emulate the ret. */ + } + + regs->ax = ret; + +do_ret: + /* Emulate a ret instruction. */ + regs->ip = caller; + regs->sp += 8; + return true; + +sigsegv: + force_sig(SIGSEGV, current); + return true; +} + +/* + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: + */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ + return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, +}; + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ +#ifdef CONFIG_IA32_EMULATION + if (!mm || mm->context.ia32_compat) + return NULL; +#endif + if (vsyscall_mode == NONE) + return NULL; + return &gate_vma; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(mm); + + if (!vma) + return 0; + + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} + +/* + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. + */ +int in_gate_area_no_mm(unsigned long addr) +{ + return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; +} + +void __init map_vsyscall(void) +{ + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + + if (vsyscall_mode != NONE) + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? 
PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); + + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != + (unsigned long)VSYSCALL_ADDR); +} diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S new file mode 100644 index 0000000..c9596a9 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S @@ -0,0 +1,37 @@ +/* + * vsyscall_emu_64.S: Vsyscall emulation page + * + * Copyright (c) 2011 Andy Lutomirski + * + * Subject to the GNU General Public License, version 2 + */ + +#include + +#include +#include +#include + +__PAGE_ALIGNED_DATA + .globl __vsyscall_page + .balign PAGE_SIZE, 0xcc + .type __vsyscall_page, @object +__vsyscall_page: + + mov $__NR_gettimeofday, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_time, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_getcpu, %rax + syscall + ret + + .balign 4096, 0xcc + + .size __vsyscall_page, 4096 diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c new file mode 100644 index 0000000..51e3304 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Modified for x86 32 bit architecture by + * Stefani Seibold + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. + * + */ + +#include +#include +#include + +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); + +void update_vsyscall_tz(void) +{ + vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; + vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + + gtod_write_begin(vdata); + + /* copy vsyscall data */ + vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->cycle_last = tk->tkr_mono.cycle_last; + vdata->mask = tk->tkr_mono.mask; + vdata->mult = tk->tkr_mono.mult; + vdata->shift = tk->tkr_mono.shift; + + vdata->wall_time_sec = tk->xtime_sec; + vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec + + ((u64)tk->wall_to_monotonic.tv_nsec + << tk->tkr_mono.shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; + vdata->monotonic_time_sec++; + } + + vdata->wall_time_coarse_sec = tk->xtime_sec; + vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift); + + vdata->monotonic_time_coarse_sec = + vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_coarse_nsec = + vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + + while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { + vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; + vdata->monotonic_time_coarse_sec++; + } + + gtod_write_end(vdata); +} diff --git a/arch/x86/entry/vsyscall/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h new file mode 100644 index 0000000..9dd7359 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_trace.h @@ -0,0 +1,29 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsyscall + +#if !defined(__VSYSCALL_TRACE_H) || 
defined(TRACE_HEADER_MULTI_READ) +#define __VSYSCALL_TRACE_H + +#include + +TRACE_EVENT(emulate_vsyscall, + + TP_PROTO(int nr), + + TP_ARGS(nr), + + TP_STRUCT__entry(__field(int, nr)), + + TP_fast_assign( + __entry->nr = nr; + ), + + TP_printk("nr = %d", __entry->nr) +); + +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/ +#define TRACE_INCLUDE_FILE vsyscall_trace +#include -- cgit v1.1 From 5ca6f70f387b4f82903037cc3c5488e2c97dcdbc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 4 Jun 2015 13:24:29 -0700 Subject: x86/asm/entry/64: Remove pointless jump to irq_return INTERRUPT_RETURN turns into a jmp instruction. There's no need for extra indirection. Signed-off-by: Andy Lutomirski Cc: Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2f2318653dbad284a59311f13f08cea71298fd7c.1433449436.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f7380ea..3177066 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -663,8 +663,6 @@ retint_kernel: restore_c_regs_and_iret: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 - -irq_return: INTERRUPT_RETURN ENTRY(native_iret) @@ -1432,7 +1430,7 @@ nmi_restore: /* Clear the NMI executing stack variable */ movq $0, 5*8(%rsp) - jmp irq_return + INTERRUPT_RETURN END(nmi) ENTRY(ignore_sysret) -- cgit v1.1 From 61b1e3e782d6784b714c0d80de529e0737d0e79c Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 2 Jun 2015 19:35:10 +0200 Subject: x86/asm/entry/32: Simplify the zeroing of pt_regs->r8..r11 in the int80 code path 32-bit syscall entry points do not save the complete pt_regs struct, they leave some fields uninitialized. However, they must be careful to not leak uninitialized data in pt_regs->r8..r11 to ptrace users. CLEAR_RREGS macro is used to zero these fields out when needed. However, in the int80 code path this zeroing is unconditional. This patch simplifies it by storing zeroes there right away, when pt_regs is constructed on stack. This uses shorter instructions: text data bss dec hex filename 1423 0 0 1423 58f ia32entry.o.before 1407 0 0 1407 57f ia32entry.o Compile-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433266510-2938-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index f167674..f00a409 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -421,6 +421,10 @@ ia32_badarg: movq $-EFAULT,%rax jmp ia32_sysret +ia32_ret_from_sys_call: + CLEAR_RREGS + jmp int_ret_from_sys_call + /* * Emulated IA32 system calls via int 0x80. 
* @@ -462,8 +466,12 @@ ENTRY(ia32_syscall) pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ + pushq $0 /* pt_regs->r8 */ + pushq $0 /* pt_regs->r9 */ + pushq $0 /* pt_regs->r10 */ + pushq $0 /* pt_regs->r11 */ cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -481,13 +489,10 @@ ia32_do_call: ia32_sysret: movq %rax,RAX(%rsp) 1: -ia32_ret_from_sys_call: - CLEAR_RREGS jmp int_ret_from_sys_call ia32_tracesys: SAVE_EXTRA_REGS - CLEAR_RREGS movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ -- cgit v1.1 From ef0cd5dc25404594f832dad9133abae52e3b2fa3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 2 Jun 2015 21:04:01 +0200 Subject: x86/asm/entry/32: Open-code CLEAR_RREGS This macro is small, has only four callsites, and one of them is slightly different using a conditional parameter. A few saved lines aren't worth the resulting obfuscation. Generated machine code is identical. Signed-off-by: Denys Vlasenko [ Added comments. ] Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433271842-9139-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index f00a409..8a45d2c 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -29,15 +29,6 @@ .section .entry.text, "ax" - /* clobbers %rax */ - .macro CLEAR_RREGS _r9=rax - xorl %eax,%eax - movq %rax,R11(%rsp) - movq %rax,R10(%rsp) - movq %\_r9,R9(%rsp) - movq %rax,R8(%rsp) - .endm - /* * Reload arg registers from stack in case ptrace changed them. 
* We don't reload %eax because syscall_trace_enter() returned @@ -243,7 +234,11 @@ sysexit_from_sys_call: TRACE_IRQS_OFF testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz \exit - CLEAR_RREGS + xorl %eax, %eax /* do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) jmp int_with_check .endm @@ -267,7 +262,11 @@ sysenter_tracesys: jz sysenter_auditsys #endif SAVE_EXTRA_REGS - CLEAR_RREGS + xorl %eax, %eax /* do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ @@ -407,7 +406,11 @@ cstar_tracesys: #endif xchgl %r9d,%ebp SAVE_EXTRA_REGS - CLEAR_RREGS r9 + xorl %eax, %eax /* do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %r9, R9(%rsp) + movq %rax, R8(%rsp) movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ @@ -422,7 +425,11 @@ ia32_badarg: jmp ia32_sysret ia32_ret_from_sys_call: - CLEAR_RREGS + xorl %eax, %eax /* do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) jmp int_ret_from_sys_call /* -- cgit v1.1 From 73cbf687914fd5f4ef88a42a55784fd28b7450cf Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 2 Jun 2015 21:04:02 +0200 Subject: x86/asm/entry/32: Open-code LOAD_ARGS32 This macro is small, has only three callsites, and one of them is slightly different using a conditional parameter. A few saved lines aren't worth the resulting obfuscation. Generated machine code is identical. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433271842-9139-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 54 +++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 25 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index 8a45d2c..56f819e 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -29,28 +29,6 @@ .section .entry.text, "ax" - /* - * Reload arg registers from stack in case ptrace changed them. - * We don't reload %eax because syscall_trace_enter() returned - * the %rax value we should see. Instead, we just truncate that - * value to 32 bits again as we did on entry from user mode. - * If it's a new value set by user_regset during entry tracing, - * this matches the normal truncation of the user-mode value. - * If it's -1 to make us punt the syscall, then (u32)-1 is still - * an appropriately invalid value. - */ - .macro LOAD_ARGS32 _r9=0 - .if \_r9 - movl R9(%rsp),%r9d - .endif - movl RCX(%rsp),%ecx - movl RDX(%rsp),%edx - movl RSI(%rsp),%esi - movl RDI(%rsp),%edi - movl %eax,%eax /* zero extension */ - .endm - - #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret32) swapgs @@ -269,7 +247,14 @@ sysenter_tracesys: movq %rax, R8(%rsp) movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + + /* Reload arg registers from stack. 
(see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + RESTORE_EXTRA_REGS jmp sysenter_do_call ENDPROC(ia32_sysenter_target) @@ -413,7 +398,15 @@ cstar_tracesys: movq %rax, R8(%rsp) movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ + movl R9(%rsp),%r9d + + /* Reload arg registers from stack. (see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + RESTORE_EXTRA_REGS xchgl %ebp,%r9d jmp cstar_do_call @@ -502,7 +495,18 @@ ia32_tracesys: SAVE_EXTRA_REGS movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + /* + * Reload arg registers from stack in case ptrace changed them. + * Don't reload %eax because syscall_trace_enter() returned + * the %rax value we should see. But do truncate it to 32 bits. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. + */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS jmp ia32_do_call END(ia32_syscall) -- cgit v1.1 From 53e9accf0f7682d717c7b578b6e01fd297ba6630 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 3 Jun 2015 14:56:09 +0200 Subject: x86/asm/entry/32: Do not use R9 in SYSCALL32 entry point SYSENTER and SYSCALL 32-bit entry points differ in handling of arg2 and arg6. SYSENTER: * ecx arg2 * ebp user stack * 0(%ebp) arg6 SYSCALL: * ebp arg2 * esp user stack * 0(%esp) arg6 Sysenter code loads 0(%ebp) to %ebp right away. (This destroys %ebp. It means we do not preserve it on return. It's not causing problems since userspace VDSO code does not depend on it, and SYSENTER insn can't be sanely used outside of VDSO). Syscall code loads 0(%ebp) to %r9. This allows to eliminate one MOV insn (r9 is a register where arg6 should be for 64-bit ABI), but on audit/ptrace code paths this requires juggling of r9 and ebp: (1) ptrace expects arg6 to be in pt_regs->bp; (2) r9 is callee-clobbered register and needs to be saved/restored around calls to C functions. This patch changes syscall code to load 0(%ebp) to %ebp, making it more similar to sysenter code. It's a bit smaller: text data bss dec hex filename 1407 0 0 1407 57f ia32entry.o.before 1391 0 0 1391 56f ia32entry.o To preserve ABI compat, we restore ebp on exit. Run-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433336169-18964-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index 56f819e..6321915 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -323,7 +323,7 @@ ENTRY(ia32_cstar_target) * 32bit zero extended */ ASM_STAC -1: movl (%r8),%r9d +1: movl (%r8),%ebp _ASM_EXTABLE(1b,ia32_badarg) ASM_CLAC orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) @@ -333,7 +333,7 @@ ENTRY(ia32_cstar_target) cstar_do_call: /* 32bit syscall -> 64bit C ABI argument conversion */ movl %edi,%r8d /* arg5 */ - /* r9 already loaded */ /* arg6 */ + movl %ebp,%r9d /* arg6 */ xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ movl %ebx,%edi /* arg1 */ movl %edx,%edx /* arg3 (zero extension) */ @@ -349,6 +349,7 @@ cstar_dispatch: jnz sysretl_audit sysretl_from_sys_call: andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RCX(%rsp), %ebp RESTORE_RSI_RDI_RDX movl RIP(%rsp),%ecx movl EFLAGS(%rsp),%r11d @@ -375,9 +376,8 @@ sysretl_from_sys_call: #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: - movl %r9d,R9(%rsp) /* register to be clobbered by call */ auditsys_entry_common - movl R9(%rsp),%r9d /* reload 6th syscall arg */ + movl %ebp, %r9d /* reload 6th syscall arg */ jmp cstar_dispatch sysretl_audit: @@ -389,16 +389,14 @@ cstar_tracesys: testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz cstar_auditsys #endif - xchgl %r9d,%ebp SAVE_EXTRA_REGS xorl %eax, %eax /* do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) - movq %r9, R9(%rsp) + movq %rax, R9(%rsp) movq %rax, R8(%rsp) - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - movl R9(%rsp),%r9d + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter /* Reload arg registers from stack. (see sysenter_tracesys) */ movl RCX(%rsp), %ecx @@ -408,8 +406,7 @@ cstar_tracesys: movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS - xchgl %ebp,%r9d - jmp cstar_do_call + jmp cstar_do_call END(ia32_cstar_target) ia32_badarg: -- cgit v1.1 From 54ad726c51b7f7dffcc1dc379bedadee19f742f7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jun 2015 13:02:28 +0200 Subject: x86/asm/entry/32: Improve code readability Make the 64-bit compat 32-bit syscall entry code a bit more readable: - eliminate whitespace noise - use consistent vertical spacing - use consistent assembly coding style similar to entry_64.S - fix various comments No code changed: arch/x86/entry/ia32entry.o: text data bss dec hex filename 1391 0 0 1391 56f ia32entry.o.before 1391 0 0 1391 56f ia32entry.o.after md5: f28501dcc366e68b557313942c6496d6 ia32entry.o.before.asm f28501dcc366e68b557313942c6496d6 ia32entry.o.after.asm Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 287 +++++++++++++++++++++++---------------------- 1 file changed, 146 insertions(+), 141 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index 6321915..4bb9f7b 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -1,15 +1,14 @@ /* - * Compatibility mode system call entry point for x86-64. - * + * Compatibility mode system call entry point for x86-64. + * * Copyright 2000-2002 Andi Kleen, SuSE Labs. - */ - + */ #include "calling.h" #include #include #include -#include -#include +#include +#include #include #include #include @@ -20,11 +19,11 @@ /* Avoid __ASSEMBLER__'ifying just for this. */ #include #define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 +#define __AUDIT_ARCH_LE 0x40000000 #ifndef CONFIG_AUDITSYSCALL -#define sysexit_audit ia32_ret_from_sys_call -#define sysretl_audit ia32_ret_from_sys_call +# define sysexit_audit ia32_ret_from_sys_call +# define sysretl_audit ia32_ret_from_sys_call #endif .section .entry.text, "ax" @@ -37,7 +36,7 @@ ENDPROC(native_usergs_sysret32) #endif /* - * 32bit SYSENTER instruction entry. + * 32-bit SYSENTER instruction entry. * * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. * IF and VM in rflags are cleared (IOW: interrupts are off). @@ -79,7 +78,7 @@ ENTRY(ia32_sysenter_target) pushq %rbp /* pt_regs->sp */ pushfq /* pt_regs->flags */ pushq $__USER32_CS /* pt_regs->cs */ - pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -87,15 +86,15 @@ ENTRY(ia32_sysenter_target) pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ /* * no need to do an access_ok check here because rbp has been - * 32bit zero extended + * 32-bit zero extended */ ASM_STAC -1: movl (%rbp),%ebp - _ASM_EXTABLE(1b,ia32_badarg) +1: movl (%rbp), %ebp + _ASM_EXTABLE(1b, ia32_badarg) ASM_CLAC /* @@ -103,26 +102,26 @@ ENTRY(ia32_sysenter_target) * ourselves. To save a few cycles, we can check whether * NT was set instead of doing an unconditional popfq. 
*/ - testl $X86_EFLAGS_NT,EFLAGS(%rsp) - jnz sysenter_fix_flags + testl $X86_EFLAGS_NT, EFLAGS(%rsp) + jnz sysenter_fix_flags sysenter_flags_fixed: orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysenter_tracesys + jnz sysenter_tracesys sysenter_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ sysenter_dispatch: - cmpq $(IA32_NR_syscalls-1),%rax + cmpq $(IA32_NR_syscalls-1), %rax ja 1f - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -139,21 +138,21 @@ sysexit_from_sys_call: * This code path is still called 'sysexit' because it pairs * with 'sysenter' and it uses the SYSENTER calling convention. */ - andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RIP(%rsp),%ecx /* User %eip */ + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RIP(%rsp), %ecx /* User %eip */ RESTORE_RSI_RDI - xorl %edx,%edx /* avoid info leaks */ - xorq %r8,%r8 - xorq %r9,%r9 - xorq %r10,%r10 - movl EFLAGS(%rsp),%r11d /* User eflags */ + xorl %edx, %edx /* Do not leak kernel information */ + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + movl EFLAGS(%rsp), %r11d /* User eflags */ TRACE_IRQS_ON /* * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, * since it avoids a dicey window with interrupts enabled. 
*/ - movl RSP(%rsp),%esp + movl RSP(%rsp), %esp /* * USERGS_SYSRET32 does: @@ -179,51 +178,51 @@ sysexit_from_sys_call: #ifdef CONFIG_AUDITSYSCALL .macro auditsys_entry_common - movl %esi,%r8d /* 5th arg: 4th syscall arg */ - movl %ecx,%r9d /*swap with edx*/ - movl %edx,%ecx /* 4th arg: 3rd syscall arg */ - movl %r9d,%edx /* 3rd arg: 2nd syscall arg */ - movl %ebx,%esi /* 2nd arg: 1st syscall arg */ - movl %eax,%edi /* 1st arg: syscall number */ - call __audit_syscall_entry - movl ORIG_RAX(%rsp),%eax /* reload syscall number */ - movl %ebx,%edi /* reload 1st syscall arg */ - movl RCX(%rsp),%esi /* reload 2nd syscall arg */ - movl RDX(%rsp),%edx /* reload 3rd syscall arg */ - movl RSI(%rsp),%ecx /* reload 4th syscall arg */ - movl RDI(%rsp),%r8d /* reload 5th syscall arg */ + movl %esi, %r8d /* 5th arg: 4th syscall arg */ + movl %ecx, %r9d /* swap with edx */ + movl %edx, %ecx /* 4th arg: 3rd syscall arg */ + movl %r9d, %edx /* 3rd arg: 2nd syscall arg */ + movl %ebx, %esi /* 2nd arg: 1st syscall arg */ + movl %eax, %edi /* 1st arg: syscall number */ + call __audit_syscall_entry + movl ORIG_RAX(%rsp), %eax /* reload syscall number */ + movl %ebx, %edi /* reload 1st syscall arg */ + movl RCX(%rsp), %esi /* reload 2nd syscall arg */ + movl RDX(%rsp), %edx /* reload 3rd syscall arg */ + movl RSI(%rsp), %ecx /* reload 4th syscall arg */ + movl RDI(%rsp), %r8d /* reload 5th syscall arg */ .endm .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_ret_from_sys_call TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - movl %eax,%esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ - movzbl %al,%edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movq RAX(%rsp),%rax /* reload syscall return value */ - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + movl %eax, %esi /* second arg, syscall return value */ + cmpl $-MAX_ERRNO, %eax /* is it an error ? 
*/ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ + movzbl %al, %edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movq RAX(%rsp), %rax /* reload syscall return value */ + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz \exit - xorl %eax, %eax /* do not leak kernel information */ + testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz \exit + xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) movq %rax, R9(%rsp) movq %rax, R8(%rsp) - jmp int_with_check + jmp int_with_check .endm sysenter_auditsys: auditsys_entry_common - movl %ebp,%r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch + movl %ebp, %r9d /* reload 6th syscall arg */ + jmp sysenter_dispatch sysexit_audit: auditsys_exit sysexit_from_sys_call @@ -232,7 +231,7 @@ sysexit_audit: sysenter_fix_flags: pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) popfq - jmp sysenter_flags_fixed + jmp sysenter_flags_fixed sysenter_tracesys: #ifdef CONFIG_AUDITSYSCALL @@ -240,12 +239,12 @@ sysenter_tracesys: jz sysenter_auditsys #endif SAVE_EXTRA_REGS - xorl %eax, %eax /* do not leak kernel information */ + xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) movq %rax, R9(%rsp) movq %rax, R8(%rsp) - movq %rsp,%rdi /* &pt_regs -> arg1 */ + movq %rsp, %rdi /* &pt_regs -> arg1 */ call syscall_trace_enter /* Reload arg registers from stack. (see sysenter_tracesys) */ @@ -253,23 +252,23 @@ sysenter_tracesys: movl RDX(%rsp), %edx movl RSI(%rsp), %esi movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ + movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS jmp sysenter_do_call ENDPROC(ia32_sysenter_target) /* - * 32bit SYSCALL instruction entry. + * 32-bit SYSCALL instruction entry. * - * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, * then loads new ss, cs, and rip from previously programmed MSRs. * rflags gets masked by a value from another MSR (so CLD and CLAC * are not needed). SYSCALL does not save anything on the stack * and does not change rsp. * * Note: rflags saving+masking-with-MSR happens only in Long mode - * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). + * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). * Don't get confused: rflags saving+masking depends on Long Mode Active bit * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). @@ -296,12 +295,12 @@ ENTRY(ia32_cstar_target) * it is too small to ever cause noticeable irq latency. 
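The SYSCALL description above mentions "previously programmed MSRs" and an rflags mask "from another MSR". For context, those MSRs are written once per CPU at boot time. The fragment below is a from-memory sketch of that setup, not a quote of the kernel's actual syscall_init(): the function name is invented, the extern declarations stand in for the assembler entry symbols, and the exact mask and selector values may differ from the kernel of this period.

        #include <asm/msr.h>
        #include <asm/processor-flags.h>
        #include <asm/segment.h>

        extern char system_call[], ia32_cstar_target[], ia32_sysenter_target[];

        static void setup_syscall_msrs(void)            /* sketch only */
        {
                /* SYSCALL/SYSRET code segment bases: */
                wrmsrl(MSR_STAR, ((u64)__USER32_CS) << 48 | ((u64)__KERNEL_CS) << 32);

                /* 64-bit and 32-bit (compat) SYSCALL entry points: */
                wrmsrl(MSR_LSTAR, (unsigned long)system_call);
                wrmsrl(MSR_CSTAR, (unsigned long)ia32_cstar_target);

                /* rflags bits cleared on SYSCALL entry -- the masking noted above: */
                wrmsrl(MSR_SYSCALL_MASK,
                       X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF |
                       X86_EFLAGS_IOPL | X86_EFLAGS_AC | X86_EFLAGS_NT);

                /* SYSENTER takes cs/sp/ip from its own MSR triple: */
                wrmsrl_safe(MSR_IA32_SYSENTER_CS,  (u64)__KERNEL_CS);
                wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)(unsigned long)ia32_sysenter_target);
        }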
*/ SWAPGS_UNSAFE_STACK - movl %esp,%r8d - movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp + movl %esp, %r8d + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax + movl %eax, %eax /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ @@ -314,55 +313,58 @@ ENTRY(ia32_cstar_target) pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ pushq %rbp /* pt_regs->cx */ - movl %ebp,%ecx + movl %ebp, %ecx pushq $-ENOSYS /* pt_regs->ax */ - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ /* - * no need to do an access_ok check here because r8 has been - * 32bit zero extended + * No need to do an access_ok check here because r8 has been + * 32-bit zero extended: */ ASM_STAC -1: movl (%r8),%ebp - _ASM_EXTABLE(1b,ia32_badarg) +1: movl (%r8), %ebp + _ASM_EXTABLE(1b, ia32_badarg) ASM_CLAC orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz cstar_tracesys cstar_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + cstar_dispatch: - cmpq $(IA32_NR_syscalls-1),%rax + cmpq $(IA32_NR_syscalls-1), %rax ja 1f - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysretl_audit + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysretl_audit + sysretl_from_sys_call: - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) movl RCX(%rsp), %ebp RESTORE_RSI_RDI_RDX - movl RIP(%rsp),%ecx - movl EFLAGS(%rsp),%r11d - xorq %r10,%r10 - xorq %r9,%r9 - xorq %r8,%r8 + movl RIP(%rsp), %ecx + movl EFLAGS(%rsp), %r11d + xorq %r10, %r10 + xorq %r9, %r9 + xorq %r8, %r8 TRACE_IRQS_ON - movl RSP(%rsp),%esp + movl RSP(%rsp), %esp /* - * 64bit->32bit SYSRET restores eip from ecx, + * 64-bit->32-bit SYSRET restores eip from ecx, * eflags from r11 (but RF and VM bits are forced to 0), * cs and ss are loaded from MSRs. - * (Note: 32bit->32bit SYSRET is different: since r11 + * (Note: 32-bit->32-bit SYSRET is different: since r11 * does not exist, it merely sets eflags.IF=1). 
* * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss @@ -377,8 +379,8 @@ sysretl_from_sys_call: #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ - jmp cstar_dispatch + movl %ebp, %r9d /* reload 6th syscall arg */ + jmp cstar_dispatch sysretl_audit: auditsys_exit sysretl_from_sys_call @@ -386,16 +388,16 @@ sysretl_audit: cstar_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz cstar_auditsys + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz cstar_auditsys #endif SAVE_EXTRA_REGS - xorl %eax, %eax /* do not leak kernel information */ + xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) movq %rax, R9(%rsp) movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ + movq %rsp, %rdi /* &pt_regs -> arg1 */ call syscall_trace_enter /* Reload arg registers from stack. (see sysenter_tracesys) */ @@ -403,24 +405,24 @@ cstar_tracesys: movl RDX(%rsp), %edx movl RSI(%rsp), %esi movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ + movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS jmp cstar_do_call END(ia32_cstar_target) - + ia32_badarg: ASM_CLAC - movq $-EFAULT,%rax - jmp ia32_sysret + movq $-EFAULT, %rax + jmp ia32_sysret ia32_ret_from_sys_call: - xorl %eax, %eax /* do not leak kernel information */ + xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) movq %rax, R9(%rsp) movq %rax, R8(%rsp) - jmp int_ret_from_sys_call + jmp int_ret_from_sys_call /* * Emulated IA32 system calls via int 0x80. @@ -454,7 +456,7 @@ ENTRY(ia32_syscall) ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax + movl %eax, %eax /* Construct struct pt_regs on stack (iret frame is already on stack) */ pushq %rax /* pt_regs->orig_ax */ @@ -468,30 +470,33 @@ ENTRY(ia32_syscall) pushq $0 /* pt_regs->r10 */ pushq $0 /* pt_regs->r11 */ cld - sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_tracesys - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_tracesys ia32_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ - cmpq $(IA32_NR_syscalls-1),%rax + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + cmpq $(IA32_NR_syscalls-1), %rax ja 1f - call *ia32_sys_call_table(,%rax,8) # xxx: rip relative + + call *ia32_sys_call_table(, %rax, 8) /* RIP relative */ + ia32_sysret: - movq %rax,RAX(%rsp) + movq %rax, RAX(%rsp) 1: - jmp int_ret_from_sys_call + jmp int_ret_from_sys_call ia32_tracesys: SAVE_EXTRA_REGS - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter /* * Reload arg registers from stack in case ptrace 
changed them. * Don't reload %eax because syscall_trace_enter() returned @@ -503,33 +508,33 @@ ia32_tracesys: movl RDX(%rsp), %edx movl RSI(%rsp), %esi movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ + movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS - jmp ia32_do_call + jmp ia32_do_call END(ia32_syscall) .macro PTREGSCALL label, func ALIGN GLOBAL(\label) - leaq \func(%rip),%rax - jmp ia32_ptregs_common + leaq \func(%rip), %rax + jmp ia32_ptregs_common .endm - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn - PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_fork, sys_fork - PTREGSCALL stub32_vfork, sys_vfork + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn + PTREGSCALL stub32_sigreturn, sys32_sigreturn + PTREGSCALL stub32_fork, sys_fork + PTREGSCALL stub32_vfork, sys_vfork ALIGN GLOBAL(stub32_clone) - leaq sys_clone(%rip),%rax + leaq sys_clone(%rip), %rax mov %r8, %rcx - jmp ia32_ptregs_common + jmp ia32_ptregs_common ALIGN ia32_ptregs_common: SAVE_EXTRA_REGS 8 - call *%rax + call *%rax RESTORE_EXTRA_REGS 8 ret END(ia32_ptregs_common) -- cgit v1.1 From 5cdc683b7d8b3341a3d18e0c5498bc1e4f3fb990 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 3 Jun 2015 15:58:49 +0200 Subject: x86/asm/entry/32: Explain the stub32_clone logic The reason for copying of %r8 to %rcx is quite non-obvious. Add a comment which explains why it is done. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433339930-20880-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index 4bb9f7b..d0c7b28 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -528,6 +528,14 @@ GLOBAL(\label) ALIGN GLOBAL(stub32_clone) leaq sys_clone(%rip), %rax + /* + * 32-bit clone API is clone(..., int tls_val, int *child_tidptr). + * 64-bit clone API is clone(..., int *child_tidptr, int tls_val). + * Native 64-bit kernel's sys_clone() implements the latter. + * We need to swap args here. But since tls_val is in fact ignored + * by sys_clone(), we can get away with an assignment + * (arg4 = arg5) instead of a full swap: + */ mov %r8, %rcx jmp ia32_ptregs_common -- cgit v1.1 From 7a5a9824c18f93415944c997dc6bb8eecfddd2e7 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 3 Jun 2015 15:58:50 +0200 Subject: x86/asm/entry/32: Remove unnecessary optimization in stub32_clone Really swap arguments #4 and #5 in stub32_clone instead of "optimizing" it into a move. Yes, tls_val is currently unused. Yes, on some CPUs XCHG is a little bit more expensive than MOV. But a cycle or two on an expensive syscall like clone() is way below noise floor, and this optimization is simply not worth the obfuscation of logic. [ There's also ongoing work on the clone() ABI by Josh Triplett that will depend on this change later on. ] Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Josh Triplett Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433339930-20880-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/ia32entry.S | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S index d0c7b28..9558dac 100644 --- a/arch/x86/entry/ia32entry.S +++ b/arch/x86/entry/ia32entry.S @@ -529,14 +529,13 @@ GLOBAL(\label) GLOBAL(stub32_clone) leaq sys_clone(%rip), %rax /* - * 32-bit clone API is clone(..., int tls_val, int *child_tidptr). - * 64-bit clone API is clone(..., int *child_tidptr, int tls_val). - * Native 64-bit kernel's sys_clone() implements the latter. - * We need to swap args here. But since tls_val is in fact ignored - * by sys_clone(), we can get away with an assignment - * (arg4 = arg5) instead of a full swap: + * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). + * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). + * + * The native 64-bit kernel's sys_clone() implements the latter, + * so we need to swap arguments here before calling it: */ - mov %r8, %rcx + xchg %r8, %rcx jmp ia32_ptregs_common ALIGN -- cgit v1.1 From 138bd56a2164c65f68042ba580d88ab70c036fd1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jun 2015 14:11:49 +0200 Subject: x86/asm/entry/64/compat: Rename ia32entry.S -> entry_64_compat.S So we now have the following system entry code related files, which define the following system call instruction and other entry paths: entry_32.S # 32-bit binaries on 32-bit kernels entry_64.S # 64-bit binaries on 64-bit kernels entry_64_compat.S # 32-bit binaries on 64-bit kernels Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Josh Triplett Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 2 +- arch/x86/entry/entry_64.S | 2 +- arch/x86/entry/entry_64_compat.S | 547 +++++++++++++++++++++++++++++++++++++++ arch/x86/entry/ia32entry.S | 547 --------------------------------------- 4 files changed, 549 insertions(+), 549 deletions(-) create mode 100644 arch/x86/entry/entry_64_compat.S delete mode 100644 arch/x86/entry/ia32entry.S (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index b93cce1..7a14497 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -6,5 +6,5 @@ obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += vdso/ obj-y += vsyscall/ -obj-$(CONFIG_IA32_EMULATION) += ia32entry.o syscall_32.o +obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3177066..4cf3dd3 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -516,7 +516,7 @@ ENTRY(ret_from_fork) /* * By the time we get here, we have no idea whether our pt_regs, * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the ia32entry paths. + * the slow path, or one of the 32-bit compat paths. * Use IRET code path to return, since it can safely handle * all of the above. 
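Returning to the two stub32_clone changes above: the argument-order mismatch they deal with is easier to see with the prototypes side by side. The declarations below are simplified illustrations based on the patch's own comment (the real kernel entry points are SYSCALL_DEFINE5 macros, and the names clone_ia32, clone_native and clone_compat_shim are invented for the example):

        /* 32-bit ABI, as the compat entry code receives it: */
        long clone_ia32(unsigned long flags, unsigned long newsp,
                        int *parent_tidptr, unsigned long tls_val, int *child_tidptr);

        /* Native 64-bit sys_clone(): tls_val and child_tidptr are swapped: */
        long clone_native(unsigned long flags, unsigned long newsp,
                          int *parent_tidptr, int *child_tidptr, unsigned long tls_val);

        /*
         * stub32_clone's 'xchg %r8, %rcx' amounts to exactly this shim: after the
         * compat argument conversion, arg4 is in %rcx and arg5 in %r8, so swapping
         * those two registers swaps parameters #4 and #5 before the common tail.
         */
        static long clone_compat_shim(unsigned long flags, unsigned long newsp,
                                      int *parent_tidptr, unsigned long tls_val,
                                      int *child_tidptr)
        {
                return clone_native(flags, newsp, parent_tidptr, child_tidptr, tls_val);
        }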
*/ diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S new file mode 100644 index 0000000..9558dac --- /dev/null +++ b/arch/x86/entry/entry_64_compat.S @@ -0,0 +1,547 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. + */ +#include "calling.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +# define sysexit_audit ia32_ret_from_sys_call +# define sysretl_audit ia32_ret_from_sys_call +#endif + + .section .entry.text, "ax" + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret32) + swapgs + sysretl +ENDPROC(native_usergs_sysret32) +#endif + +/* + * 32-bit SYSENTER instruction entry. + * + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. + * IF and VM in rflags are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old rip (!!!) and rflags. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_sysenter_target) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %ebp, %ebp + movl %eax, %eax + + movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %rbp /* pt_regs->sp */ + pushfq /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + cld + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ + + /* + * no need to do an access_ok check here because rbp has been + * 32-bit zero extended + */ + ASM_STAC +1: movl (%rbp), %ebp + _ASM_EXTABLE(1b, ia32_badarg) + ASM_CLAC + + /* + * Sysenter doesn't filter flags, so we need to clear NT + * ourselves. To save a few cycles, we can check whether + * NT was set instead of doing an unconditional popfq. 
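In other words, the NT test that follows keeps the common case cheap: only when the (rare) NT flag is found in the saved flags does the code take the slow path and reload EFLAGS with a known-good value via popfq. A conceptual C model, purely illustrative; reload_eflags() is a placeholder for the 'pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED); popfq' pair in sysenter_fix_flags:

        static void reload_eflags(unsigned long flags);   /* placeholder for the popfq */

        static void maybe_fix_nt(struct pt_regs *regs)    /* mirrors the testl + jnz   */
        {
                if (regs->flags & X86_EFLAGS_NT)           /* NT survived SYSENTER?    */
                        reload_eflags(X86_EFLAGS_IF | X86_EFLAGS_FIXED);
                /* common case: NT clear, no popfq executed */
        }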
+ */ + testl $X86_EFLAGS_NT, EFLAGS(%rsp) + jnz sysenter_fix_flags +sysenter_flags_fixed: + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysenter_tracesys + +sysenter_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ +sysenter_dispatch: + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysexit_audit +sysexit_from_sys_call: + /* + * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an + * NMI between STI and SYSEXIT has poorly specified behavior, + * and and NMI followed by an IRQ with usergs is fatal. So + * we just pretend we're using SYSEXIT but we really use + * SYSRETL instead. + * + * This code path is still called 'sysexit' because it pairs + * with 'sysenter' and it uses the SYSENTER calling convention. + */ + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RIP(%rsp), %ecx /* User %eip */ + RESTORE_RSI_RDI + xorl %edx, %edx /* Do not leak kernel information */ + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + movl EFLAGS(%rsp), %r11d /* User eflags */ + TRACE_IRQS_ON + + /* + * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, + * since it avoids a dicey window with interrupts enabled. + */ + movl RSP(%rsp), %esp + + /* + * USERGS_SYSRET32 does: + * gsbase = user's gs base + * eip = ecx + * rflags = r11 + * cs = __USER32_CS + * ss = __USER_DS + * + * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: + * + * pop %ebp + * pop %edx + * pop %ecx + * + * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to + * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's + * address (already known to user code), and R12-R15 are + * callee-saved and therefore don't contain any interesting + * kernel data. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + movl %esi, %r8d /* 5th arg: 4th syscall arg */ + movl %ecx, %r9d /* swap with edx */ + movl %edx, %ecx /* 4th arg: 3rd syscall arg */ + movl %r9d, %edx /* 3rd arg: 2nd syscall arg */ + movl %ebx, %esi /* 2nd arg: 1st syscall arg */ + movl %eax, %edi /* 1st arg: syscall number */ + call __audit_syscall_entry + movl ORIG_RAX(%rsp), %eax /* reload syscall number */ + movl %ebx, %edi /* reload 1st syscall arg */ + movl RCX(%rsp), %esi /* reload 2nd syscall arg */ + movl RDX(%rsp), %edx /* reload 3rd syscall arg */ + movl RSI(%rsp), %ecx /* reload 4th syscall arg */ + movl RDI(%rsp), %r8d /* reload 5th syscall arg */ + .endm + + .macro auditsys_exit exit + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_ret_from_sys_call + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movl %eax, %esi /* second arg, syscall return value */ + cmpl $-MAX_ERRNO, %eax /* is it an error ? 
*/ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ + movzbl %al, %edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movq RAX(%rsp), %rax /* reload syscall return value */ + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz \exit + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + jmp int_with_check + .endm + +sysenter_auditsys: + auditsys_entry_common + movl %ebp, %r9d /* reload 6th syscall arg */ + jmp sysenter_dispatch + +sysexit_audit: + auditsys_exit sysexit_from_sys_call +#endif + +sysenter_fix_flags: + pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + popfq + jmp sysenter_flags_fixed + +sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz sysenter_auditsys +#endif + SAVE_EXTRA_REGS + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + + /* Reload arg registers from stack. (see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + + RESTORE_EXTRA_REGS + jmp sysenter_do_call +ENDPROC(ia32_sysenter_target) + +/* + * 32-bit SYSCALL instruction entry. + * + * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Note: rflags saving+masking-with-MSR happens only in Long mode + * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * + * Arguments: + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_cstar_target) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. 
+ */ + SWAPGS_UNSAFE_STACK + movl %esp, %r8d + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax, %eax + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rbp /* pt_regs->cx */ + movl %ebp, %ecx + pushq $-ENOSYS /* pt_regs->ax */ + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ + + /* + * No need to do an access_ok check here because r8 has been + * 32-bit zero extended: + */ + ASM_STAC +1: movl (%r8), %ebp + _ASM_EXTABLE(1b, ia32_badarg) + ASM_CLAC + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz cstar_tracesys + +cstar_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + +cstar_dispatch: + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysretl_audit + +sysretl_from_sys_call: + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RCX(%rsp), %ebp + RESTORE_RSI_RDI_RDX + movl RIP(%rsp), %ecx + movl EFLAGS(%rsp), %r11d + xorq %r10, %r10 + xorq %r9, %r9 + xorq %r8, %r8 + TRACE_IRQS_ON + movl RSP(%rsp), %esp + /* + * 64-bit->32-bit SYSRET restores eip from ecx, + * eflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * (Note: 32-bit->32-bit SYSRET is different: since r11 + * does not exist, it merely sets eflags.IF=1). + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we must + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: + auditsys_entry_common + movl %ebp, %r9d /* reload 6th syscall arg */ + jmp cstar_dispatch + +sysretl_audit: + auditsys_exit sysretl_from_sys_call +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz cstar_auditsys +#endif + SAVE_EXTRA_REGS + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + + /* Reload arg registers from stack. 
(see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + + RESTORE_EXTRA_REGS + jmp cstar_do_call +END(ia32_cstar_target) + +ia32_badarg: + ASM_CLAC + movq $-EFAULT, %rax + jmp ia32_sysret + +ia32_ret_from_sys_call: + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + jmp int_ret_from_sys_call + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 (note: not saved in the stack frame, should not be touched) + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except eax must be saved (but ptrace may violate that). + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts off. + */ + +ENTRY(ia32_syscall) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + PARAVIRT_ADJUST_EXCEPTION_FRAME + SWAPGS + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax, %eax + + /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq $0 /* pt_regs->r8 */ + pushq $0 /* pt_regs->r9 */ + pushq $0 /* pt_regs->r10 */ + pushq $0 /* pt_regs->r11 */ + cld + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_tracesys + +ia32_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + + call *ia32_sys_call_table(, %rax, 8) /* RIP relative */ + +ia32_sysret: + movq %rax, RAX(%rsp) +1: + jmp int_ret_from_sys_call + +ia32_tracesys: + SAVE_EXTRA_REGS + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + /* + * Reload arg registers from stack in case ptrace changed them. + * Don't reload %eax because syscall_trace_enter() returned + * the %rax value we should see. But do truncate it to 32 bits. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. + */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + RESTORE_EXTRA_REGS + jmp ia32_do_call +END(ia32_syscall) + + .macro PTREGSCALL label, func + ALIGN +GLOBAL(\label) + leaq \func(%rip), %rax + jmp ia32_ptregs_common + .endm + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn + PTREGSCALL stub32_sigreturn, sys32_sigreturn + PTREGSCALL stub32_fork, sys_fork + PTREGSCALL stub32_vfork, sys_vfork + + ALIGN +GLOBAL(stub32_clone) + leaq sys_clone(%rip), %rax + /* + * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). 
+ * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). + * + * The native 64-bit kernel's sys_clone() implements the latter, + * so we need to swap arguments here before calling it: + */ + xchg %r8, %rcx + jmp ia32_ptregs_common + + ALIGN +ia32_ptregs_common: + SAVE_EXTRA_REGS 8 + call *%rax + RESTORE_EXTRA_REGS 8 + ret +END(ia32_ptregs_common) diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S deleted file mode 100644 index 9558dac..0000000 --- a/arch/x86/entry/ia32entry.S +++ /dev/null @@ -1,547 +0,0 @@ -/* - * Compatibility mode system call entry point for x86-64. - * - * Copyright 2000-2002 Andi Kleen, SuSE Labs. - */ -#include "calling.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -# define sysexit_audit ia32_ret_from_sys_call -# define sysretl_audit ia32_ret_from_sys_call -#endif - - .section .entry.text, "ax" - -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret32) - swapgs - sysretl -ENDPROC(native_usergs_sysret32) -#endif - -/* - * 32-bit SYSENTER instruction entry. - * - * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. - * IF and VM in rflags are cleared (IOW: interrupts are off). - * SYSENTER does not save anything on the stack, - * and does not save old rip (!!!) and rflags. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp user stack - * 0(%ebp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_sysenter_target) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %ebp, %ebp - movl %eax, %eax - - movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d - - /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ - pushq %rbp /* pt_regs->sp */ - pushfq /* pt_regs->flags */ - pushq $__USER32_CS /* pt_regs->cs */ - pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - cld - sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ - - /* - * no need to do an access_ok check here because rbp has been - * 32-bit zero extended - */ - ASM_STAC -1: movl (%rbp), %ebp - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC - - /* - * Sysenter doesn't filter flags, so we need to clear NT - * ourselves. To save a few cycles, we can check whether - * NT was set instead of doing an unconditional popfq. 
- */ - testl $X86_EFLAGS_NT, EFLAGS(%rsp) - jnz sysenter_fix_flags -sysenter_flags_fixed: - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysenter_tracesys - -sysenter_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ -sysenter_dispatch: - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysexit_audit -sysexit_from_sys_call: - /* - * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an - * NMI between STI and SYSEXIT has poorly specified behavior, - * and and NMI followed by an IRQ with usergs is fatal. So - * we just pretend we're using SYSEXIT but we really use - * SYSRETL instead. - * - * This code path is still called 'sysexit' because it pairs - * with 'sysenter' and it uses the SYSENTER calling convention. - */ - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RIP(%rsp), %ecx /* User %eip */ - RESTORE_RSI_RDI - xorl %edx, %edx /* Do not leak kernel information */ - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r10 - movl EFLAGS(%rsp), %r11d /* User eflags */ - TRACE_IRQS_ON - - /* - * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, - * since it avoids a dicey window with interrupts enabled. - */ - movl RSP(%rsp), %esp - - /* - * USERGS_SYSRET32 does: - * gsbase = user's gs base - * eip = ecx - * rflags = r11 - * cs = __USER32_CS - * ss = __USER_DS - * - * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: - * - * pop %ebp - * pop %edx - * pop %ecx - * - * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to - * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's - * address (already known to user code), and R12-R15 are - * callee-saved and therefore don't contain any interesting - * kernel data. - */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL - .macro auditsys_entry_common - movl %esi, %r8d /* 5th arg: 4th syscall arg */ - movl %ecx, %r9d /* swap with edx */ - movl %edx, %ecx /* 4th arg: 3rd syscall arg */ - movl %r9d, %edx /* 3rd arg: 2nd syscall arg */ - movl %ebx, %esi /* 2nd arg: 1st syscall arg */ - movl %eax, %edi /* 1st arg: syscall number */ - call __audit_syscall_entry - movl ORIG_RAX(%rsp), %eax /* reload syscall number */ - movl %ebx, %edi /* reload 1st syscall arg */ - movl RCX(%rsp), %esi /* reload 2nd syscall arg */ - movl RDX(%rsp), %edx /* reload 3rd syscall arg */ - movl RSI(%rsp), %ecx /* reload 4th syscall arg */ - movl RDI(%rsp), %r8d /* reload 5th syscall arg */ - .endm - - .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movl %eax, %esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO, %eax /* is it an error ? 
*/ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ - movzbl %al, %edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movq RAX(%rsp), %rax /* reload syscall return value */ - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz \exit - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - jmp int_with_check - .endm - -sysenter_auditsys: - auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch - -sysexit_audit: - auditsys_exit sysexit_from_sys_call -#endif - -sysenter_fix_flags: - pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) - popfq - jmp sysenter_flags_fixed - -sysenter_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz sysenter_auditsys -#endif - SAVE_EXTRA_REGS - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - - /* Reload arg registers from stack. (see sysenter_tracesys) */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - - RESTORE_EXTRA_REGS - jmp sysenter_do_call -ENDPROC(ia32_sysenter_target) - -/* - * 32-bit SYSCALL instruction entry. - * - * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. - * - * Note: rflags saving+masking-with-MSR happens only in Long mode - * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). - * Don't get confused: rflags saving+masking depends on Long Mode Active bit - * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes - * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). - * - * Arguments: - * eax system call number - * ecx return address - * ebx arg1 - * ebp arg2 (note: not saved in the stack frame, should not be touched) - * edx arg3 - * esi arg4 - * edi arg5 - * esp user stack - * 0(%esp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_cstar_target) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. 
- */ - SWAPGS_UNSAFE_STACK - movl %esp, %r8d - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax, %eax - - /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ - pushq %r8 /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER32_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rbp /* pt_regs->cx */ - movl %ebp, %ecx - pushq $-ENOSYS /* pt_regs->ax */ - sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ - - /* - * No need to do an access_ok check here because r8 has been - * 32-bit zero extended: - */ - ASM_STAC -1: movl (%r8), %ebp - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz cstar_tracesys - -cstar_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ - -cstar_dispatch: - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysretl_audit - -sysretl_from_sys_call: - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RCX(%rsp), %ebp - RESTORE_RSI_RDI_RDX - movl RIP(%rsp), %ecx - movl EFLAGS(%rsp), %r11d - xorq %r10, %r10 - xorq %r9, %r9 - xorq %r8, %r8 - TRACE_IRQS_ON - movl RSP(%rsp), %esp - /* - * 64-bit->32-bit SYSRET restores eip from ecx, - * eflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * (Note: 32-bit->32-bit SYSRET is different: since r11 - * does not exist, it merely sets eflags.IF=1). - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we must - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. - */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL -cstar_auditsys: - auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ - jmp cstar_dispatch - -sysretl_audit: - auditsys_exit sysretl_from_sys_call -#endif - -cstar_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz cstar_auditsys -#endif - SAVE_EXTRA_REGS - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - - /* Reload arg registers from stack. 
(see sysenter_tracesys) */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - - RESTORE_EXTRA_REGS - jmp cstar_do_call -END(ia32_cstar_target) - -ia32_badarg: - ASM_CLAC - movq $-EFAULT, %rax - jmp ia32_sysret - -ia32_ret_from_sys_call: - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - jmp int_ret_from_sys_call - -/* - * Emulated IA32 system calls via int 0x80. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp arg6 (note: not saved in the stack frame, should not be touched) - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except eax must be saved (but ptrace may violate that). - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. - */ - -ENTRY(ia32_syscall) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - PARAVIRT_ADJUST_EXCEPTION_FRAME - SWAPGS - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax, %eax - - /* Construct struct pt_regs on stack (iret frame is already on stack) */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq $0 /* pt_regs->r8 */ - pushq $0 /* pt_regs->r9 */ - pushq $0 /* pt_regs->r10 */ - pushq $0 /* pt_regs->r11 */ - cld - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_tracesys - -ia32_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - - call *ia32_sys_call_table(, %rax, 8) /* RIP relative */ - -ia32_sysret: - movq %rax, RAX(%rsp) -1: - jmp int_ret_from_sys_call - -ia32_tracesys: - SAVE_EXTRA_REGS - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - /* - * Reload arg registers from stack in case ptrace changed them. - * Don't reload %eax because syscall_trace_enter() returned - * the %rax value we should see. But do truncate it to 32 bits. - * If it's -1 to make us punt the syscall, then (u32)-1 is still - * an appropriately invalid value. - */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - RESTORE_EXTRA_REGS - jmp ia32_do_call -END(ia32_syscall) - - .macro PTREGSCALL label, func - ALIGN -GLOBAL(\label) - leaq \func(%rip), %rax - jmp ia32_ptregs_common - .endm - - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn - PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_fork, sys_fork - PTREGSCALL stub32_vfork, sys_vfork - - ALIGN -GLOBAL(stub32_clone) - leaq sys_clone(%rip), %rax - /* - * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). 
- * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). - * - * The native 64-bit kernel's sys_clone() implements the latter, - * so we need to swap arguments here before calling it: - */ - xchg %r8, %rcx - jmp ia32_ptregs_common - - ALIGN -ia32_ptregs_common: - SAVE_EXTRA_REGS 8 - call *%rax - RESTORE_EXTRA_REGS 8 - ret -END(ia32_ptregs_common) -- cgit v1.1 From 2cd23553b488589f287457b7396470f5e3c40698 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 08:28:07 +0200 Subject: x86/asm/entry: Rename compat syscall entry points Rename the following system call entry points: ia32_cstar_target -> entry_SYSCALL_compat ia32_syscall -> entry_INT80_compat The generic naming scheme for x86 system call entry points is: entry_MNEMONIC_qualifier where 'qualifier' is one of _32, _64 or _compat. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 8 ++++---- arch/x86/entry/syscall_32.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 9558dac..8058892 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -288,7 +288,7 @@ ENDPROC(ia32_sysenter_target) * path below. We set up a complete hardware stack frame to share code * with the int 0x80 path. */ -ENTRY(ia32_cstar_target) +ENTRY(entry_SYSCALL_compat) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -409,7 +409,7 @@ cstar_tracesys: RESTORE_EXTRA_REGS jmp cstar_do_call -END(ia32_cstar_target) +END(entry_SYSCALL_compat) ia32_badarg: ASM_CLAC @@ -445,7 +445,7 @@ ia32_ret_from_sys_call: * Assumes it is only called from user space and entered with interrupts off. */ -ENTRY(ia32_syscall) +ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -511,7 +511,7 @@ ia32_tracesys: movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS jmp ia32_do_call -END(ia32_syscall) +END(entry_INT80_compat) .macro PTREGSCALL label, func ALIGN diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 3777189..e398d03 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -10,7 +10,7 @@ #else #define SYM(sym, compat) sym #define ia32_sys_call_table sys_call_table -#define __NR_ia32_syscall_max __NR_syscall_max +#define __NR_entry_INT80_compat_max __NR_syscall_max #endif #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; @@ -23,11 +23,11 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_entry_INT80_compat_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, + [0 ... 
__NR_entry_INT80_compat_max] = &sys_ni_syscall, #include }; -- cgit v1.1 From 4c8cd0c50d0b1559727bf0ec7ff27caeba2dfe09 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 08:33:56 +0200 Subject: x86/asm/entry: Untangle 'ia32_sysenter_target' into two entry points: entry_SYSENTER_32 and entry_SYSENTER_compat So the SYSENTER instruction is pretty quirky and it has different behavior depending on bitness and CPU maker. Yet we create a false sense of coherency by naming it 'ia32_sysenter_target' in both of the cases. Split the name into its two uses: ia32_sysenter_target (32) -> entry_SYSENTER_32 ia32_sysenter_target (64) -> entry_SYSENTER_compat As per the generic naming scheme for x86 system call entry points: entry_MNEMONIC_qualifier where 'qualifier' is one of _32, _64 or _compat. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 10 +++++----- arch/x86/entry/entry_64_compat.S | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 0ac73de..a65f46c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -307,7 +307,7 @@ END(resume_kernel) the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ # sysenter call handler stub -ENTRY(ia32_sysenter_target) +ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp),%esp sysenter_past_esp: /* @@ -412,7 +412,7 @@ sysexit_audit: .popsection _ASM_EXTABLE(1b,2b) PTGS_TO_GS_EX -ENDPROC(ia32_sysenter_target) +ENDPROC(entry_SYSENTER_32) # system call handler stub ENTRY(system_call) @@ -1135,7 +1135,7 @@ END(page_fault) ENTRY(debug) ASM_CLAC - cmpl $ia32_sysenter_target,(%esp) + cmpl $entry_SYSENTER_32,(%esp) jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: @@ -1165,7 +1165,7 @@ ENTRY(nmi) popl %eax je nmi_espfix_stack #endif - cmpl $ia32_sysenter_target,(%esp) + cmpl $entry_SYSENTER_32,(%esp) je nmi_stack_fixup pushl %eax movl %esp,%eax @@ -1176,7 +1176,7 @@ ENTRY(nmi) cmpl $(THREAD_SIZE-20),%eax popl %eax jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) + cmpl $entry_SYSENTER_32,12(%esp) je nmi_debug_stack_check nmi_stack_correct: pushl %eax diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 8058892..59840e3 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -57,7 +57,7 @@ ENDPROC(native_usergs_sysret32) * path below. We set up a complete hardware stack frame to share code * with the int 0x80 path. */ -ENTRY(ia32_sysenter_target) +ENTRY(entry_SYSENTER_compat) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -256,7 +256,7 @@ sysenter_tracesys: RESTORE_EXTRA_REGS jmp sysenter_do_call -ENDPROC(ia32_sysenter_target) +ENDPROC(entry_SYSENTER_compat) /* * 32-bit SYSCALL instruction entry. -- cgit v1.1 From b2502b418e63fcde0fe1857732a476b5aa3789b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 08:42:03 +0200 Subject: x86/asm/entry: Untangle 'system_call' into two entry points: entry_SYSCALL_64 and entry_INT80_32 The 'system_call' entry points differ starkly between native 32-bit and 64-bit kernels: on 32-bit kernels it defines the INT 0x80 entry point, while on 64-bit it's the SYSCALL entry point. 
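One note on the ia32_sys_call_table definition touched in the syscall_32.c hunk above: it relies on GCC's range-designated initializers, so every slot first defaults to sys_ni_syscall and the #included, script-generated header then overrides the populated entries (later designators win). A toy, standalone illustration of the same pattern follows; the names and values here are made up and it is not kernel code:

        #include <stdio.h>

        typedef long (*sys_call_ptr_t)(void);

        static long sys_ni_syscall(void) { return -38; }   /* i.e. -ENOSYS */
        static long sys_demo(void)       { return 0;   }

        #define NR_MAX 5

        static const sys_call_ptr_t table[NR_MAX + 1] = {
                [0 ... NR_MAX] = sys_ni_syscall,    /* GNU range designator: default all  */
                [2]            = sys_demo,          /* later entries override the default */
        };

        int main(void)
        {
                printf("%ld %ld\n", table[1](), table[2]());    /* prints: -38 0 */
                return 0;
        }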
This is pretty confusing when looking at generic code, and it also obscures the nature of the entry point at the assembly level. So unangle this by splitting the name into its two uses: system_call (32) -> entry_INT80_32 system_call (64) -> entry_SYSCALL_64 As per the generic naming scheme for x86 system call entry points: entry_MNEMONIC_qualifier where 'qualifier' is one of _32, _64 or _compat. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/entry/entry_64.S | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a65f46c..d594610 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -415,7 +415,7 @@ sysexit_audit: ENDPROC(entry_SYSENTER_32) # system call handler stub -ENTRY(system_call) +ENTRY(entry_INT80_32) ASM_CLAC pushl %eax # save orig_eax SAVE_ALL @@ -508,7 +508,7 @@ ldt_ss: lss (%esp), %esp /* switch to espfix segment */ jmp restore_nocheck #endif -ENDPROC(system_call) +ENDPROC(entry_INT80_32) # perform work that needs to be done immediately before resumption ALIGN diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4cf3dd3..e1852c4 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -137,7 +137,7 @@ ENDPROC(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ -ENTRY(system_call) +ENTRY(entry_SYSCALL_64) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -149,7 +149,7 @@ ENTRY(system_call) * after the swapgs, so that it can do the swapgs * for the guest and jump here on syscall. 
*/ -GLOBAL(system_call_after_swapgs) +GLOBAL(entry_SYSCALL_64_after_swapgs) movq %rsp,PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp @@ -182,7 +182,7 @@ GLOBAL(system_call_after_swapgs) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz tracesys -system_call_fastpath: +entry_SYSCALL_64_fastpath: #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max,%rax #else @@ -246,7 +246,7 @@ tracesys: jnz tracesys_phase2 /* if needed, run the slow path */ RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ movq ORIG_RAX(%rsp), %rax - jmp system_call_fastpath /* and return to the fast path */ + jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ tracesys_phase2: SAVE_EXTRA_REGS @@ -411,7 +411,7 @@ syscall_return_via_sysret: opportunistic_sysret_failed: SWAPGS jmp restore_c_regs_and_iret -END(system_call) +END(entry_SYSCALL_64) .macro FORK_LIKE func -- cgit v1.1 From a49976d14f780942dafafbbf16f891c27d385ea0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 09:49:11 +0200 Subject: x86/asm/entry/32: Clean up entry_32.S Make the 32-bit syscall entry code a bit more readable: - use consistent assembly coding style similar to entry_64.S - remove old comments that are not true anymore - eliminate whitespace noise - use consistent vertical spacing - fix various comments No code changed: # arch/x86/entry/entry_32.o: text data bss dec hex filename 6025 0 0 6025 1789 entry_32.o.before 6025 0 0 6025 1789 entry_32.o.after md5: f3fa16b2b0dca804f052deb6b30ba6cb entry_32.o.before.asm f3fa16b2b0dca804f052deb6b30ba6cb entry_32.o.after.asm Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 1141 ++++++++++++++++++++++----------------------- 1 file changed, 570 insertions(+), 571 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index d594610..edd7aad 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1,23 +1,12 @@ /* + * Copyright (C) 1991,1992 Linus Torvalds * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. - * This also contains the timer-interrupt handler, as well as all interrupts - * and faults that can result in a task-switch. - * - * NOTE: This code handles signal-recognition, which happens every time - * after a timer-interrupt and after each system call. - * - * I changed all the .align's to 4 (16 byte alignment), as that's faster - * on a 486. + * entry_32.S contains the system-call and low-level fault and trap handling routines. * * Stack layout in 'syscall_exit': - * ptrace needs to have all regs on the stack. - * if the order here is changed, it needs to be - * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace needs to have all registers on the stack. + * If the order here is changed, it needs to be + * updated in fork.c:copy_process(), signal.c:do_signal(), * ptrace.c and ptrace.h * * 0(%esp) - %ebx @@ -37,8 +26,6 @@ * 38(%esp) - %eflags * 3C(%esp) - %oldesp * 40(%esp) - %oldss - * - * "current" is in register %ebx during any slow entries. */ #include @@ -61,11 +48,11 @@ /* Avoid __ASSEMBLER__'ifying just for this. 
*/ #include #define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 +#define __AUDIT_ARCH_LE 0x40000000 #ifndef CONFIG_AUDITSYSCALL -#define sysenter_audit syscall_trace_entry -#define sysexit_audit syscall_exit_work +# define sysenter_audit syscall_trace_entry +# define sysexit_audit syscall_exit_work #endif .section .entry.text, "ax" @@ -84,16 +71,16 @@ */ #ifdef CONFIG_PREEMPT -#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else -#define preempt_stop(clobbers) -#define resume_kernel restore_all +# define preempt_stop(clobbers) +# define resume_kernel restore_all #endif .macro TRACE_IRQS_IRET #ifdef CONFIG_TRACE_IRQFLAGS - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? - jz 1f + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off? + jz 1f TRACE_IRQS_ON 1: #endif @@ -112,10 +99,10 @@ /* unfortunately push/pop can't be no-op */ .macro PUSH_GS - pushl $0 + pushl $0 .endm .macro POP_GS pop=0 - addl $(4 + \pop), %esp + addl $(4 + \pop), %esp .endm .macro POP_GS_EX .endm @@ -135,119 +122,119 @@ #else /* CONFIG_X86_32_LAZY_GS */ .macro PUSH_GS - pushl %gs + pushl %gs .endm .macro POP_GS pop=0 -98: popl %gs +98: popl %gs .if \pop <> 0 add $\pop, %esp .endif .endm .macro POP_GS_EX .pushsection .fixup, "ax" -99: movl $0, (%esp) - jmp 98b +99: movl $0, (%esp) + jmp 98b .popsection - _ASM_EXTABLE(98b,99b) + _ASM_EXTABLE(98b, 99b) .endm .macro PTGS_TO_GS -98: mov PT_GS(%esp), %gs +98: mov PT_GS(%esp), %gs .endm .macro PTGS_TO_GS_EX .pushsection .fixup, "ax" -99: movl $0, PT_GS(%esp) - jmp 98b +99: movl $0, PT_GS(%esp) + jmp 98b .popsection - _ASM_EXTABLE(98b,99b) + _ASM_EXTABLE(98b, 99b) .endm .macro GS_TO_REG reg - movl %gs, \reg + movl %gs, \reg .endm .macro REG_TO_PTGS reg - movl \reg, PT_GS(%esp) + movl \reg, PT_GS(%esp) .endm .macro SET_KERNEL_GS reg - movl $(__KERNEL_STACK_CANARY), \reg - movl \reg, %gs + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs .endm -#endif /* CONFIG_X86_32_LAZY_GS */ +#endif /* CONFIG_X86_32_LAZY_GS */ .macro SAVE_ALL cld PUSH_GS - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - movl $(__USER_DS), %edx - movl %edx, %ds - movl %edx, %es - movl $(__KERNEL_PERCPU), %edx - movl %edx, %fs + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs SET_KERNEL_GS %edx .endm .macro RESTORE_INT_REGS - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax .endm .macro RESTORE_REGS pop=0 RESTORE_INT_REGS -1: popl %ds -2: popl %es -3: popl %fs +1: popl %ds +2: popl %es +3: popl %fs POP_GS \pop .pushsection .fixup, "ax" -4: movl $0, (%esp) - jmp 1b -5: movl $0, (%esp) - jmp 2b -6: movl $0, (%esp) - jmp 3b +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b .popsection - _ASM_EXTABLE(1b,4b) - _ASM_EXTABLE(2b,5b) - _ASM_EXTABLE(3b,6b) + _ASM_EXTABLE(1b, 4b) + _ASM_EXTABLE(2b, 5b) + _ASM_EXTABLE(3b, 6b) POP_GS_EX .endm ENTRY(ret_from_fork) - pushl %eax - call schedule_tail + pushl %eax + call schedule_tail GET_THREAD_INFO(%ebp) - popl %eax - pushl $0x0202 # Reset kernel eflags + popl %eax + pushl $0x0202 # Reset kernel eflags popfl - jmp 
syscall_exit + jmp syscall_exit END(ret_from_fork) ENTRY(ret_from_kernel_thread) - pushl %eax - call schedule_tail + pushl %eax + call schedule_tail GET_THREAD_INFO(%ebp) - popl %eax - pushl $0x0202 # Reset kernel eflags + popl %eax + pushl $0x0202 # Reset kernel eflags popfl - movl PT_EBP(%esp),%eax - call *PT_EBX(%esp) - movl $0,PT_EAX(%esp) - jmp syscall_exit + movl PT_EBP(%esp), %eax + call *PT_EBX(%esp) + movl $0, PT_EAX(%esp) + jmp syscall_exit ENDPROC(ret_from_kernel_thread) /* @@ -264,62 +251,65 @@ ret_from_exception: ret_from_intr: GET_THREAD_INFO(%ebp) #ifdef CONFIG_VM86 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax #else /* * We can be coming here from child spawned by kernel_thread(). */ - movl PT_CS(%esp), %eax - andl $SEGMENT_RPL_MASK, %eax + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax #endif - cmpl $USER_RPL, %eax - jb resume_kernel # not returning to v8086 or userspace + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done on - # int/exception return? - jne work_pending - jmp restore_all + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) need_resched: - cmpl $0,PER_CPU_VAR(__preempt_count) - jnz restore_all - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all - call preempt_schedule_irq - jmp need_resched + cmpl $0, PER_CPU_VAR(__preempt_count) + jnz restore_all + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched END(resume_kernel) #endif -/* SYSENTER_RETURN points to after the "sysenter" instruction in - the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ +/* + * SYSENTER_RETURN points to after the SYSENTER instruction + * in the vsyscall page. See vsyscall-sysentry.S, which defines + * the symbol. + */ - # sysenter call handler stub + # SYSENTER call handler stub ENTRY(entry_SYSENTER_32) - movl TSS_sysenter_sp0(%esp),%esp + movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: /* * Interrupts are disabled here, but we can't trace it until * enough kernel state to call TRACE_IRQS_OFF can be called - but * we immediately enable interrupts at that point anyway. */ - pushl $__USER_DS - pushl %ebp + pushl $__USER_DS + pushl %ebp pushfl - orl $X86_EFLAGS_IF, (%esp) - pushl $__USER_CS + orl $X86_EFLAGS_IF, (%esp) + pushl $__USER_CS /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary: TI_sysenter_return @@ -328,9 +318,9 @@ sysenter_past_esp: * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; * and THREAD_SIZE takes us to the bottom. 
*/ - pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) + pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - pushl %eax + pushl %eax SAVE_ALL ENABLE_INTERRUPTS(CLBR_NONE) @@ -338,132 +328,134 @@ sysenter_past_esp: * Load the potential sixth argument from user stack. * Careful about security. */ - cmpl $__PAGE_OFFSET-3,%ebp - jae syscall_fault + cmpl $__PAGE_OFFSET-3, %ebp + jae syscall_fault ASM_STAC -1: movl (%ebp),%ebp +1: movl (%ebp), %ebp ASM_CLAC - movl %ebp,PT_EBP(%esp) - _ASM_EXTABLE(1b,syscall_fault) + movl %ebp, PT_EBP(%esp) + _ASM_EXTABLE(1b, syscall_fault) GET_THREAD_INFO(%ebp) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz sysenter_audit + testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) + jnz sysenter_audit sysenter_do_call: - cmpl $(NR_syscalls), %eax - jae sysenter_badsys - call *sys_call_table(,%eax,4) + cmpl $(NR_syscalls), %eax + jae sysenter_badsys + call *sys_call_table(, %eax, 4) sysenter_after_call: - movl %eax,PT_EAX(%esp) + movl %eax, PT_EAX(%esp) LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx - jnz sysexit_audit + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx + jnz sysexit_audit sysenter_exit: /* if something modifies registers it must also disable sysexit */ - movl PT_EIP(%esp), %edx - movl PT_OLDESP(%esp), %ecx - xorl %ebp,%ebp + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp, %ebp TRACE_IRQS_ON -1: mov PT_FS(%esp), %fs +1: mov PT_FS(%esp), %fs PTGS_TO_GS ENABLE_INTERRUPTS_SYSEXIT #ifdef CONFIG_AUDITSYSCALL sysenter_audit: - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) - jnz syscall_trace_entry - /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ - movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ - /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ - pushl PT_ESI(%esp) /* a3: 5th arg */ - pushl PT_EDX+4(%esp) /* a2: 4th arg */ - call __audit_syscall_entry - popl %ecx /* get that remapped edx off the stack */ - popl %ecx /* get that remapped esi off the stack */ - movl PT_EAX(%esp),%eax /* reload syscall number */ - jmp sysenter_do_call + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp) + jnz syscall_trace_entry + /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ + movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ + /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ + pushl PT_ESI(%esp) /* a3: 5th arg */ + pushl PT_EDX+4(%esp) /* a2: 4th arg */ + call __audit_syscall_entry + popl %ecx /* get that remapped edx off the stack */ + popl %ecx /* get that remapped esi off the stack */ + movl PT_EAX(%esp), %eax /* reload syscall number */ + jmp sysenter_do_call sysexit_audit: - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) - movl %eax,%edx /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - setbe %al /* 1 if so, 0 if not */ - movzbl %al,%eax /* zero-extend that */ - call __audit_syscall_exit + movl %eax, %edx /* second arg, syscall return value */ + cmpl $-MAX_ERRNO, %eax /* is it an error ? 
*/ + setbe %al /* 1 if so, 0 if not */ + movzbl %al, %eax /* zero-extend that */ + call __audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - movl PT_EAX(%esp),%eax /* reload syscall return value */ - jmp sysenter_exit + movl TI_flags(%ebp), %ecx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work + movl PT_EAX(%esp), %eax /* reload syscall return value */ + jmp sysenter_exit #endif -.pushsection .fixup,"ax" -2: movl $0,PT_FS(%esp) - jmp 1b +.pushsection .fixup, "ax" +2: movl $0, PT_FS(%esp) + jmp 1b .popsection - _ASM_EXTABLE(1b,2b) + _ASM_EXTABLE(1b, 2b) PTGS_TO_GS_EX ENDPROC(entry_SYSENTER_32) # system call handler stub ENTRY(entry_INT80_32) ASM_CLAC - pushl %eax # save orig_eax + pushl %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) - # system call tracing in operation / emulation - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz syscall_trace_entry - cmpl $(NR_syscalls), %eax - jae syscall_badsys + # system call tracing in operation / emulation + testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(NR_syscalls), %eax + jae syscall_badsys syscall_call: - call *sys_call_table(,%eax,4) + call *sys_call_table(, %eax, 4) syscall_after_call: - movl %eax,PT_EAX(%esp) # store the return value + movl %eax, PT_EAX(%esp) # store the return value syscall_exit: LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx # current->work - jnz syscall_exit_work + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx # current->work + jnz syscall_exit_work restore_all: TRACE_IRQS_IRET restore_all_notrace: #ifdef CONFIG_X86_ESPFIX32 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - # Warning: PT_OLDSS(%esp) contains the wrong/random values if we - # are returning to the kernel. - # See comments in process.c:copy_thread() for details. - movb PT_OLDSS(%esp), %ah - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax - cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - je ldt_ss # returning to user-space with LDT SS + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + /* + * Warning: PT_OLDSS(%esp) contains the wrong/random values if we + * are returning to the kernel. + * See comments in process.c:copy_thread() for details. + */ + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + je ldt_ss # returning to user-space with LDT SS #endif restore_nocheck: - RESTORE_REGS 4 # skip orig_eax/error_code + RESTORE_REGS 4 # skip orig_eax/error_code irq_return: INTERRUPT_RETURN -.section .fixup,"ax" -ENTRY(iret_exc) - pushl $0 # no error code - pushl $do_iret_error - jmp error_code +.section .fixup, "ax" +ENTRY(iret_exc ) + pushl $0 # no error code + pushl $do_iret_error + jmp error_code .previous - _ASM_EXTABLE(irq_return,iret_exc) + _ASM_EXTABLE(irq_return, iret_exc) #ifdef CONFIG_X86_ESPFIX32 ldt_ss: @@ -476,8 +468,8 @@ ldt_ss: * is still available to implement the setting of the high * 16-bits in the INTERRUPT_RETURN paravirt-op. 
*/ - cmpl $0, pv_info+PARAVIRT_enabled - jne restore_nocheck + cmpl $0, pv_info+PARAVIRT_enabled + jne restore_nocheck #endif /* @@ -492,21 +484,23 @@ ldt_ss: * a base address that matches for the difference. */ #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) - mov %esp, %edx /* load kernel esp */ - mov PT_OLDESP(%esp), %eax /* load userspace esp */ - mov %dx, %ax /* eax: new kernel esp */ - sub %eax, %edx /* offset (low word is 0) */ + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ shr $16, %edx - mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ - mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl $__ESPFIX_SS - pushl %eax /* new kernel esp */ - /* Disable interrupts, but do not irqtrace this section: we + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ + /* + * Disable interrupts, but do not irqtrace this section: we * will soon execute iret and the tracer was already set to - * the irqstate after the iret */ + * the irqstate after the IRET: + */ DISABLE_INTERRUPTS(CLBR_EAX) - lss (%esp), %esp /* switch to espfix segment */ - jmp restore_nocheck + lss (%esp), %esp /* switch to espfix segment */ + jmp restore_nocheck #endif ENDPROC(entry_INT80_32) @@ -514,93 +508,93 @@ ENDPROC(entry_INT80_32) ALIGN work_pending: testb $_TIF_NEED_RESCHED, %cl - jz work_notifysig + jz work_notifysig work_resched: - call schedule + call schedule LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done other - # than syscall tracing? - jz restore_all + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? 
+ jz restore_all testb $_TIF_NEED_RESCHED, %cl - jnz work_resched + jnz work_resched -work_notifysig: # deal with pending signals and - # notify-resume requests +work_notifysig: # deal with pending signals and + # notify-resume requests #ifdef CONFIG_VM86 - testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) - movl %esp, %eax - jnz work_notifysig_v86 # returning to kernel-space or - # vm86-space + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) + movl %esp, %eax + jnz work_notifysig_v86 # returning to kernel-space or + # vm86-space 1: #else - movl %esp, %eax + movl %esp, %eax #endif TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - movb PT_CS(%esp), %bl + movb PT_CS(%esp), %bl andb $SEGMENT_RPL_MASK, %bl cmpb $USER_RPL, %bl - jb resume_kernel - xorl %edx, %edx - call do_notify_resume - jmp resume_userspace + jb resume_kernel + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace #ifdef CONFIG_VM86 ALIGN work_notifysig_v86: - pushl %ecx # save ti_flags for do_notify_resume - call save_v86_state # %eax contains pt_regs pointer - popl %ecx - movl %eax, %esp - jmp 1b + pushl %ecx # save ti_flags for do_notify_resume + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + movl %eax, %esp + jmp 1b #endif END(work_pending) # perform syscall exit tracing ALIGN syscall_trace_entry: - movl $-ENOSYS,PT_EAX(%esp) - movl %esp, %eax - call syscall_trace_enter + movl $-ENOSYS, PT_EAX(%esp) + movl %esp, %eax + call syscall_trace_enter /* What it returned is what we'll actually use. */ - cmpl $(NR_syscalls), %eax - jnae syscall_call - jmp syscall_exit + cmpl $(NR_syscalls), %eax + jnae syscall_call + jmp syscall_exit END(syscall_trace_entry) # perform syscall exit tracing ALIGN syscall_exit_work: - testl $_TIF_WORK_SYSCALL_EXIT, %ecx - jz work_pending + testl $_TIF_WORK_SYSCALL_EXIT, %ecx + jz work_pending TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call - # schedule() instead - movl %esp, %eax - call syscall_trace_leave - jmp resume_userspace + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call + # schedule() instead + movl %esp, %eax + call syscall_trace_leave + jmp resume_userspace END(syscall_exit_work) syscall_fault: ASM_CLAC GET_THREAD_INFO(%ebp) - movl $-EFAULT,PT_EAX(%esp) - jmp resume_userspace + movl $-EFAULT, PT_EAX(%esp) + jmp resume_userspace END(syscall_fault) syscall_badsys: - movl $-ENOSYS,%eax - jmp syscall_after_call + movl $-ENOSYS, %eax + jmp syscall_after_call END(syscall_badsys) sysenter_badsys: - movl $-ENOSYS,%eax - jmp sysenter_after_call + movl $-ENOSYS, %eax + jmp sysenter_after_call END(sysenter_badsys) .macro FIXUP_ESPFIX_STACK @@ -613,24 +607,24 @@ END(sysenter_badsys) */ #ifdef CONFIG_X86_ESPFIX32 /* fixup the stack */ - mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ - mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax - addl %esp, %eax /* the adjusted stack pointer */ - pushl $__KERNEL_DS - pushl %eax - lss (%esp), %esp /* switch to the normal stack segment */ + addl %esp, %eax /* the adjusted stack pointer */ + pushl $__KERNEL_DS + pushl %eax + lss (%esp), %esp /* switch to the normal stack segment */ #endif .endm .macro UNWIND_ESPFIX_STACK #ifdef CONFIG_X86_ESPFIX32 - movl %ss, %eax + movl %ss, %eax /* see if on espfix stack */ - cmpw $__ESPFIX_SS, %ax - jne 27f - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es + cmpw $__ESPFIX_SS, %ax + jne 27f + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es /* switch to 
normal stack */ FIXUP_ESPFIX_STACK 27: @@ -645,7 +639,7 @@ END(sysenter_badsys) ENTRY(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushl $(~vector+0x80) /* Note: always in signed byte range */ + pushl $(~vector+0x80) /* Note: always in signed byte range */ vector=vector+1 jmp common_interrupt .align 8 @@ -659,35 +653,34 @@ END(irq_entries_start) .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: ASM_CLAC - addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ + addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ SAVE_ALL TRACE_IRQS_OFF - movl %esp,%eax - call do_IRQ - jmp ret_from_intr + movl %esp, %eax + call do_IRQ + jmp ret_from_intr ENDPROC(common_interrupt) #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ ASM_CLAC; \ - pushl $~(nr); \ + pushl $~(nr); \ SAVE_ALL; \ TRACE_IRQS_OFF \ - movl %esp,%eax; \ - call fn; \ - jmp ret_from_intr; \ + movl %esp, %eax; \ + call fn; \ + jmp ret_from_intr; \ ENDPROC(name) #ifdef CONFIG_TRACING -#define TRACE_BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) +# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) #else -#define TRACE_BUILD_INTERRUPT(name, nr) +# define TRACE_BUILD_INTERRUPT(name, nr) #endif -#define BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(name, nr, smp_##name); \ +#define BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(name, nr, smp_##name); \ TRACE_BUILD_INTERRUPT(name, nr) /* The include is where all of the SMP etc. interrupts come from */ @@ -695,30 +688,30 @@ ENDPROC(name) ENTRY(coprocessor_error) ASM_CLAC - pushl $0 - pushl $do_coprocessor_error - jmp error_code + pushl $0 + pushl $do_coprocessor_error + jmp error_code END(coprocessor_error) ENTRY(simd_coprocessor_error) ASM_CLAC - pushl $0 + pushl $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl $do_general_protection", \ - "pushl $do_simd_coprocessor_error", \ + ALTERNATIVE "pushl $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ X86_FEATURE_XMM #else - pushl $do_simd_coprocessor_error + pushl $do_simd_coprocessor_error #endif - jmp error_code + jmp error_code END(simd_coprocessor_error) ENTRY(device_not_available) ASM_CLAC - pushl $-1 # mark this as an int - pushl $do_device_not_available - jmp error_code + pushl $-1 # mark this as an int + pushl $do_device_not_available + jmp error_code END(device_not_available) #ifdef CONFIG_PARAVIRT @@ -735,165 +728,171 @@ END(native_irq_enable_sysexit) ENTRY(overflow) ASM_CLAC - pushl $0 - pushl $do_overflow - jmp error_code + pushl $0 + pushl $do_overflow + jmp error_code END(overflow) ENTRY(bounds) ASM_CLAC - pushl $0 - pushl $do_bounds - jmp error_code + pushl $0 + pushl $do_bounds + jmp error_code END(bounds) ENTRY(invalid_op) ASM_CLAC - pushl $0 - pushl $do_invalid_op - jmp error_code + pushl $0 + pushl $do_invalid_op + jmp error_code END(invalid_op) ENTRY(coprocessor_segment_overrun) ASM_CLAC - pushl $0 - pushl $do_coprocessor_segment_overrun - jmp error_code + pushl $0 + pushl $do_coprocessor_segment_overrun + jmp error_code END(coprocessor_segment_overrun) ENTRY(invalid_TSS) ASM_CLAC - pushl $do_invalid_TSS - jmp error_code + pushl $do_invalid_TSS + jmp error_code END(invalid_TSS) ENTRY(segment_not_present) ASM_CLAC - pushl $do_segment_not_present - jmp error_code + pushl $do_segment_not_present + jmp error_code END(segment_not_present) ENTRY(stack_segment) ASM_CLAC - pushl 
$do_stack_segment - jmp error_code + pushl $do_stack_segment + jmp error_code END(stack_segment) ENTRY(alignment_check) ASM_CLAC - pushl $do_alignment_check - jmp error_code + pushl $do_alignment_check + jmp error_code END(alignment_check) ENTRY(divide_error) ASM_CLAC - pushl $0 # no error code - pushl $do_divide_error - jmp error_code + pushl $0 # no error code + pushl $do_divide_error + jmp error_code END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) ASM_CLAC - pushl $0 - pushl machine_check_vector - jmp error_code + pushl $0 + pushl machine_check_vector + jmp error_code END(machine_check) #endif ENTRY(spurious_interrupt_bug) ASM_CLAC - pushl $0 - pushl $do_spurious_interrupt_bug - jmp error_code + pushl $0 + pushl $do_spurious_interrupt_bug + jmp error_code END(spurious_interrupt_bug) #ifdef CONFIG_XEN -/* Xen doesn't set %esp to be precisely what the normal sysenter - entrypoint expects, so fix it up before using the normal path. */ +/* + * Xen doesn't set %esp to be precisely what the normal SYSENTER + * entry point expects, so fix it up before using the normal path. + */ ENTRY(xen_sysenter_target) - addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp ENTRY(xen_hypervisor_callback) - pushl $-1 /* orig_ax = -1 => not a system call */ + pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL TRACE_IRQS_OFF - /* Check to see if we got the event in the critical - region in xen_iret_direct, after we've reenabled - events and checked for pending events. This simulates - iret instruction's behaviour where it delivers a - pending interrupt when enabling interrupts. */ - movl PT_EIP(%esp),%eax - cmpl $xen_iret_start_crit,%eax - jb 1f - cmpl $xen_iret_end_crit,%eax - jae 1f + /* + * Check to see if we got the event in the critical + * region in xen_iret_direct, after we've reenabled + * events and checked for pending events. This simulates + * iret instruction's behaviour where it delivers a + * pending interrupt when enabling interrupts: + */ + movl PT_EIP(%esp), %eax + cmpl $xen_iret_start_crit, %eax + jb 1f + cmpl $xen_iret_end_crit, %eax + jae 1f - jmp xen_iret_crit_fixup + jmp xen_iret_crit_fixup ENTRY(xen_do_upcall) -1: mov %esp, %eax - call xen_evtchn_do_upcall +1: mov %esp, %eax + call xen_evtchn_do_upcall #ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall + call xen_maybe_preempt_hcall #endif - jmp ret_from_intr + jmp ret_from_intr ENDPROC(xen_hypervisor_callback) -# Hypervisor uses this for application faults while it executes. -# We get here for two reasons: -# 1. Fault while reloading DS, ES, FS or GS -# 2. Fault while executing IRET -# Category 1 we fix up by reattempting the load, and zeroing the segment -# register if the load fails. -# Category 2 we fix up by jumping to do_iret_error. We cannot use the -# normal Linux return path in this case because if we use the IRET hypercall -# to pop the stack frame we end up in an infinite loop of failsafe callbacks. -# We distinguish between categories by maintaining a status value in EAX. +/* + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we fix up by reattempting the load, and zeroing the segment + * register if the load fails. + * Category 2 we fix up by jumping to do_iret_error. 
We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by maintaining a status value in EAX. + */ ENTRY(xen_failsafe_callback) - pushl %eax - movl $1,%eax -1: mov 4(%esp),%ds -2: mov 8(%esp),%es -3: mov 12(%esp),%fs -4: mov 16(%esp),%gs + pushl %eax + movl $1, %eax +1: mov 4(%esp), %ds +2: mov 8(%esp), %es +3: mov 12(%esp), %fs +4: mov 16(%esp), %gs /* EAX == 0 => Category 1 (Bad segment) EAX != 0 => Category 2 (Bad IRET) */ - testl %eax,%eax - popl %eax - lea 16(%esp),%esp - jz 5f - jmp iret_exc -5: pushl $-1 /* orig_ax = -1 => not a system call */ + testl %eax, %eax + popl %eax + lea 16(%esp), %esp + jz 5f + jmp iret_exc +5: pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL - jmp ret_from_exception - -.section .fixup,"ax" -6: xorl %eax,%eax - movl %eax,4(%esp) - jmp 1b -7: xorl %eax,%eax - movl %eax,8(%esp) - jmp 2b -8: xorl %eax,%eax - movl %eax,12(%esp) - jmp 3b -9: xorl %eax,%eax - movl %eax,16(%esp) - jmp 4b + jmp ret_from_exception + +.section .fixup, "ax" +6: xorl %eax, %eax + movl %eax, 4(%esp) + jmp 1b +7: xorl %eax, %eax + movl %eax, 8(%esp) + jmp 2b +8: xorl %eax, %eax + movl %eax, 12(%esp) + jmp 3b +9: xorl %eax, %eax + movl %eax, 16(%esp) + jmp 4b .previous - _ASM_EXTABLE(1b,6b) - _ASM_EXTABLE(2b,7b) - _ASM_EXTABLE(3b,8b) - _ASM_EXTABLE(4b,9b) + _ASM_EXTABLE(1b, 6b) + _ASM_EXTABLE(2b, 7b) + _ASM_EXTABLE(3b, 8b) + _ASM_EXTABLE(4b, 9b) ENDPROC(xen_failsafe_callback) BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, xen_evtchn_do_upcall) -#endif /* CONFIG_XEN */ +#endif /* CONFIG_XEN */ #if IS_ENABLED(CONFIG_HYPERV) @@ -910,28 +909,28 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) - pushl %eax - pushl %ecx - pushl %edx - pushl $0 /* Pass NULL as regs pointer */ - movl 4*4(%esp), %eax - movl 0x4(%ebp), %edx - movl function_trace_op, %ecx - subl $MCOUNT_INSN_SIZE, %eax + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + movl 4*4(%esp), %eax + movl 0x4(%ebp), %edx + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax .globl ftrace_call ftrace_call: - call ftrace_stub + call ftrace_stub - addl $4,%esp /* skip NULL pointer */ - popl %edx - popl %ecx - popl %eax + addl $4, %esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax ftrace_ret: #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call ftrace_graph_call: - jmp ftrace_stub + jmp ftrace_stub #endif .globl ftrace_stub @@ -949,72 +948,72 @@ ENTRY(ftrace_regs_caller) * as the current return ip is. We move the return ip into the * ip location, and move flags into the return ip location. 
*/ - pushl 4(%esp) /* save return ip into ip slot */ - - pushl $0 /* Load 0 into orig_ax */ - pushl %gs - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - - movl 13*4(%esp), %eax /* Get the saved flags */ - movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ - /* clobbering return ip */ - movl $__KERNEL_CS,13*4(%esp) - - movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ - movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ - movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ - pushl %esp /* Save pt_regs as 4th parameter */ + pushl 4(%esp) /* save return ip into ip slot */ + + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 13*4(%esp), %eax /* Get the saved flags */ + movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ + /* clobbering return ip */ + movl $__KERNEL_CS, 13*4(%esp) + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ GLOBAL(ftrace_regs_call) - call ftrace_stub - - addl $4, %esp /* Skip pt_regs */ - movl 14*4(%esp), %eax /* Move flags back into cs */ - movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ - movl 12*4(%esp), %eax /* Get return ip from regs->ip */ - movl %eax, 14*4(%esp) /* Put return ip back for ret */ - - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - popl %ds - popl %es - popl %fs - popl %gs - addl $8, %esp /* Skip orig_ax and ip */ - popf /* Pop flags at end (no addl to corrupt flags) */ - jmp ftrace_ret + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + movl 14*4(%esp), %eax /* Move flags back into cs */ + movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ + movl 12*4(%esp), %eax /* Get return ip from regs->ip */ + movl %eax, 14*4(%esp) /* Put return ip back for ret */ + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + addl $8, %esp /* Skip orig_ax and ip */ + popf /* Pop flags at end (no addl to corrupt flags) */ + jmp ftrace_ret popf - jmp ftrace_stub + jmp ftrace_stub #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) - cmpl $__PAGE_OFFSET, %esp - jb ftrace_stub /* Paging not enabled yet? */ + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? 
*/ - cmpl $ftrace_stub, ftrace_trace_function - jnz trace + cmpl $ftrace_stub, ftrace_trace_function + jnz trace #ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpl $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller - cmpl $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller #endif .globl ftrace_stub ftrace_stub: @@ -1022,92 +1021,92 @@ ftrace_stub: /* taken from glibc */ trace: - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - movl 0x4(%ebp), %edx - subl $MCOUNT_INSN_SIZE, %eax - - call *ftrace_trace_function - - popl %edx - popl %ecx - popl %eax - jmp ftrace_stub + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - lea 0x4(%ebp), %edx - movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %eax - call prepare_ftrace_return - popl %edx - popl %ecx - popl %eax + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + lea 0x4(%ebp), %edx + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax ret END(ftrace_graph_caller) .globl return_to_handler return_to_handler: - pushl %eax - pushl %edx - movl %ebp, %eax - call ftrace_return_to_handler - movl %eax, %ecx - popl %edx - popl %eax - jmp *%ecx + pushl %eax + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx #endif #ifdef CONFIG_TRACING ENTRY(trace_page_fault) ASM_CLAC - pushl $trace_do_page_fault - jmp error_code + pushl $trace_do_page_fault + jmp error_code END(trace_page_fault) #endif ENTRY(page_fault) ASM_CLAC - pushl $do_page_fault + pushl $do_page_fault ALIGN error_code: /* the function address is in %gs's slot on the stack */ - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx cld - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs UNWIND_ESPFIX_STACK GS_TO_REG %ecx - movl PT_GS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart REG_TO_PTGS %ecx SET_KERNEL_GS %ecx - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception + movl %esp, %eax # pt_regs pointer + call *%edi + jmp ret_from_exception END(page_fault) /* @@ -1124,28 +1123,28 @@ END(page_fault) * the instruction that would have done it for sysenter. 
*/ .macro FIX_STACK offset ok label - cmpw $__KERNEL_CS, 4(%esp) - jne \ok + cmpw $__KERNEL_CS, 4(%esp) + jne \ok \label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp + movl TSS_sysenter_sp0 + \offset(%esp), %esp pushfl - pushl $__KERNEL_CS - pushl $sysenter_past_esp + pushl $__KERNEL_CS + pushl $sysenter_past_esp .endm ENTRY(debug) ASM_CLAC - cmpl $entry_SYSENTER_32,(%esp) - jne debug_stack_correct + cmpl $entry_SYSENTER_32, (%esp) + jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: - pushl $-1 # mark this as an int + pushl $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception + xorl %edx, %edx # error code 0 + movl %esp, %eax # pt_regs pointer + call do_debug + jmp ret_from_exception END(debug) /* @@ -1159,91 +1158,91 @@ END(debug) ENTRY(nmi) ASM_CLAC #ifdef CONFIG_X86_ESPFIX32 - pushl %eax - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl %eax - je nmi_espfix_stack + pushl %eax + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + je nmi_espfix_stack #endif - cmpl $entry_SYSENTER_32,(%esp) - je nmi_stack_fixup - pushl %eax - movl %esp,%eax - /* Do not access memory above the end of our stack page, + cmpl $entry_SYSENTER_32, (%esp) + je nmi_stack_fixup + pushl %eax + movl %esp, %eax + /* + * Do not access memory above the end of our stack page, * it might not exist. */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl %eax - jae nmi_stack_correct - cmpl $entry_SYSENTER_32,12(%esp) - je nmi_debug_stack_check + andl $(THREAD_SIZE-1), %eax + cmpl $(THREAD_SIZE-20), %eax + popl %eax + jae nmi_stack_correct + cmpl $entry_SYSENTER_32, 12(%esp) + je nmi_debug_stack_check nmi_stack_correct: - pushl %eax + pushl %eax SAVE_ALL - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_all_notrace + xorl %edx, %edx # zero error code + movl %esp, %eax # pt_regs pointer + call do_nmi + jmp restore_all_notrace nmi_stack_fixup: FIX_STACK 12, nmi_stack_correct, 1 - jmp nmi_stack_correct + jmp nmi_stack_correct nmi_debug_stack_check: - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct + cmpw $__KERNEL_CS, 16(%esp) + jne nmi_stack_correct + cmpl $debug, (%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn, (%esp) + ja nmi_stack_correct FIX_STACK 24, nmi_stack_correct, 1 - jmp nmi_stack_correct + jmp nmi_stack_correct #ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: /* * create the pointer to lss back */ - pushl %ss - pushl %esp - addl $4, (%esp) + pushl %ss + pushl %esp + addl $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 - pushl 16(%esp) + pushl 16(%esp) .endr - pushl %eax + pushl %eax SAVE_ALL - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx, %edx # zero error code + call do_nmi RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - jmp irq_return + lss 12+4(%esp), %esp # back to espfix stack + jmp irq_return #endif END(nmi) ENTRY(int3) ASM_CLAC - pushl $-1 # mark this as an int + pushl $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception + xorl %edx, %edx # zero error code + movl %esp, %eax # pt_regs pointer + call do_int3 + jmp ret_from_exception END(int3) ENTRY(general_protection) - pushl 
$do_general_protection - jmp error_code + pushl $do_general_protection + jmp error_code END(general_protection) #ifdef CONFIG_KVM_GUEST ENTRY(async_page_fault) ASM_CLAC - pushl $do_async_page_fault - jmp error_code + pushl $do_async_page_fault + jmp error_code END(async_page_fault) #endif - -- cgit v1.1 From 4d7321381e5c7102a3d3faf0a0a0035a09619612 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 20:43:07 +0200 Subject: x86/asm/entry/64: Clean up entry_64.S Make the 64-bit syscall entry code a bit more readable: - use consistent assembly coding style similar to the other entry_*.S files - remove old comments that are not true anymore - eliminate whitespace noise - use consistent vertical spacing - fix various comments - reorganize entry point generation tables to be more readable No code changed: # arch/x86/entry/entry_64.o: text data bss dec hex filename 12282 0 0 12282 2ffa entry_64.o.before 12282 0 0 12282 2ffa entry_64.o.after md5: cbab1f2d727a2a8a87618eeb79f391b7 entry_64.o.before.asm cbab1f2d727a2a8a87618eeb79f391b7 entry_64.o.after.asm Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 820 +++++++++++++++++++++++----------------------- 1 file changed, 404 insertions(+), 416 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d2a0ed2..bd97161 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -4,26 +4,20 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * Copyright (C) 2000 Pavel Machek - */ - -/* + * * entry.S contains the system-call and fault low-level handling routines. * * Some of this is documented in Documentation/x86/entry_64.txt * - * NOTE: This code handles signal-recognition, which happens every time - * after an interrupt and after each system call. - * * A note on terminology: - * - iret frame: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. + * - iret frame: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. * * Some macro usage: - * - ENTRY/END Define functions in the symbol table. - * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - idtentry - Define exception entry points. + * - ENTRY/END: Define functions in the symbol table. + * - TRACE_IRQ_*: Trace hardirq state for lock debugging. + * - idtentry: Define exception entry points. */ - #include #include #include @@ -46,13 +40,12 @@ /* Avoid __ASSEMBLER__'ifying just for this. */ #include -#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_64BIT 0x80000000 -#define __AUDIT_ARCH_LE 0x40000000 - - .code64 - .section .entry.text, "ax" +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 +.code64 +.section .entry.text, "ax" #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) @@ -61,11 +54,10 @@ ENTRY(native_usergs_sysret64) ENDPROC(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ - .macro TRACE_IRQS_IRETQ #ifdef CONFIG_TRACE_IRQFLAGS - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f + bt $9, EFLAGS(%rsp) /* interrupts off? 
*/ + jnc 1f TRACE_IRQS_ON 1: #endif @@ -85,34 +77,34 @@ ENDPROC(native_usergs_sysret64) #if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) .macro TRACE_IRQS_OFF_DEBUG - call debug_stack_set_zero + call debug_stack_set_zero TRACE_IRQS_OFF - call debug_stack_reset + call debug_stack_reset .endm .macro TRACE_IRQS_ON_DEBUG - call debug_stack_set_zero + call debug_stack_set_zero TRACE_IRQS_ON - call debug_stack_reset + call debug_stack_reset .endm .macro TRACE_IRQS_IRETQ_DEBUG - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f + bt $9, EFLAGS(%rsp) /* interrupts off? */ + jnc 1f TRACE_IRQS_ON_DEBUG 1: .endm #else -# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF -# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON -# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ +# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ #endif /* - * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. + * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * - * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, * then loads new ss, cs, and rip from previously programmed MSRs. * rflags gets masked by a value from another MSR (so CLD and CLAC * are not needed). SYSCALL does not save anything on the stack @@ -128,7 +120,7 @@ ENDPROC(native_usergs_sysret64) * r10 arg3 (needs to be moved to rcx to conform to C ABI) * r8 arg4 * r9 arg5 - * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) + * (note: r12-r15, rbp, rbx are callee-preserved in C ABI) * * Only called from user space. * @@ -151,12 +143,12 @@ ENTRY(entry_SYSCALL_64) */ GLOBAL(entry_SYSCALL_64_after_swapgs) - movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp + movq %rsp, PER_CPU_VAR(rsp_scratch) + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ - pushq $__USER_DS /* pt_regs->ss */ - pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ /* * Re-enable interrupts. 
* We use 'rsp_scratch' as a scratch space, hence irq-off block above @@ -165,34 +157,34 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) * with using rsp_scratch: */ ENABLE_INTERRUPTS(CLBR_NONE) - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 */ - pushq %r9 /* pt_regs->r9 */ - pushq %r10 /* pt_regs->r10 */ - pushq %r11 /* pt_regs->r11 */ - sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ - - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz tracesys + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz tracesys entry_SYSCALL_64_fastpath: #if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax + cmpq $__NR_syscall_max, %rax #else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax + andl $__SYSCALL_MASK, %eax + cmpl $__NR_syscall_max, %eax #endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10, %rcx + call *sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: /* * Syscall return path ending with SYSRET (fast path). @@ -213,15 +205,15 @@ entry_SYSCALL_64_fastpath: * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is * very bad. */ - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RIP(%rsp),%rcx - movq EFLAGS(%rsp),%r11 - movq RSP(%rsp),%rsp + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + movq RSP(%rsp), %rsp /* - * 64bit SYSRET restores rip from rcx, + * 64-bit SYSRET restores rip from rcx, * rflags from r11 (but RF and VM bits are forced to 0), * cs and ss are loaded from MSRs. * Restoration of rflags re-enables interrupts. 
@@ -239,21 +231,21 @@ entry_SYSCALL_64_fastpath: /* Do syscall entry tracing */ tracesys: - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - call syscall_trace_enter_phase1 - test %rax, %rax - jnz tracesys_phase2 /* if needed, run the slow path */ - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX(%rsp), %rax - jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + call syscall_trace_enter_phase1 + test %rax, %rax + jnz tracesys_phase2 /* if needed, run the slow path */ + RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ + movq ORIG_RAX(%rsp), %rax + jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ tracesys_phase2: SAVE_EXTRA_REGS - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - movq %rax,%rdx - call syscall_trace_enter_phase2 + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + movq %rax, %rdx + call syscall_trace_enter_phase2 /* * Reload registers from stack in case ptrace changed them. @@ -263,15 +255,15 @@ tracesys_phase2: RESTORE_C_REGS_EXCEPT_RAX RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax + cmpq $__NR_syscall_max, %rax #else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax + andl $__SYSCALL_MASK, %eax + cmpl $__NR_syscall_max, %eax #endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx /* fixup for C */ - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10, %rcx /* fixup for C */ + call *sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: /* Use IRET because user could have changed pt_regs->foo */ @@ -283,31 +275,33 @@ GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ TRACE_IRQS_OFF - movl $_TIF_ALLWORK_MASK,%edi + movl $_TIF_ALLWORK_MASK, %edi /* edi: mask to check */ GLOBAL(int_with_check) LOCKDEP_SYS_EXIT_IRQ GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%edx - andl %edi,%edx - jnz int_careful - andl $~TS_COMPAT,TI_status(%rcx) + movl TI_flags(%rcx), %edx + andl %edi, %edx + jnz int_careful + andl $~TS_COMPAT, TI_status(%rcx) jmp syscall_return - /* Either reschedule or signal or syscall exit tracking needed. */ - /* First do a reschedule test. */ - /* edx: work, edi: workmask */ + /* + * Either reschedule or signal or syscall exit tracking needed. + * First do a reschedule test. 
+ * edx: work, edi: workmask + */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + bt $TIF_NEED_RESCHED, %edx + jnc int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi + pushq %rdi SCHEDULE_USER - popq %rdi + popq %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - jmp int_with_check + jmp int_with_check /* handle signals and tracing -- both require a full pt_regs */ int_very_careful: @@ -315,27 +309,27 @@ int_very_careful: ENABLE_INTERRUPTS(CLBR_NONE) SAVE_EXTRA_REGS /* Check for syscall exit trace */ - testl $_TIF_WORK_SYSCALL_EXIT,%edx - jz int_signal - pushq %rdi - leaq 8(%rsp),%rdi # &ptregs -> arg1 - call syscall_trace_leave - popq %rdi - andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi - jmp int_restore_rest + testl $_TIF_WORK_SYSCALL_EXIT, %edx + jz int_signal + pushq %rdi + leaq 8(%rsp), %rdi /* &ptregs -> arg1 */ + call syscall_trace_leave + popq %rdi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi + jmp int_restore_rest int_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz 1f - movq %rsp,%rdi # &ptregs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call do_notify_resume -1: movl $_TIF_WORK_MASK,%edi + testl $_TIF_DO_NOTIFY_MASK, %edx + jz 1f + movq %rsp, %rdi /* &ptregs -> arg1 */ + xorl %esi, %esi /* oldset -> arg2 */ + call do_notify_resume +1: movl $_TIF_WORK_MASK, %edi int_restore_rest: RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - jmp int_with_check + jmp int_with_check syscall_return: /* The IRETQ could re-enable interrupts: */ @@ -346,10 +340,10 @@ syscall_return: * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. */ - movq RCX(%rsp),%rcx - movq RIP(%rsp),%r11 - cmpq %rcx,%r11 /* RCX == RIP */ - jne opportunistic_sysret_failed + movq RCX(%rsp), %rcx + movq RIP(%rsp), %r11 + cmpq %rcx, %r11 /* RCX == RIP */ + jne opportunistic_sysret_failed /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP @@ -362,19 +356,21 @@ syscall_return: .ifne __VIRTUAL_MASK_SHIFT - 47 .error "virtual address width changed -- SYSRET checks need update" .endif + /* Change top 16 bits to be the sign-extension of 47th bit */ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + /* If this changed %rcx, it was not canonical */ cmpq %rcx, %r11 jne opportunistic_sysret_failed - cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed + cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ + jne opportunistic_sysret_failed - movq R11(%rsp),%r11 - cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed + movq R11(%rsp), %r11 + cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ + jne opportunistic_sysret_failed /* * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, @@ -383,29 +379,29 @@ syscall_return: * with register state that satisfies the opportunistic SYSRET * conditions. For example, single-stepping this user code: * - * movq $stuck_here,%rcx + * movq $stuck_here, %rcx * pushfq * popq %r11 * stuck_here: * * would never get past 'stuck_here'. */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 + jnz opportunistic_sysret_failed /* nothing to check for RSP */ - cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed + cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ + jne opportunistic_sysret_failed /* - * We win! 
This label is here just for ease of understanding - * perf profiles. Nothing jumps here. + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp),%rsp + movq RSP(%rsp), %rsp USERGS_SYSRET64 opportunistic_sysret_failed: @@ -417,7 +413,7 @@ END(entry_SYSCALL_64) .macro FORK_LIKE func ENTRY(stub_\func) SAVE_EXTRA_REGS 8 - jmp sys_\func + jmp sys_\func END(stub_\func) .endm @@ -436,7 +432,7 @@ return_from_execve: /* must use IRET code path (pt_regs->cs may have changed) */ addq $8, %rsp ZERO_EXTRA_REGS - movq %rax,RAX(%rsp) + movq %rax, RAX(%rsp) jmp int_ret_from_sys_call END(stub_execve) /* @@ -479,19 +475,19 @@ ENTRY(stub_rt_sigreturn) * we SAVE_EXTRA_REGS here. */ SAVE_EXTRA_REGS 8 - call sys_rt_sigreturn + call sys_rt_sigreturn return_from_stub: addq $8, %rsp RESTORE_EXTRA_REGS - movq %rax,RAX(%rsp) - jmp int_ret_from_sys_call + movq %rax, RAX(%rsp) + jmp int_ret_from_sys_call END(stub_rt_sigreturn) #ifdef CONFIG_X86_X32_ABI ENTRY(stub_x32_rt_sigreturn) SAVE_EXTRA_REGS 8 - call sys32_x32_rt_sigreturn - jmp return_from_stub + call sys32_x32_rt_sigreturn + jmp return_from_stub END(stub_x32_rt_sigreturn) #endif @@ -502,16 +498,16 @@ END(stub_x32_rt_sigreturn) */ ENTRY(ret_from_fork) - LOCK ; btr $TIF_FORK,TI_flags(%r8) + LOCK ; btr $TIF_FORK, TI_flags(%r8) - pushq $0x0002 - popfq # reset kernel eflags + pushq $0x0002 + popfq /* reset kernel eflags */ - call schedule_tail # rdi: 'prev' task parameter + call schedule_tail /* rdi: 'prev' task parameter */ RESTORE_EXTRA_REGS - testb $3, CS(%rsp) # from kernel_thread? + testb $3, CS(%rsp) /* from kernel_thread? */ /* * By the time we get here, we have no idea whether our pt_regs, @@ -522,13 +518,15 @@ ENTRY(ret_from_fork) */ jnz int_ret_from_sys_call - /* We came from kernel_thread */ - /* nb: we depend on RESTORE_EXTRA_REGS above */ - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) + /* + * We came from kernel_thread + * nb: we depend on RESTORE_EXTRA_REGS above + */ + movq %rbp, %rdi + call *%rbx + movl $0, RAX(%rsp) RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call + jmp int_ret_from_sys_call END(ret_from_fork) /* @@ -539,7 +537,7 @@ END(ret_from_fork) ENTRY(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushq $(~vector+0x80) /* Note: always in signed byte range */ + pushq $(~vector+0x80) /* Note: always in signed byte range */ vector=vector+1 jmp common_interrupt .align 8 @@ -569,7 +567,7 @@ END(irq_entries_start) /* this goes to 0(%rsp) for unwinder, not for saving the value: */ SAVE_EXTRA_REGS_RBP -RBP - leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ + leaq -RBP(%rsp), %rdi /* arg1 for \func (pointer to pt_regs) */ testb $3, CS-RBP(%rsp) jz 1f @@ -582,14 +580,14 @@ END(irq_entries_start) * a little cheaper to use a separate counter in the PDA (short of * moving irq_enter into assembly, which would be too much work) */ - movq %rsp, %rsi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - pushq %rsi + movq %rsp, %rsi + incl PER_CPU_VAR(irq_count) + cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp + pushq %rsi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF - call \func + call \func .endm /* @@ -599,36 +597,35 @@ END(irq_entries_start) .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: ASM_CLAC - addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ + addq $-0x80, 
(%rsp) /* Adjust vector to [-256, -1] range */ interrupt do_IRQ /* 0(%rsp): old RSP */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) + decl PER_CPU_VAR(irq_count) /* Restore saved previous stack */ - popq %rsi + popq %rsi /* return code expects complete pt_regs - adjust rsp accordingly: */ - leaq -RBP(%rsi),%rsp + leaq -RBP(%rsi), %rsp testb $3, CS(%rsp) jz retint_kernel /* Interrupt came from user space */ retint_user: GET_THREAD_INFO(%rcx) - /* - * %rcx: thread info. Interrupts off. - */ + + /* %rcx: thread info. Interrupts are off. */ retint_with_reschedule: - movl $_TIF_WORK_MASK,%edi + movl $_TIF_WORK_MASK, %edi retint_check: LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - andl %edi,%edx - jnz retint_careful + movl TI_flags(%rcx), %edx + andl %edi, %edx + jnz retint_careful -retint_swapgs: /* return to user-space */ +retint_swapgs: /* return to user-space */ /* * The iretq could re-enable interrupts: */ @@ -643,9 +640,9 @@ retint_kernel: #ifdef CONFIG_PREEMPT /* Interrupts are off */ /* Check if we need preemption */ - bt $9,EFLAGS(%rsp) /* interrupts were off? */ + bt $9, EFLAGS(%rsp) /* were interrupts off? */ jnc 1f -0: cmpl $0,PER_CPU_VAR(__preempt_count) +0: cmpl $0, PER_CPU_VAR(__preempt_count) jnz 1f call preempt_schedule_irq jmp 0b @@ -671,8 +668,8 @@ ENTRY(native_iret) * 64-bit mode SS:RSP on the exception stack is always valid. */ #ifdef CONFIG_X86_ESPFIX64 - testb $4,(SS-RIP)(%rsp) - jnz native_irq_return_ldt + testb $4, (SS-RIP)(%rsp) + jnz native_irq_return_ldt #endif .global native_irq_return_iret @@ -687,59 +684,59 @@ native_irq_return_iret: #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: - pushq %rax - pushq %rdi + pushq %rax + pushq %rdi SWAPGS - movq PER_CPU_VAR(espfix_waddr),%rdi - movq %rax,(0*8)(%rdi) /* RAX */ - movq (2*8)(%rsp),%rax /* RIP */ - movq %rax,(1*8)(%rdi) - movq (3*8)(%rsp),%rax /* CS */ - movq %rax,(2*8)(%rdi) - movq (4*8)(%rsp),%rax /* RFLAGS */ - movq %rax,(3*8)(%rdi) - movq (6*8)(%rsp),%rax /* SS */ - movq %rax,(5*8)(%rdi) - movq (5*8)(%rsp),%rax /* RSP */ - movq %rax,(4*8)(%rdi) - andl $0xffff0000,%eax - popq %rdi - orq PER_CPU_VAR(espfix_stack),%rax + movq PER_CPU_VAR(espfix_waddr), %rdi + movq %rax, (0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp), %rax /* RIP */ + movq %rax, (1*8)(%rdi) + movq (3*8)(%rsp), %rax /* CS */ + movq %rax, (2*8)(%rdi) + movq (4*8)(%rsp), %rax /* RFLAGS */ + movq %rax, (3*8)(%rdi) + movq (6*8)(%rsp), %rax /* SS */ + movq %rax, (5*8)(%rdi) + movq (5*8)(%rsp), %rax /* RSP */ + movq %rax, (4*8)(%rdi) + andl $0xffff0000, %eax + popq %rdi + orq PER_CPU_VAR(espfix_stack), %rax SWAPGS - movq %rax,%rsp - popq %rax - jmp native_irq_return_iret + movq %rax, %rsp + popq %rax + jmp native_irq_return_iret #endif /* edi: workmask, edx: work */ retint_careful: - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + bt $TIF_NEED_RESCHED, %edx + jnc retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi + pushq %rdi SCHEDULE_USER - popq %rdi + popq %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - jmp retint_check + jmp retint_check retint_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz retint_swapgs + testl $_TIF_DO_NOTIFY_MASK, %edx + jz retint_swapgs TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_EXTRA_REGS - movq $-1,ORIG_RAX(%rsp) - xorl %esi,%esi # oldset - movq %rsp,%rdi # &pt_regs - call do_notify_resume + movq $-1, ORIG_RAX(%rsp) + xorl %esi, %esi /* oldset */ + movq %rsp, %rdi /* &pt_regs */ + call do_notify_resume RESTORE_EXTRA_REGS 
DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) - jmp retint_with_reschedule + jmp retint_with_reschedule END(common_interrupt) @@ -749,10 +746,10 @@ END(common_interrupt) .macro apicinterrupt3 num sym do_sym ENTRY(\sym) ASM_CLAC - pushq $~(\num) + pushq $~(\num) .Lcommon_\sym: interrupt \do_sym - jmp ret_from_intr + jmp ret_from_intr END(\sym) .endm @@ -774,60 +771,45 @@ trace_apicinterrupt \num \sym .endm #ifdef CONFIG_SMP -apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ - irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt3 REBOOT_VECTOR \ - reboot_interrupt smp_reboot_interrupt +apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt #endif #ifdef CONFIG_X86_UV -apicinterrupt3 UV_BAU_MESSAGE \ - uv_bau_message_intr1 uv_bau_message_interrupt +apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt #endif -apicinterrupt LOCAL_TIMER_VECTOR \ - apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt X86_PLATFORM_IPI_VECTOR \ - x86_platform_ipi smp_x86_platform_ipi + +apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_HAVE_KVM -apicinterrupt3 POSTED_INTR_VECTOR \ - kvm_posted_intr_ipi smp_kvm_posted_intr_ipi -apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR \ - kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi +apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi #endif #ifdef CONFIG_X86_MCE_THRESHOLD -apicinterrupt THRESHOLD_APIC_VECTOR \ - threshold_interrupt smp_threshold_interrupt +apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt #endif #ifdef CONFIG_X86_MCE_AMD -apicinterrupt DEFERRED_ERROR_VECTOR \ - deferred_error_interrupt smp_deferred_error_interrupt +apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt #endif #ifdef CONFIG_X86_THERMAL_VECTOR -apicinterrupt THERMAL_APIC_VECTOR \ - thermal_interrupt smp_thermal_interrupt +apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt #endif #ifdef CONFIG_SMP -apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ - call_function_single_interrupt smp_call_function_single_interrupt -apicinterrupt CALL_FUNCTION_VECTOR \ - call_function_interrupt smp_call_function_interrupt -apicinterrupt RESCHEDULE_VECTOR \ - reschedule_interrupt smp_reschedule_interrupt +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt #endif -apicinterrupt ERROR_APIC_VECTOR \ - error_interrupt smp_error_interrupt -apicinterrupt SPURIOUS_APIC_VECTOR \ - spurious_interrupt smp_spurious_interrupt +apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt #ifdef CONFIG_IRQ_WORK -apicinterrupt IRQ_WORK_VECTOR \ - irq_work_interrupt smp_irq_work_interrupt +apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt #endif /* @@ -846,54 +828,54 @@ ENTRY(\sym) PARAVIRT_ADJUST_EXCEPTION_FRAME .ifeq \has_error_code - pushq $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* 
ORIG_RAX: no syscall to restart */ .endif ALLOC_PT_GPREGS_ON_STACK .if \paranoid .if \paranoid == 1 - testb $3, CS(%rsp) /* If coming from userspace, switch */ - jnz 1f /* stacks. */ + testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ + jnz 1f .endif - call paranoid_entry + call paranoid_entry .else - call error_entry + call error_entry .endif /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ .if \paranoid .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ .else TRACE_IRQS_OFF .endif .endif - movq %rsp,%rdi /* pt_regs pointer */ + movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .else - xorl %esi,%esi /* no error code */ + xorl %esi, %esi /* no error code */ .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif - call \do_sym + call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid - jmp paranoid_exit + jmp paranoid_exit .else - jmp error_exit + jmp error_exit .endif .if \paranoid == 1 @@ -903,25 +885,25 @@ ENTRY(\sym) * run in real process context if user_mode(regs). */ 1: - call error_entry + call error_entry - movq %rsp,%rdi /* pt_regs pointer */ - call sync_regs - movq %rax,%rsp /* switch stack */ + movq %rsp, %rdi /* pt_regs pointer */ + call sync_regs + movq %rax, %rsp /* switch stack */ - movq %rsp,%rdi /* pt_regs pointer */ + movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .else - xorl %esi,%esi /* no error code */ + xorl %esi, %esi /* no error code */ .endif - call \do_sym + call \do_sym - jmp error_exit /* %ebx: no swapgs flag */ + jmp error_exit /* %ebx: no swapgs flag */ .endif END(\sym) .endm @@ -937,55 +919,57 @@ idtentry \sym \do_sym has_error_code=\has_error_code .endm #endif -idtentry divide_error do_divide_error has_error_code=0 -idtentry overflow do_overflow has_error_code=0 -idtentry bounds do_bounds has_error_code=0 -idtentry invalid_op do_invalid_op has_error_code=0 -idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 -idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 -idtentry invalid_TSS do_invalid_TSS has_error_code=1 -idtentry segment_not_present do_segment_not_present has_error_code=1 -idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 -idtentry coprocessor_error do_coprocessor_error has_error_code=0 -idtentry alignment_check do_alignment_check has_error_code=1 -idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - - - /* Reload gs selector with exception handling */ - /* edi: new selector */ +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available 
do_device_not_available has_error_code=0 +idtentry double_fault do_double_fault has_error_code=1 paranoid=2 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 + + + /* + * Reload gs selector with exception handling + * edi: new selector + */ ENTRY(native_load_gs_index) pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS gs_change: - movl %edi,%gs -2: mfence /* workaround */ + movl %edi, %gs +2: mfence /* workaround */ SWAPGS popfq ret END(native_load_gs_index) - _ASM_EXTABLE(gs_change,bad_gs) - .section .fixup,"ax" + _ASM_EXTABLE(gs_change, bad_gs) + .section .fixup, "ax" /* running with kernelgs */ bad_gs: - SWAPGS /* switch back to user gs */ - xorl %eax,%eax - movl %eax,%gs - jmp 2b + SWAPGS /* switch back to user gs */ + xorl %eax, %eax + movl %eax, %gs + jmp 2b .previous /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(do_softirq_own_stack) - pushq %rbp - mov %rsp,%rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr),%rsp - push %rbp # backlink for old unwinder - call __do_softirq + pushq %rbp + mov %rsp, %rbp + incl PER_CPU_VAR(irq_count) + cmove PER_CPU_VAR(irq_stack_ptr), %rsp + push %rbp /* frame pointer backlink */ + call __do_softirq leaveq - decl PER_CPU_VAR(irq_count) + decl PER_CPU_VAR(irq_count) ret END(do_softirq_own_stack) @@ -1005,23 +989,24 @@ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 * existing activation in its critical region -- if so, we pop the current * activation and restart the handler using the previous one. */ -ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) +ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ + /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ - movq %rdi, %rsp # we don't return, adjust the stack frame -11: incl PER_CPU_VAR(irq_count) - movq %rsp,%rbp - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - pushq %rbp # backlink for old unwinder - call xen_evtchn_do_upcall - popq %rsp - decl PER_CPU_VAR(irq_count) + movq %rdi, %rsp /* we don't return, adjust the stack frame */ +11: incl PER_CPU_VAR(irq_count) + movq %rsp, %rbp + cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp + pushq %rbp /* frame pointer backlink */ + call xen_evtchn_do_upcall + popq %rsp + decl PER_CPU_VAR(irq_count) #ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall + call xen_maybe_preempt_hcall #endif - jmp error_exit + jmp error_exit END(xen_do_hypervisor_callback) /* @@ -1038,35 +1023,35 @@ END(xen_do_hypervisor_callback) * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) - movl %ds,%ecx - cmpw %cx,0x10(%rsp) - jne 1f - movl %es,%ecx - cmpw %cx,0x18(%rsp) - jne 1f - movl %fs,%ecx - cmpw %cx,0x20(%rsp) - jne 1f - movl %gs,%ecx - cmpw %cx,0x28(%rsp) - jne 1f + movl %ds, %ecx + cmpw %cx, 0x10(%rsp) + jne 1f + movl %es, %ecx + cmpw %cx, 0x18(%rsp) + jne 1f + movl %fs, %ecx + cmpw %cx, 0x20(%rsp) + jne 1f + movl %gs, %ecx + cmpw %cx, 0x28(%rsp) + jne 1f /* All segments match their saved values => Category 2 (Bad IRET). 
*/ - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x30,%rsp - pushq $0 /* RIP */ - pushq %r11 - pushq %rcx - jmp general_protection + movq (%rsp), %rcx + movq 8(%rsp), %r11 + addq $0x30, %rsp + pushq $0 /* RIP */ + pushq %r11 + pushq %rcx + jmp general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x30,%rsp - pushq $-1 /* orig_ax = -1 => not a system call */ + movq (%rsp), %rcx + movq 8(%rsp), %r11 + addq $0x30, %rsp + pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS - jmp error_exit + jmp error_exit END(xen_failsafe_callback) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ @@ -1079,21 +1064,25 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ hyperv_callback_vector hyperv_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry stack_segment do_stack_segment has_error_code=1 +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 + #ifdef CONFIG_XEN -idtentry xen_debug do_debug has_error_code=0 -idtentry xen_int3 do_int3 has_error_code=0 -idtentry xen_stack_segment do_stack_segment has_error_code=1 +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 #endif -idtentry general_protection do_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 + +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 + #ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 +idtentry async_page_fault do_async_page_fault has_error_code=1 #endif + #ifdef CONFIG_X86_MCE -idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif /* @@ -1105,13 +1094,13 @@ ENTRY(paranoid_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 - movl $1,%ebx - movl $MSR_GS_BASE,%ecx + movl $1, %ebx + movl $MSR_GS_BASE, %ecx rdmsr - testl %edx,%edx - js 1f /* negative -> in kernel */ + testl %edx, %edx + js 1f /* negative -> in kernel */ SWAPGS - xorl %ebx,%ebx + xorl %ebx, %ebx 1: ret END(paranoid_entry) @@ -1124,16 +1113,17 @@ END(paranoid_entry) * in syscall entry), so checking for preemption here would * be complicated. Fortunately, we there's no good reason * to try to handle preemption here. + * + * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_exit_no_swapgs + testl %ebx, %ebx /* swapgs needed? */ + jnz paranoid_exit_no_swapgs TRACE_IRQS_IRETQ SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore + jmp paranoid_exit_restore paranoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG paranoid_exit_restore: @@ -1151,7 +1141,7 @@ ENTRY(error_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 - xorl %ebx,%ebx + xorl %ebx, %ebx testb $3, CS+8(%rsp) jz error_kernelspace error_swapgs: @@ -1167,41 +1157,41 @@ error_sti: * for these here too. 
*/ error_kernelspace: - incl %ebx - leaq native_irq_return_iret(%rip),%rcx - cmpq %rcx,RIP+8(%rsp) - je error_bad_iret - movl %ecx,%eax /* zero extend */ - cmpq %rax,RIP+8(%rsp) - je bstep_iret - cmpq $gs_change,RIP+8(%rsp) - je error_swapgs - jmp error_sti + incl %ebx + leaq native_irq_return_iret(%rip), %rcx + cmpq %rcx, RIP+8(%rsp) + je error_bad_iret + movl %ecx, %eax /* zero extend */ + cmpq %rax, RIP+8(%rsp) + je bstep_iret + cmpq $gs_change, RIP+8(%rsp) + je error_swapgs + jmp error_sti bstep_iret: /* Fix truncated RIP */ - movq %rcx,RIP+8(%rsp) + movq %rcx, RIP+8(%rsp) /* fall through */ error_bad_iret: SWAPGS - mov %rsp,%rdi - call fixup_bad_iret - mov %rax,%rsp - decl %ebx /* Return to usergs */ - jmp error_sti + mov %rsp, %rdi + call fixup_bad_iret + mov %rax, %rsp + decl %ebx /* Return to usergs */ + jmp error_sti END(error_entry) /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(error_exit) - movl %ebx,%eax + movl %ebx, %eax RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %eax,%eax - jnz retint_kernel - jmp retint_user + testl %eax, %eax + jnz retint_kernel + jmp retint_user END(error_exit) /* Runs on exception stack */ @@ -1240,21 +1230,21 @@ ENTRY(nmi) */ /* Use %rdx as our temp variable throughout */ - pushq %rdx + pushq %rdx /* * If %cs was not the kernel segment, then the NMI triggered in user * space, which means it is definitely not nested. */ - cmpl $__KERNEL_CS, 16(%rsp) - jne first_nmi + cmpl $__KERNEL_CS, 16(%rsp) + jne first_nmi /* * Check the special variable on the stack to see if NMIs are * executing. */ - cmpl $1, -8(%rsp) - je nested_nmi + cmpl $1, -8(%rsp) + je nested_nmi /* * Now test if the previous stack was an NMI stack. @@ -1268,6 +1258,7 @@ ENTRY(nmi) cmpq %rdx, 4*8(%rsp) /* If the stack pointer is above the NMI stack, this is a normal NMI */ ja first_nmi + subq $EXCEPTION_STKSZ, %rdx cmpq %rdx, 4*8(%rsp) /* If it is below the NMI stack, it is a normal NMI */ @@ -1280,29 +1271,29 @@ nested_nmi: * It's about to repeat the NMI handler, so we are fine * with ignoring this one. */ - movq $repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja 1f - movq $end_repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja nested_nmi_out + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out 1: /* Set up the interrupted NMIs stack to jump to repeat_nmi */ - leaq -1*8(%rsp), %rdx - movq %rdx, %rsp - leaq -10*8(%rsp), %rdx - pushq $__KERNEL_DS - pushq %rdx + leaq -1*8(%rsp), %rdx + movq %rdx, %rsp + leaq -10*8(%rsp), %rdx + pushq $__KERNEL_DS + pushq %rdx pushfq - pushq $__KERNEL_CS - pushq $repeat_nmi + pushq $__KERNEL_CS + pushq $repeat_nmi /* Put stack back */ - addq $(6*8), %rsp + addq $(6*8), %rsp nested_nmi_out: - popq %rdx + popq %rdx /* No need to check faults here */ INTERRUPT_RETURN @@ -1344,19 +1335,17 @@ first_nmi: * is also used by nested NMIs and can not be trusted on exit. */ /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ - movq (%rsp), %rdx + movq (%rsp), %rdx /* Set the NMI executing variable on the stack. 
*/ - pushq $1 + pushq $1 - /* - * Leave room for the "copied" frame - */ - subq $(5*8), %rsp + /* Leave room for the "copied" frame */ + subq $(5*8), %rsp /* Copy the stack frame to the Saved frame */ .rept 5 - pushq 11*8(%rsp) + pushq 11*8(%rsp) .endr /* Everything up to here is safe from nested NMIs */ @@ -1376,14 +1365,14 @@ repeat_nmi: * is benign for the non-repeat case, where 1 was pushed just above * to this very stack slot). */ - movq $1, 10*8(%rsp) + movq $1, 10*8(%rsp) /* Make another copy, this one may be modified by nested NMIs */ - addq $(10*8), %rsp + addq $(10*8), %rsp .rept 5 - pushq -6*8(%rsp) + pushq -6*8(%rsp) .endr - subq $(5*8), %rsp + subq $(5*8), %rsp end_repeat_nmi: /* @@ -1391,7 +1380,7 @@ end_repeat_nmi: * NMI if the first NMI took an exception and reset our iret stack * so that we repeat another NMI. */ - pushq $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ ALLOC_PT_GPREGS_ON_STACK /* @@ -1401,7 +1390,7 @@ end_repeat_nmi: * setting NEED_RESCHED or anything that normal interrupts and * exceptions might do. */ - call paranoid_entry + call paranoid_entry /* * Save off the CR2 register. If we take a page fault in the NMI then @@ -1412,21 +1401,21 @@ end_repeat_nmi: * origin fault. Save it off and restore it if it changes. * Use the r12 callee-saved register. */ - movq %cr2, %r12 + movq %cr2, %r12 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ - movq %rsp,%rdi - movq $-1,%rsi - call do_nmi + movq %rsp, %rdi + movq $-1, %rsi + call do_nmi /* Did the NMI take a page fault? Restore cr2 if it did */ - movq %cr2, %rcx - cmpq %rcx, %r12 - je 1f - movq %r12, %cr2 + movq %cr2, %rcx + cmpq %rcx, %r12 + je 1f + movq %r12, %cr2 1: - testl %ebx,%ebx /* swapgs needed? */ - jnz nmi_restore + testl %ebx, %ebx /* swapgs needed? */ + jnz nmi_restore nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: @@ -1436,12 +1425,11 @@ nmi_restore: REMOVE_PT_GPREGS_FROM_STACK 6*8 /* Clear the NMI executing stack variable */ - movq $0, 5*8(%rsp) + movq $0, 5*8(%rsp) INTERRUPT_RETURN END(nmi) ENTRY(ignore_sysret) - mov $-ENOSYS,%eax + mov $-ENOSYS, %eax sysret END(ignore_sysret) - -- cgit v1.1 From eb47854415825a69b1c578e7487da571227ba25a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 7 Jun 2015 20:24:30 +0200 Subject: x86/asm/entry/32: Reinstate clearing of pt_regs->r8..r11 on EFAULT path I broke this recently when I changed pt_regs->r8..r11 clearing logic in INT 80 code path. There is a branch from SYSENTER/SYSCALL code to INT 80 code: if we fail to retrieve arg6, we return EFAULT. Before this patch, in this case we don't clear pt_regs->r8..r11. This patch fixes this. The resulting code is smaller and simpler. While at it, remove incorrect comment about syscall dispatching CALL insn: it does not use RIP-relative addressing form (the comment was meant to be "TODO: make this rip-relative", and morphed since then, dropping "TODO"). Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433701470-28800-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 59840e3..5757dde 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -413,9 +413,7 @@ END(entry_SYSCALL_compat) ia32_badarg: ASM_CLAC - movq $-EFAULT, %rax - jmp ia32_sysret - + movq $-EFAULT, RAX(%rsp) ia32_ret_from_sys_call: xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) @@ -486,9 +484,7 @@ ia32_do_call: cmpq $(IA32_NR_syscalls-1), %rax ja 1f - call *ia32_sys_call_table(, %rax, 8) /* RIP relative */ - -ia32_sysret: + call *ia32_sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) 1: jmp int_ret_from_sys_call -- cgit v1.1 From bace7117d3fb59a6ed7ea1aa6c8994df6a28a72a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 21:20:26 +0200 Subject: x86/asm/entry: (Re-)rename __NR_entry_INT80_compat_max to __NR_syscall_compat_max Brian Gerst noticed that I did a weird rename in the following commit: b2502b418e63 ("x86/asm/entry: Untangle 'system_call' into two entry points: entry_SYSCALL_64 and entry_INT80_32") which renamed __NR_ia32_syscall_max to __NR_entry_INT80_compat_max. Now the original name was a misnomer, but the new one is a misnomer as well, as all the 32-bit compat syscall entry points (sysenter, syscall) share the system call table, not just the INT80 based one. Rename it to __NR_syscall_compat_max. Reported-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscall_32.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index e398d03..8ea34f9 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -10,7 +10,7 @@ #else #define SYM(sym, compat) sym #define ia32_sys_call_table sys_call_table -#define __NR_entry_INT80_compat_max __NR_syscall_max +#define __NR_syscall_compat_max __NR_syscall_max #endif #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; @@ -23,11 +23,11 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_entry_INT80_compat_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_entry_INT80_compat_max] = &sys_ni_syscall, + [0 ... __NR_syscall_compat_max] = &sys_ni_syscall, #include }; -- cgit v1.1 From 9b47feb708e50e6114b4b4193eee67f2e5af9d05 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 8 Jun 2015 22:35:33 +0200 Subject: x86/asm/entry: Clean up entry*.S style, final bits A few bits were missed. Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 18 +++++++++--------- arch/x86/entry/entry_64_compat.S | 14 +++++++------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index edd7aad..21dc60a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -12,7 +12,7 @@ * 0(%esp) - %ebx * 4(%esp) - %ecx * 8(%esp) - %edx - * C(%esp) - %esi + * C(%esp) - %esi * 10(%esp) - %edi * 14(%esp) - %ebp * 18(%esp) - %eax @@ -128,7 +128,7 @@ .macro POP_GS pop=0 98: popl %gs .if \pop <> 0 - add $\pop, %esp + add $\pop, %esp .endif .endm .macro POP_GS_EX @@ -487,8 +487,8 @@ ldt_ss: mov %esp, %edx /* load kernel esp */ mov PT_OLDESP(%esp), %eax /* load userspace esp */ mov %dx, %ax /* eax: new kernel esp */ - sub %eax, %edx /* offset (low word is 0) */ - shr $16, %edx + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ pushl $__ESPFIX_SS @@ -507,7 +507,7 @@ ENDPROC(entry_INT80_32) # perform work that needs to be done immediately before resumption ALIGN work_pending: - testb $_TIF_NEED_RESCHED, %cl + testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: call schedule @@ -520,7 +520,7 @@ work_resched: andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testb $_TIF_NEED_RESCHED, %cl jnz work_resched work_notifysig: # deal with pending signals and @@ -537,8 +537,8 @@ work_notifysig: # deal with pending signals and TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) movb PT_CS(%esp), %bl - andb $SEGMENT_RPL_MASK, %bl - cmpb $USER_RPL, %bl + andb $SEGMENT_RPL_MASK, %bl + cmpb $USER_RPL, %bl jb resume_kernel xorl %edx, %edx call do_notify_resume @@ -609,7 +609,7 @@ END(sysenter_badsys) /* fixup the stack */ mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ - shl $16, %eax + shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS pushl %eax diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 5757dde..2093ce6 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -106,8 +106,8 @@ ENTRY(entry_SYSENTER_compat) jnz sysenter_fix_flags sysenter_flags_fixed: - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz sysenter_tracesys sysenter_do_call: @@ -138,7 +138,7 @@ sysexit_from_sys_call: * This code path is still called 'sysexit' because it pairs * with 'sysenter' and it uses the SYSENTER calling convention. 
*/ - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) movl RIP(%rsp), %ecx /* User %eip */ RESTORE_RSI_RDI xorl %edx, %edx /* Do not leak kernel information */ @@ -229,7 +229,7 @@ sysexit_audit: #endif sysenter_fix_flags: - pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) popfq jmp sysenter_flags_fixed @@ -325,9 +325,9 @@ ENTRY(entry_SYSCALL_compat) 1: movl (%r8), %ebp _ASM_EXTABLE(1b, ia32_badarg) ASM_CLAC - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz cstar_tracesys + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz cstar_tracesys cstar_do_call: /* 32-bit syscall -> 64-bit C ABI argument conversion */ -- cgit v1.1 From aee4b013a71666f11ffeac11ab45bb7c6e0e394d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 9 Jun 2015 20:54:07 +0200 Subject: x86/asm/entry/32: Fix fallout from the R9 trick removal in the SYSCALL code I put %ebp restoration code too late. Under strace, it is not reached and %ebp is not restored upon return to userspace. This is the fix. Run-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433876051-26604-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 2093ce6..2c44180 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -344,6 +344,7 @@ cstar_dispatch: call *ia32_sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) 1: + movl RCX(%rsp), %ebp DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -351,7 +352,6 @@ cstar_dispatch: sysretl_from_sys_call: andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RCX(%rsp), %ebp RESTORE_RSI_RDI_RDX movl RIP(%rsp), %ecx movl EFLAGS(%rsp), %r11d -- cgit v1.1 From 1536bb46fac7672ef04aaaa6a3b07848314263bc Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 9 Jun 2015 20:54:08 +0200 Subject: x86/asm/entry/32: Explain reloading of registers after __audit_syscall_entry() Here it is not obvious why we load pt_regs->cx to %esi etc. Lets improve comments. Explain that here we combine two things: first, we reload registers since some of them are clobbered by the C function we just called; and we also convert 32-bit syscall params to 64-bit C ABI, because we are going to jump back to syscall dispatch code. Move reloading of 6th argument into the macro instead of having it after each of two macro invocations. No actual code changes here. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433876051-26604-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 2c44180..0fa108c 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -185,12 +185,18 @@ sysexit_from_sys_call: movl %ebx, %esi /* 2nd arg: 1st syscall arg */ movl %eax, %edi /* 1st arg: syscall number */ call __audit_syscall_entry - movl ORIG_RAX(%rsp), %eax /* reload syscall number */ - movl %ebx, %edi /* reload 1st syscall arg */ - movl RCX(%rsp), %esi /* reload 2nd syscall arg */ - movl RDX(%rsp), %edx /* reload 3rd syscall arg */ - movl RSI(%rsp), %ecx /* reload 4th syscall arg */ - movl RDI(%rsp), %r8d /* reload 5th syscall arg */ + /* + * We are going to jump back to syscall dispatch. + * Prepare syscall args as required by 64-bit C ABI. + * Clobbered registers are loaded from pt_regs on stack. + */ + movl ORIG_RAX(%rsp), %eax /* syscall number */ + movl %ebx, %edi /* arg1 */ + movl RCX(%rsp), %esi /* arg2 */ + movl RDX(%rsp), %edx /* arg3 */ + movl RSI(%rsp), %ecx /* arg4 */ + movl RDI(%rsp), %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ .endm .macro auditsys_exit exit @@ -221,7 +227,6 @@ sysexit_from_sys_call: sysenter_auditsys: auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ jmp sysenter_dispatch sysexit_audit: @@ -379,7 +384,6 @@ sysretl_from_sys_call: #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ jmp cstar_dispatch sysretl_audit: -- cgit v1.1 From a92fde25231a89d7d10895482556260c1b63767d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 9 Jun 2015 20:54:09 +0200 Subject: x86/asm/entry/32: Shorten __audit_syscall_entry() args preparation We use three MOVs to swap edx and ecx. We can use one XCHG instead. Expand the comments. It's difficult to keep track which arg# every register corresponds to, so spell it out. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1433876051-26604-3-git-send-email-dvlasenk@redhat.com [ Expanded the comments some more. 
] Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 0fa108c..bb187a6 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -178,17 +178,26 @@ sysexit_from_sys_call: #ifdef CONFIG_AUDITSYSCALL .macro auditsys_entry_common - movl %esi, %r8d /* 5th arg: 4th syscall arg */ - movl %ecx, %r9d /* swap with edx */ - movl %edx, %ecx /* 4th arg: 3rd syscall arg */ - movl %r9d, %edx /* 3rd arg: 2nd syscall arg */ - movl %ebx, %esi /* 2nd arg: 1st syscall arg */ - movl %eax, %edi /* 1st arg: syscall number */ + /* + * At this point, registers hold syscall args in the 32-bit syscall ABI: + * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP. + * + * We want to pass them to __audit_syscall_entry(), which is a 64-bit + * C function with 5 parameters, so shuffle them to match what + * the function expects: RDI,RSI,RDX,RCX,R8. + */ + movl %esi, %r8d /* arg5 (R8 ) <= 4th syscall arg (ESI) */ + xchg %ecx, %edx /* arg4 (RCX) <= 3rd syscall arg (EDX) */ + /* arg3 (RDX) <= 2nd syscall arg (ECX) */ + movl %ebx, %esi /* arg2 (RSI) <= 1st syscall arg (EBX) */ + movl %eax, %edi /* arg1 (RDI) <= syscall number (EAX) */ call __audit_syscall_entry + /* - * We are going to jump back to syscall dispatch. - * Prepare syscall args as required by 64-bit C ABI. - * Clobbered registers are loaded from pt_regs on stack. + * We are going to jump back to the syscall dispatch code. + * Prepare syscall args as required by the 64-bit C ABI. + * Registers clobbered by __audit_syscall_entry() are + * loaded from pt_regs on stack: */ movl ORIG_RAX(%rsp), %eax /* syscall number */ movl %ebx, %edi /* arg1 */ -- cgit v1.1 From 539f5113650068ba221197f190267ab727296ef5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Jun 2015 12:36:01 -0700 Subject: x86/asm/entry/64: Disentangle error_entry/exit gsbase/ebx/usermode code The error_entry/error_exit code to handle gsbase and whether we return to user mode was a mess: - error_sti was misnamed. In particular, it did not enable interrupts. - Error handling for gs_change was hopelessly tangled with the normal usermode path. Separate it out. This saves a branch in normal entries from kernel mode. - The comments were bad. Fix it up. As a nice side effect, there's now a code path that happens on error entries from user mode. We'll use it soon. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f1be898ab93360169fb845ab85185948832209ee.1433878454.git.luto@kernel.org [ Prettified it, clarified comments some more. ] Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index bd97161..3bb2c43 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1135,7 +1135,7 @@ END(paranoid_exit) /* * Save all registers in pt_regs, and switch gs if needed.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) cld @@ -1144,9 +1144,11 @@ ENTRY(error_entry) xorl %ebx, %ebx testb $3, CS+8(%rsp) jz error_kernelspace -error_swapgs: + + /* We entered from user mode */ SWAPGS -error_sti: + +error_entry_done: TRACE_IRQS_OFF ret @@ -1165,8 +1167,15 @@ error_kernelspace: cmpq %rax, RIP+8(%rsp) je bstep_iret cmpq $gs_change, RIP+8(%rsp) - je error_swapgs - jmp error_sti + jne error_entry_done + + /* + * hack: gs_change can fail with user gsbase. If this happens, fix up + * gsbase and proceed. We'll fix up the exception and land in + * gs_change's error handler with kernel gsbase. + */ + SWAPGS + jmp error_entry_done bstep_iret: /* Fix truncated RIP */ @@ -1174,16 +1183,30 @@ bstep_iret: /* fall through */ error_bad_iret: + /* + * We came from an IRET to user mode, so we have user gsbase. + * Switch to kernel gsbase: + */ SWAPGS + + /* + * Pretend that the exception came from user mode: set up pt_regs + * as if we faulted immediately after IRET and clear EBX so that + * error_exit knows that we will be returning to user mode. + */ mov %rsp, %rdi call fixup_bad_iret mov %rax, %rsp - decl %ebx /* Return to usergs */ - jmp error_sti + decl %ebx + jmp error_entry_done END(error_entry) -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ +/* + * On entry, EBX is a "return to kernel mode" flag: + * 1: already in kernel mode, don't need SWAPGS + * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode + */ ENTRY(error_exit) movl %ebx, %eax RESTORE_EXTRA_REGS -- cgit v1.1
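The EBX convention that this last patch documents can be hard to follow in the assembly, so here is a small C model of the error_entry/error_exit contract. This is only an illustrative sketch, not kernel code: the function and parameter names (model_error_entry, model_error_exit, from_user and so on) are invented for the example, and the boolean return value merely stands in for the EBX flag set up by the assembly above.

#include <stdbool.h>
#include <stdio.h>

/*
 * Rough model of error_entry(); the return value plays the role of EBX:
 *   true  (EBX = 1): already on kernel gsbase, no SWAPGS needed on exit
 *   false (EBX = 0): user-mode return path, error_exit must SWAPGS
 * The three flags approximate the CS and RIP checks in the real code.
 */
static bool model_error_entry(bool from_user, bool faulted_on_iret,
                              bool faulted_in_gs_change)
{
        if (from_user) {
                /* SWAPGS: switch to kernel gsbase now */
                return false;
        }
        if (faulted_on_iret) {
                /* SWAPGS + fixup_bad_iret(): pretend we came from user mode */
                return false;
        }
        if (faulted_in_gs_change) {
                /* SWAPGS: fix up gsbase, then treat it as a kernel fault */
                return true;
        }
        /* ordinary kernel-mode exception: gsbase is already the kernel's */
        return true;
}

/* Rough model of error_exit(), which only consults the EBX flag */
static void model_error_exit(bool ebx)
{
        if (ebx)
                puts("retint_kernel: return to kernel mode, no SWAPGS");
        else
                puts("retint_user: SWAPGS and return to user mode");
}

int main(void)
{
        /* a page fault taken in user space */
        model_error_exit(model_error_entry(true, false, false));
        /* a fault raised by IRET on the way back to user space */
        model_error_exit(model_error_entry(false, true, false));
        return 0;
}

The real entry code can carry this flag in EBX across the call to the C fault handler because RBX is callee-saved in the 64-bit SysV ABI, which is why error_exit can simply test it on return.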