From e6e5494cb23d1933735ee47cc674ffe1c4afed6f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:53:50 -0700 Subject: [PATCH] vdso: randomize the i386 vDSO by moving it into a vma Move the i386 VDSO down into a vma and thus randomize it. Besides the security implications, this feature also helps debuggers, which can COW a vma-backed VDSO just like a normal DSO and can thus do single-stepping and other debugging features. It's good for hypervisors (Xen, VMWare) too, which typically live in the same high-mapped address space as the VDSO, hence whenever the VDSO is used, they get lots of guest pagefaults and have to fix such guest accesses up - which slows things down instead of speeding things up (the primary purpose of the VDSO). There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support for older glibcs that still rely on a prelinked high-mapped VDSO. Newer distributions (using glibc 2.3.3 or later) can turn this option off. Turning it off is also recommended for security reasons: attackers cannot use the predictable high-mapped VDSO page as syscall trampoline anymore. There is a new vdso=[0|1] boot option as well, and a runtime /proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned on/off. (This version of the VDSO-randomization patch also has working ELF coredumping, the previous patch crashed in the coredumping code.) This code is a combined work of the exec-shield VDSO randomization code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell started this patch and i completed it. [akpm@osdl.org: cleanups] [akpm@osdl.org: compile fix] [akpm@osdl.org: compile fix 2] [akpm@osdl.org: compile fix 3] [akpm@osdl.org: revernt MAXMEM change] Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Cc: Gerd Hoffmann Cc: Rusty Russell Cc: Zachary Amsden Cc: Andi Kleen Cc: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 11 +++ arch/i386/kernel/asm-offsets.c | 4 +- arch/i386/kernel/entry.S | 7 +- arch/i386/kernel/signal.c | 4 +- arch/i386/kernel/sysenter.c | 128 +++++++++++++++++++++++++++++++++-- arch/i386/kernel/vsyscall-sysenter.S | 4 +- arch/i386/kernel/vsyscall.lds.S | 4 +- 7 files changed, 150 insertions(+), 12 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 6662f8c..3bb221d 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -780,6 +780,17 @@ config HOTPLUG_CPU enable suspend on SMP systems. CPUs can be controlled through /sys/devices/system/cpu. +config COMPAT_VDSO + bool "Compat VDSO support" + default y + help + Map the VDSO to the predictable old-style address too. + ---help--- + Say N here if you are running a sufficiently recent glibc + version (2.3.3 or later), to remove the high-mapped + VDSO mapping and to exclusively use the randomized VDSO. + + If unsure, say Y. endmenu diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 1c3a809..c80271f 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -14,6 +14,7 @@ #include #include #include +#include #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -54,6 +55,7 @@ void foo(void) OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); + OFFSET(TI_sysenter_return, thread_info, sysenter_return); BLANK(); OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); @@ -69,7 +71,7 @@ void foo(void) sizeof(struct tss_struct)); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL)); + DEFINE(VDSO_PRELINK, VDSO_PRELINK); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index e8d2630..fbdb933 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -270,7 +270,12 @@ sysenter_past_esp: pushl $(__USER_CS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET cs, 0*/ - pushl $SYSENTER_RETURN + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. + */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eip, 0 diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 5c352c3..43002cf 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -351,7 +351,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, goto give_sigsegv; } - restorer = &__kernel_sigreturn; + restorer = (void *)VDSO_SYM(&__kernel_sigreturn); if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; @@ -447,7 +447,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Set up to return from userspace. */ - restorer = &__kernel_rt_sigreturn; + restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn); if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; err |= __put_user(restorer, &frame->pretcode); diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 0bada18..c60419de 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -2,6 +2,8 @@ * linux/arch/i386/kernel/sysenter.c * * (C) Copyright 2002 Linus Torvalds + * Portions based on the vdso-randomization code from exec-shield: + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar * * This file contains the needed initializations to support sysenter. */ @@ -13,12 +15,31 @@ #include #include #include +#include +#include #include #include #include #include +/* + * Should the kernel map a VDSO page into processes and pass its + * address down to glibc upon exec()? + */ +unsigned int __read_mostly vdso_enabled = 1; + +EXPORT_SYMBOL_GPL(vdso_enabled); + +static int __init vdso_setup(char *s) +{ + vdso_enabled = simple_strtoul(s, NULL, 0); + + return 1; +} + +__setup("vdso=", vdso_setup); + extern asmlinkage void sysenter_entry(void); void enable_sep_cpu(void) @@ -45,23 +66,122 @@ void enable_sep_cpu(void) */ extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; +static void *syscall_page; int __init sysenter_setup(void) { - void *page = (void *)get_zeroed_page(GFP_ATOMIC); + syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); - __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC); +#ifdef CONFIG_COMPAT_VDSO + __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY); + printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); +#else + /* + * In the non-compat case the ELF coredumping code needs the fixmap: + */ + __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_KERNEL_RO); +#endif if (!boot_cpu_has(X86_FEATURE_SEP)) { - memcpy(page, + memcpy(syscall_page, &vsyscall_int80_start, &vsyscall_int80_end - &vsyscall_int80_start); return 0; } - memcpy(page, + memcpy(syscall_page, &vsyscall_sysenter_start, &vsyscall_sysenter_end - &vsyscall_sysenter_start); return 0; } + +static struct page *syscall_nopage(struct vm_area_struct *vma, + unsigned long adr, int *type) +{ + struct page *p = virt_to_page(adr - vma->vm_start + syscall_page); + get_page(p); + return p; +} + +/* Prevent VMA merging */ +static void syscall_vma_close(struct vm_area_struct *vma) +{ +} + +static struct vm_operations_struct syscall_vm_ops = { + .close = syscall_vma_close, + .nopage = syscall_nopage, +}; + +/* Defined in vsyscall-sysenter.S */ +extern void SYSENTER_RETURN; + +/* Setup a VMA at program startup for the vsyscall page */ +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret; + + down_write(&mm->mmap_sem); + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) { + ret = -ENOMEM; + goto up_fail; + } + + vma->vm_start = addr; + vma->vm_end = addr + PAGE_SIZE; + /* MAYWRITE to allow gdb to COW and set breakpoints */ + vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE; + vma->vm_flags |= mm->def_flags; + vma->vm_page_prot = protection_map[vma->vm_flags & 7]; + vma->vm_ops = &syscall_vm_ops; + vma->vm_mm = mm; + + ret = insert_vm_struct(mm, vma); + if (ret) + goto free_vma; + + current->mm->context.vdso = (void *)addr; + current_thread_info()->sysenter_return = + (void *)VDSO_SYM(&SYSENTER_RETURN); + mm->total_vm++; +up_fail: + up_write(&mm->mmap_sem); + return ret; + +free_vma: + kmem_cache_free(vm_area_cachep, vma); + return ret; +} + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) + return "[vdso]"; + return NULL; +} + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ + return NULL; +} + +int in_gate_area(struct task_struct *task, unsigned long addr) +{ + return 0; +} + +int in_gate_area_no_task(unsigned long addr) +{ + return 0; +} diff --git a/arch/i386/kernel/vsyscall-sysenter.S b/arch/i386/kernel/vsyscall-sysenter.S index 3b62baa..1a36d26 100644 --- a/arch/i386/kernel/vsyscall-sysenter.S +++ b/arch/i386/kernel/vsyscall-sysenter.S @@ -42,10 +42,10 @@ __kernel_vsyscall: /* 7: align return point with nop's to make disassembly easier */ .space 7,0x90 - /* 14: System call restart point is here! (SYSENTER_RETURN - 2) */ + /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ jmp .Lenter_kernel /* 16: System call normal return point is here! */ - .globl SYSENTER_RETURN /* Symbol used by entry.S. */ + .globl SYSENTER_RETURN /* Symbol used by sysenter.c */ SYSENTER_RETURN: pop %ebp .Lpop_ebp: diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S index 98699ca..e26975f 100644 --- a/arch/i386/kernel/vsyscall.lds.S +++ b/arch/i386/kernel/vsyscall.lds.S @@ -7,7 +7,7 @@ SECTIONS { - . = VSYSCALL_BASE + SIZEOF_HEADERS; + . = VDSO_PRELINK + SIZEOF_HEADERS; .hash : { *(.hash) } :text .dynsym : { *(.dynsym) } @@ -20,7 +20,7 @@ SECTIONS For the layouts to match, we need to skip more than enough space for the dynamic symbol table et al. If this amount is insufficient, ld -shared will barf. Just increase it here. */ - . = VSYSCALL_BASE + 0x400; + . = VDSO_PRELINK + 0x400; .text : { *(.text) } :text =0x90909090 .note : { *(.note.*) } :text :note -- cgit v1.1