diff options
author | Andy Lutomirski <luto@kernel.org> | 2015-04-26 16:47:59 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-26 17:57:38 -0700 |
commit | 61f01dd941ba9e06d2bf05994450ecc3d61b6b8b (patch) | |
tree | 3a4a4ef2d5e4d44bb4cb1708f9fc4749e9c9824a /arch/x86/kernel | |
parent | 1190944f4b12203330ac5ed8784f6c181bf26f2d (diff) | |
download | op-kernel-dev-61f01dd941ba9e06d2bf05994450ecc3d61b6b8b.zip op-kernel-dev-61f01dd941ba9e06d2bf05994450ecc3d61b6b8b.tar.gz |
x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue
AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET with
SS == 0 results in an invalid usermode state in which SS is apparently
equal to __USER_DS but causes #SS if used.
Work around the issue by setting SS to __KERNEL_DS __switch_to, thus
ensuring that SYSRET never happens with SS set to NULL.
This was exposed by a recent vDSO cleanup.
Fixes: e7d6eefaaa44 x86/vdso32/syscall.S: Do not load __USER32_DS to %ss
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/cpu/amd.c | 3 | ||||
-rw-r--r-- | arch/x86/kernel/entry_64.S | 9 | ||||
-rw-r--r-- | arch/x86/kernel/process_64.c | 28 |
3 files changed, 40 insertions, 0 deletions
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index fd470eb..e4cf633 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -720,6 +720,9 @@ static void init_amd(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); + + /* AMD CPUs don't reset SS attributes on SYSRET */ + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c7b2384..02c2eff 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -295,6 +295,15 @@ system_call_fastpath: * rflags from r11 (but RF and VM bits are forced to 0), * cs and ss are loaded from MSRs. * Restoration of rflags re-enables interrupts. + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we should + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. (Actually + * detecting the failure in 64-bit userspace is tricky but can be + * done.) */ USERGS_SYSRET64 diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4baaa97..ddfdbf7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -419,6 +419,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); + if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { + /* + * AMD CPUs have a misfeature: SYSRET sets the SS selector but + * does not update the cached descriptor. As a result, if we + * do SYSRET while SS is NULL, we'll end up in user mode with + * SS apparently equal to __USER_DS but actually unusable. + * + * The straightforward workaround would be to fix it up just + * before SYSRET, but that would slow down the system call + * fast paths. Instead, we ensure that SS is never NULL in + * system call context. We do this by replacing NULL SS + * selectors at every context switch. SYSCALL sets up a valid + * SS, so the only way to get NULL is to re-enter the kernel + * from CPL 3 through an interrupt. Since that can't happen + * in the same task as a running syscall, we are guaranteed to + * context switch between every interrupt vector entry and a + * subsequent SYSRET. + * + * We read SS first because SS reads are much faster than + * writes. Out of caution, we force SS to __KERNEL_DS even if + * it previously had a different non-NULL value. + */ + unsigned short ss_sel; + savesegment(ss, ss_sel); + if (ss_sel != __KERNEL_DS) + loadsegment(ss, __KERNEL_DS); + } + return prev_p; } |