From 312f06f761f7362e153ed5a1a9c49e17294e52b5 Mon Sep 17 00:00:00 2001
From: gordon
Date: Wed, 14 Mar 2018 04:00:00 +0000
Subject: Add mitigations for two classes of speculative execution
 vulnerabilities on amd64. [FreeBSD-SA-18:03.speculative_execution]

Approved by:	so
Security:	FreeBSD-SA-18:03.speculative_execution
Security:	CVE-2017-5715
Security:	CVE-2017-5754
---
 UPDATING                                  |   5 +
 sys/amd64/amd64/apic_vector.S             | 172 ++++-----
 sys/amd64/amd64/atpic_vector.S            |  53 ++-
 sys/amd64/amd64/cpu_switch.S              |  13 +-
 sys/amd64/amd64/db_trace.c                |   1 +
 sys/amd64/amd64/exception.S               | 562 ++++++++++++++++++++---------
 sys/amd64/amd64/genassym.c                |  18 +
 sys/amd64/amd64/initcpu.c                 |   1 +
 sys/amd64/amd64/machdep.c                 | 144 +++++---
 sys/amd64/amd64/mp_machdep.c              | 235 ++++++++++--
 sys/amd64/amd64/pmap.c                    | 576 ++++++++++++++++++++++++++++--
 sys/amd64/amd64/support.S                 | 113 ++++++
 sys/amd64/amd64/sys_machdep.c             |  30 +-
 sys/amd64/amd64/trap.c                    |  35 +-
 sys/amd64/amd64/vm_machdep.c              |   2 +
 sys/amd64/ia32/ia32_exception.S           |  17 +-
 sys/amd64/ia32/ia32_syscall.c             |   9 +-
 sys/amd64/include/asmacros.h              | 185 +++++---
 sys/amd64/include/frame.h                 |  48 ++-
 sys/amd64/include/intr_machdep.h          |   2 +-
 sys/amd64/include/md_var.h                |   8 +
 sys/amd64/include/pcb.h                   |   2 +-
 sys/amd64/include/pcpu.h                  |   9 +-
 sys/amd64/include/pmap.h                  |  12 +
 sys/amd64/include/smp.h                   |  28 +-
 sys/amd64/vmm/intel/vmx.c                 |   3 +-
 sys/amd64/vmm/vmm.c                       |   4 +-
 sys/conf/Makefile.amd64                   |   1 +
 sys/conf/newvers.sh                       |   2 +-
 sys/dev/cpuctl/cpuctl.c                   |  31 +-
 sys/dev/hyperv/vmbus/amd64/vmbus_vector.S |   7 +-
 sys/dev/hyperv/vmbus/i386/vmbus_vector.S  |   1 +
 sys/dev/hyperv/vmbus/vmbus.c              |   6 +-
 sys/i386/i386/apic_vector.s               |   4 +
 sys/i386/i386/atpic_vector.s              |   1 +
 sys/i386/i386/exception.s                 |   1 +
 sys/i386/i386/machdep.c                   |   2 +-
 sys/i386/i386/pmap.c                      |   6 +-
 sys/i386/i386/support.s                   |   8 +
 sys/i386/i386/vm_machdep.c                |   2 +-
 sys/sys/cpuctl.h                          |   1 +
 sys/x86/include/apicvar.h                 |   6 +-
 sys/x86/include/specialreg.h              |  25 ++
 sys/x86/include/x86_smp.h                 |   5 +-
 sys/x86/include/x86_var.h                 |  11 +-
 sys/x86/isa/atpic.c                       |  17 +-
 sys/x86/x86/cpu_machdep.c                 |  52 +++
 sys/x86/x86/identcpu.c                    | 110 ++++--
 sys/x86/x86/local_apic.c                  |  40 ++-
 sys/x86/x86/mp_x86.c                      |  11 +-
 sys/x86/xen/pv.c                          |   3 +
 usr.sbin/cpucontrol/cpucontrol.8          |  32 +-
 usr.sbin/cpucontrol/cpucontrol.c          |  60 +++-
 53 files changed, 2141 insertions(+), 591 deletions(-)

diff --git a/UPDATING b/UPDATING
index d8eadbe..a754cca 100644
--- a/UPDATING
+++ b/UPDATING
@@ -16,6 +16,11 @@ from older versions of FreeBSD, try WITHOUT_CLANG and WITH_GCC to bootstrap
 to the tip of head, and then rebuild without this option. The bootstrap
 process from older version of current across the gcc/clang cutover is a bit
 fragile.
+20180314	p8	FreeBSD-SA-18:03.speculative_execution
+
+	Add mitigations for two classes of speculative execution vulnerabilities
+	on amd64.
+
 20180307	p7	FreeBSD-SA-18:01.ipsec
 		FreeBSD-SA-18:02.ntp
 		FreeBSD-EN-18:01.tzdata

diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index b3ca520..ea93d32 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -2,6 +2,12 @@
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2014-2018 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by
+ * Konstantin Belousov under sponsorship from
+ * the FreeBSD Foundation.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -38,12 +44,12 @@ #include "opt_smp.h" +#include "assym.s" + #include #include #include -#include "assym.s" - #ifdef SMP #define LK lock ; #else @@ -73,30 +79,28 @@ as_lapic_eoi: * translates that into a vector, and passes the vector to the * lapic_handle_intr() function. */ -#define ISR_VEC(index, vec_name) \ - .text ; \ - SUPERALIGN_TEXT ; \ -IDTVEC(vec_name) ; \ - PUSH_FRAME ; \ - FAKE_MCOUNT(TF_RIP(%rsp)) ; \ - cmpl $0,x2apic_mode ; \ - je 1f ; \ - movl $(MSR_APIC_ISR0 + index),%ecx ; \ - rdmsr ; \ - jmp 2f ; \ -1: ; \ - movq lapic_map, %rdx ; /* pointer to local APIC */ \ - movl LA_ISR + 16 * (index)(%rdx), %eax ; /* load ISR */ \ -2: ; \ - bsrl %eax, %eax ; /* index of highest set bit in ISR */ \ - jz 3f ; \ - addl $(32 * index),%eax ; \ - movq %rsp, %rsi ; \ - movl %eax, %edi ; /* pass the IRQ */ \ - call lapic_handle_intr ; \ -3: ; \ - MEXITCOUNT ; \ + .macro ISR_VEC index, vec_name + INTR_HANDLER \vec_name + FAKE_MCOUNT(TF_RIP(%rsp)) + cmpl $0,x2apic_mode + je 1f + movl $(MSR_APIC_ISR0 + \index),%ecx + rdmsr + jmp 2f +1: + movq lapic_map, %rdx /* pointer to local APIC */ + movl LA_ISR + 16 * (\index)(%rdx), %eax /* load ISR */ +2: + bsrl %eax, %eax /* index of highest set bit in ISR */ + jz 3f + addl $(32 * \index),%eax + movq %rsp, %rsi + movl %eax, %edi /* pass the IRQ */ + call lapic_handle_intr +3: + MEXITCOUNT jmp doreti + .endm /* * Handle "spurious INTerrupts". @@ -108,26 +112,21 @@ IDTVEC(vec_name) ; \ .text SUPERALIGN_TEXT IDTVEC(spuriousint) - /* No EOI cycle used here */ - jmp doreti_iret - ISR_VEC(1, apic_isr1) - ISR_VEC(2, apic_isr2) - ISR_VEC(3, apic_isr3) - ISR_VEC(4, apic_isr4) - ISR_VEC(5, apic_isr5) - ISR_VEC(6, apic_isr6) - ISR_VEC(7, apic_isr7) + ISR_VEC 1, apic_isr1 + ISR_VEC 2, apic_isr2 + ISR_VEC 3, apic_isr3 + ISR_VEC 4, apic_isr4 + ISR_VEC 5, apic_isr5 + ISR_VEC 6, apic_isr6 + ISR_VEC 7, apic_isr7 /* * Local APIC periodic timer handler. */ - .text - SUPERALIGN_TEXT -IDTVEC(timerint) - PUSH_FRAME + INTR_HANDLER timerint FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi call lapic_handle_timer @@ -137,10 +136,7 @@ IDTVEC(timerint) /* * Local APIC CMCI handler. */ - .text - SUPERALIGN_TEXT -IDTVEC(cmcint) - PUSH_FRAME + INTR_HANDLER cmcint FAKE_MCOUNT(TF_RIP(%rsp)) call lapic_handle_cmc MEXITCOUNT @@ -149,10 +145,7 @@ IDTVEC(cmcint) /* * Local APIC error interrupt handler. */ - .text - SUPERALIGN_TEXT -IDTVEC(errorint) - PUSH_FRAME + INTR_HANDLER errorint FAKE_MCOUNT(TF_RIP(%rsp)) call lapic_handle_error MEXITCOUNT @@ -163,10 +156,7 @@ IDTVEC(errorint) * Xen event channel upcall interrupt handler. * Only used when the hypervisor supports direct vector callbacks. 
*/ - .text - SUPERALIGN_TEXT -IDTVEC(xen_intr_upcall) - PUSH_FRAME + INTR_HANDLER xen_intr_upcall FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi call xen_intr_handle_upcall @@ -183,74 +173,68 @@ IDTVEC(xen_intr_upcall) SUPERALIGN_TEXT invltlb_ret: call as_lapic_eoi - POP_FRAME - jmp doreti_iret + jmp ld_regs SUPERALIGN_TEXT -IDTVEC(invltlb) - PUSH_FRAME - + INTR_HANDLER invltlb call invltlb_handler jmp invltlb_ret -IDTVEC(invltlb_pcid) - PUSH_FRAME - + INTR_HANDLER invltlb_pcid call invltlb_pcid_handler jmp invltlb_ret -IDTVEC(invltlb_invpcid) - PUSH_FRAME - + INTR_HANDLER invltlb_invpcid_nopti call invltlb_invpcid_handler jmp invltlb_ret + INTR_HANDLER invltlb_invpcid_pti + call invltlb_invpcid_pti_handler + jmp invltlb_ret + /* * Single page TLB shootdown */ - .text + INTR_HANDLER invlpg + call invlpg_handler + jmp invltlb_ret - SUPERALIGN_TEXT -IDTVEC(invlpg) - PUSH_FRAME + INTR_HANDLER invlpg_invpcid + call invlpg_invpcid_handler + jmp invltlb_ret - call invlpg_handler + INTR_HANDLER invlpg_pcid + call invlpg_pcid_handler jmp invltlb_ret /* * Page range TLB shootdown. */ - .text - SUPERALIGN_TEXT -IDTVEC(invlrng) - PUSH_FRAME - + INTR_HANDLER invlrng call invlrng_handler jmp invltlb_ret + INTR_HANDLER invlrng_invpcid + call invlrng_invpcid_handler + jmp invltlb_ret + + INTR_HANDLER invlrng_pcid + call invlrng_pcid_handler + jmp invltlb_ret + /* * Invalidate cache. */ - .text - SUPERALIGN_TEXT -IDTVEC(invlcache) - PUSH_FRAME - + INTR_HANDLER invlcache call invlcache_handler jmp invltlb_ret /* * Handler for IPIs sent via the per-cpu IPI bitmap. */ - .text - SUPERALIGN_TEXT -IDTVEC(ipi_intr_bitmap_handler) - PUSH_FRAME - + INTR_HANDLER ipi_intr_bitmap_handler call as_lapic_eoi - FAKE_MCOUNT(TF_RIP(%rsp)) - call ipi_bitmap_handler MEXITCOUNT jmp doreti @@ -258,24 +242,15 @@ IDTVEC(ipi_intr_bitmap_handler) /* * Executed by a CPU when it receives an IPI_STOP from another CPU. */ - .text - SUPERALIGN_TEXT -IDTVEC(cpustop) - PUSH_FRAME - + INTR_HANDLER cpustop call as_lapic_eoi - call cpustop_handler jmp doreti /* * Executed by a CPU when it receives an IPI_SUSPEND from another CPU. */ - .text - SUPERALIGN_TEXT -IDTVEC(cpususpend) - PUSH_FRAME - + INTR_HANDLER cpususpend call cpususpend_handler call as_lapic_eoi jmp doreti @@ -285,10 +260,7 @@ IDTVEC(cpususpend) * * - Calls the generic rendezvous action function. */ - .text - SUPERALIGN_TEXT -IDTVEC(rendezvous) - PUSH_FRAME + INTR_HANDLER rendezvous #ifdef COUNT_IPIS movl PCPU(CPUID), %eax movq ipi_rendezvous_counts(,%rax,8), %rax @@ -328,4 +300,8 @@ IDTVEC(justreturn) popq %rax jmp doreti_iret + INTR_HANDLER justreturn1 + call as_lapic_eoi + jmp doreti + #endif /* SMP */ diff --git a/sys/amd64/amd64/atpic_vector.S b/sys/amd64/amd64/atpic_vector.S index e7dcbc3..0cc0cd4 100644 --- a/sys/amd64/amd64/atpic_vector.S +++ b/sys/amd64/amd64/atpic_vector.S @@ -36,38 +36,35 @@ * master and slave interrupt controllers. */ -#include - #include "assym.s" +#include /* * Macros for interrupt entry, call to handler, and exit. 
*/ -#define INTR(irq_num, vec_name) \ - .text ; \ - SUPERALIGN_TEXT ; \ -IDTVEC(vec_name) ; \ - PUSH_FRAME ; \ - FAKE_MCOUNT(TF_RIP(%rsp)) ; \ - movq %rsp, %rsi ; \ - movl $irq_num, %edi; /* pass the IRQ */ \ - call atpic_handle_intr ; \ - MEXITCOUNT ; \ + .macro INTR irq_num, vec_name + INTR_HANDLER \vec_name + FAKE_MCOUNT(TF_RIP(%rsp)) + movq %rsp, %rsi + movl $\irq_num, %edi /* pass the IRQ */ + call atpic_handle_intr + MEXITCOUNT jmp doreti + .endm - INTR(0, atpic_intr0) - INTR(1, atpic_intr1) - INTR(2, atpic_intr2) - INTR(3, atpic_intr3) - INTR(4, atpic_intr4) - INTR(5, atpic_intr5) - INTR(6, atpic_intr6) - INTR(7, atpic_intr7) - INTR(8, atpic_intr8) - INTR(9, atpic_intr9) - INTR(10, atpic_intr10) - INTR(11, atpic_intr11) - INTR(12, atpic_intr12) - INTR(13, atpic_intr13) - INTR(14, atpic_intr14) - INTR(15, atpic_intr15) + INTR 0, atpic_intr0 + INTR 1, atpic_intr1 + INTR 2, atpic_intr2 + INTR 3, atpic_intr3 + INTR 4, atpic_intr4 + INTR 5, atpic_intr5 + INTR 6, atpic_intr6 + INTR 7, atpic_intr7 + INTR 8, atpic_intr8 + INTR 9, atpic_intr9 + INTR 10, atpic_intr10 + INTR 11, atpic_intr11 + INTR 12, atpic_intr12 + INTR 13, atpic_intr13 + INTR 14, atpic_intr14 + INTR 15, atpic_intr15 diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S index 64a3485..ff47afb 100644 --- a/sys/amd64/amd64/cpu_switch.S +++ b/sys/amd64/amd64/cpu_switch.S @@ -191,9 +191,11 @@ do_kthread: done_tss: movq %r8,PCPU(RSP0) movq %r8,PCPU(CURPCB) - /* Update the TSS_RSP0 pointer for the next interrupt */ + /* Update the COMMON_TSS_RSP0 pointer for the next interrupt */ + cmpb $0,pti(%rip) + jne 1f movq %r8,COMMON_TSS_RSP0(%rdx) - movq %r12,PCPU(CURTHREAD) /* into next thread */ +1: movq %r12,PCPU(CURTHREAD) /* into next thread */ /* Test if debug registers should be restored. */ testl $PCB_DBREGS,PCB_FLAGS(%r8) @@ -270,7 +272,12 @@ do_tss: movq %rdx,PCPU(TSSP) shrq $8,%rcx movl %ecx,8(%rax) movb $0x89,5(%rax) /* unset busy */ - movl $TSSSEL,%eax + cmpb $0,pti(%rip) + je 1f + movq PCPU(PRVSPACE),%rax + addq $PC_PTI_STACK+PC_PTI_STACK_SZ*8,%rax + movq %rax,COMMON_TSS_RSP0(%rdx) +1: movl $TSSSEL,%eax ltr %ax jmp done_tss diff --git a/sys/amd64/amd64/db_trace.c b/sys/amd64/amd64/db_trace.c index 381237b..d15d207 100644 --- a/sys/amd64/amd64/db_trace.c +++ b/sys/amd64/amd64/db_trace.c @@ -200,6 +200,7 @@ db_nextframe(struct amd64_frame **fp, db_addr_t *ip, struct thread *td) if (name != NULL) { if (strcmp(name, "calltrap") == 0 || strcmp(name, "fork_trampoline") == 0 || + strcmp(name, "mchk_calltrap") == 0 || strcmp(name, "nmi_calltrap") == 0 || strcmp(name, "Xdblfault") == 0) frame_type = TRAP; diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 2c2b99b..524e729 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -1,12 +1,16 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. - * Copyright (c) 2007 The FreeBSD Foundation + * Copyright (c) 2007-2018 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * + * Portions of this software were developed by + * Konstantin Belousov under sponsorship from + * the FreeBSD Foundation. 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -38,13 +42,13 @@ #include "opt_compat.h" #include "opt_hwpmc_hooks.h" +#include "assym.s" + #include #include #include #include -#include "assym.s" - #ifdef KDTRACE_HOOKS .bss .globl dtrace_invop_jump_addr @@ -100,68 +104,61 @@ dtrace_invop_calltrap_addr: MCOUNT_LABEL(user) MCOUNT_LABEL(btrap) -/* Traps that we leave interrupts disabled for.. */ -#define TRAP_NOEN(a) \ - subq $TF_RIP,%rsp; \ - movl $(a),TF_TRAPNO(%rsp) ; \ - movq $0,TF_ADDR(%rsp) ; \ - movq $0,TF_ERR(%rsp) ; \ +/* Traps that we leave interrupts disabled for. */ + .macro TRAP_NOEN l, trapno + PTI_ENTRY \l,X\l + .globl X\l + .type X\l,@function +X\l: subq $TF_RIP,%rsp + movl $\trapno,TF_TRAPNO(%rsp) + movq $0,TF_ADDR(%rsp) + movq $0,TF_ERR(%rsp) jmp alltraps_noen -IDTVEC(dbg) - TRAP_NOEN(T_TRCTRAP) -IDTVEC(bpt) - TRAP_NOEN(T_BPTFLT) + .endm + + TRAP_NOEN dbg, T_TRCTRAP + TRAP_NOEN bpt, T_BPTFLT #ifdef KDTRACE_HOOKS -IDTVEC(dtrace_ret) - TRAP_NOEN(T_DTRACE_RET) + TRAP_NOEN dtrace_ret, T_DTRACE_RET #endif /* Regular traps; The cpu does not supply tf_err for these. */ -#define TRAP(a) \ - subq $TF_RIP,%rsp; \ - movl $(a),TF_TRAPNO(%rsp) ; \ - movq $0,TF_ADDR(%rsp) ; \ - movq $0,TF_ERR(%rsp) ; \ + .macro TRAP l, trapno + PTI_ENTRY \l,X\l + .globl X\l + .type X\l,@function +X\l: + subq $TF_RIP,%rsp + movl $\trapno,TF_TRAPNO(%rsp) + movq $0,TF_ADDR(%rsp) + movq $0,TF_ERR(%rsp) jmp alltraps -IDTVEC(div) - TRAP(T_DIVIDE) -IDTVEC(ofl) - TRAP(T_OFLOW) -IDTVEC(bnd) - TRAP(T_BOUND) -IDTVEC(ill) - TRAP(T_PRIVINFLT) -IDTVEC(dna) - TRAP(T_DNA) -IDTVEC(fpusegm) - TRAP(T_FPOPFLT) -IDTVEC(mchk) - TRAP(T_MCHK) -IDTVEC(rsvd) - TRAP(T_RESERVED) -IDTVEC(fpu) - TRAP(T_ARITHTRAP) -IDTVEC(xmm) - TRAP(T_XMMFLT) - -/* This group of traps have tf_err already pushed by the cpu */ -#define TRAP_ERR(a) \ - subq $TF_ERR,%rsp; \ - movl $(a),TF_TRAPNO(%rsp) ; \ - movq $0,TF_ADDR(%rsp) ; \ + .endm + + TRAP div, T_DIVIDE + TRAP ofl, T_OFLOW + TRAP bnd, T_BOUND + TRAP ill, T_PRIVINFLT + TRAP dna, T_DNA + TRAP fpusegm, T_FPOPFLT + TRAP rsvd, T_RESERVED + TRAP fpu, T_ARITHTRAP + TRAP xmm, T_XMMFLT + +/* This group of traps have tf_err already pushed by the cpu. */ + .macro TRAP_ERR l, trapno + PTI_ENTRY \l,X\l,has_err=1 + .globl X\l + .type X\l,@function +X\l: + subq $TF_ERR,%rsp + movl $\trapno,TF_TRAPNO(%rsp) + movq $0,TF_ADDR(%rsp) jmp alltraps -IDTVEC(tss) - TRAP_ERR(T_TSSFLT) -IDTVEC(missing) - subq $TF_ERR,%rsp - movl $T_SEGNPFLT,TF_TRAPNO(%rsp) - jmp prot_addrf -IDTVEC(stk) - subq $TF_ERR,%rsp - movl $T_STKFLT,TF_TRAPNO(%rsp) - jmp prot_addrf -IDTVEC(align) - TRAP_ERR(T_ALIGNFLT) + .endm + + TRAP_ERR tss, T_TSSFLT + TRAP_ERR align, T_ALIGNFLT /* * alltraps entry point. Use swapgs if this is the first time in the @@ -174,25 +171,24 @@ IDTVEC(align) alltraps: movq %rdi,TF_RDI(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? 
*/ - jz alltraps_testi /* already running with kernel GS.base */ + jz 1f /* already running with kernel GS.base */ swapgs movq PCPU(CURPCB),%rdi andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi) - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) -alltraps_testi: - testl $PSL_I,TF_RFLAGS(%rsp) - jz alltraps_pushregs_no_rdi - sti -alltraps_pushregs_no_rdi: - movq %rsi,TF_RSI(%rsp) +1: SAVE_SEGS movq %rdx,TF_RDX(%rsp) + movq %rax,TF_RAX(%rsp) movq %rcx,TF_RCX(%rsp) + testb $SEL_RPL_MASK,TF_CS(%rsp) + jz 2f + call handle_ibrs_entry +2: testl $PSL_I,TF_RFLAGS(%rsp) + jz alltraps_pushregs_no_rax + sti +alltraps_pushregs_no_rax: + movq %rsi,TF_RSI(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) - movq %rax,TF_RAX(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) @@ -248,15 +244,18 @@ calltrap: alltraps_noen: movq %rdi,TF_RDI(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ - jz 1f /* already running with kernel GS.base */ + jz 1f /* already running with kernel GS.base */ swapgs movq PCPU(CURPCB),%rdi andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi) -1: movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) - jmp alltraps_pushregs_no_rdi +1: SAVE_SEGS + movq %rdx,TF_RDX(%rsp) + movq %rax,TF_RAX(%rsp) + movq %rcx,TF_RCX(%rsp) + testb $SEL_RPL_MASK,TF_CS(%rsp) + jz alltraps_pushregs_no_rax + call handle_ibrs_entry + jmp alltraps_pushregs_no_rax IDTVEC(dblfault) subq $TF_ERR,%rsp @@ -278,70 +277,131 @@ IDTVEC(dblfault) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) + SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) cld testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 1f /* already running with kernel GS.base */ swapgs 1: - movq %rsp,%rdi + movq PCPU(KCR3),%rax + cmpq $~0,%rax + je 2f + movq %rax,%cr3 +2: movq %rsp,%rdi call dblfault_handler -2: - hlt - jmp 2b +3: hlt + jmp 3b + ALIGN_TEXT +IDTVEC(page_pti) + testb $SEL_RPL_MASK,PTI_CS-2*8(%rsp) + jz Xpage + swapgs + pushq %rax + pushq %rdx + movq %cr3,%rax + movq %rax,PCPU(SAVED_UCR3) + PTI_UUENTRY has_err=1 + subq $TF_ERR,%rsp + movq %rdi,TF_RDI(%rsp) + movq %rax,TF_RAX(%rsp) + movq %rdx,TF_RDX(%rsp) + movq %rcx,TF_RCX(%rsp) + jmp page_u IDTVEC(page) subq $TF_ERR,%rsp - movl $T_PAGEFLT,TF_TRAPNO(%rsp) - movq %rdi,TF_RDI(%rsp) /* free up a GP register */ + movq %rdi,TF_RDI(%rsp) /* free up GP registers */ + movq %rax,TF_RAX(%rsp) + movq %rdx,TF_RDX(%rsp) + movq %rcx,TF_RCX(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ - jz 1f /* already running with kernel GS.base */ + jz page_cr2 /* already running with kernel GS.base */ swapgs - movq PCPU(CURPCB),%rdi +page_u: movq PCPU(CURPCB),%rdi andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi) -1: movq %cr2,%rdi /* preserve %cr2 before .. */ + movq PCPU(SAVED_UCR3),%rax + movq %rax,PCB_SAVED_UCR3(%rdi) + call handle_ibrs_entry +page_cr2: + movq %cr2,%rdi /* preserve %cr2 before .. */ movq %rdi,TF_ADDR(%rsp) /* enabling interrupts. */ - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) + SAVE_SEGS + movl $T_PAGEFLT,TF_TRAPNO(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) - jz alltraps_pushregs_no_rdi + jz alltraps_pushregs_no_rax sti - jmp alltraps_pushregs_no_rdi + jmp alltraps_pushregs_no_rax /* * We have to special-case this one. If we get a trap in doreti() at * the iretq stage, we'll reenter with the wrong gs state. 
We'll have * to do a special the swapgs in this case even coming from the kernel. * XXX linux has a trap handler for their equivalent of load_gs(). + * + * On the stack, we have the hardware interrupt frame to return + * to usermode (faulted) and another frame with error code, for + * fault. For PTI, copy both frames to the main thread stack. */ -IDTVEC(prot) + .macro PROTF_ENTRY name,trapno +\name\()_pti_doreti: + pushq %rax + pushq %rdx + swapgs + movq PCPU(KCR3),%rax + movq %rax,%cr3 + movq PCPU(RSP0),%rax + subq $2*PTI_SIZE-3*8,%rax /* no err, %rax, %rdx in faulted frame */ + MOVE_STACKS (PTI_SIZE / 4 - 3) + movq %rax,%rsp + popq %rdx + popq %rax + swapgs + jmp X\name +IDTVEC(\name\()_pti) + cmpq $doreti_iret,PTI_RIP-2*8(%rsp) + je \name\()_pti_doreti + testb $SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */ + jz X\name + PTI_UENTRY has_err=1 + swapgs +IDTVEC(\name) subq $TF_ERR,%rsp - movl $T_PROTFLT,TF_TRAPNO(%rsp) + movl $\trapno,TF_TRAPNO(%rsp) + jmp prot_addrf + .endm + + PROTF_ENTRY missing, T_SEGNPFLT + PROTF_ENTRY stk, T_STKFLT + PROTF_ENTRY prot, T_PROTFLT + prot_addrf: movq $0,TF_ADDR(%rsp) movq %rdi,TF_RDI(%rsp) /* free up a GP register */ + movq %rax,TF_RAX(%rsp) + movq %rdx,TF_RDX(%rsp) + movq %rcx,TF_RCX(%rsp) + movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) leaq doreti_iret(%rip),%rdi cmpq %rdi,TF_RIP(%rsp) - je 1f /* kernel but with user gsbase!! */ + je 5f /* kernel but with user gsbase!! */ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ - jz 2f /* already running with kernel GS.base */ -1: swapgs -2: movq PCPU(CURPCB),%rdi + jz 6f /* already running with kernel GS.base */ + swapgs + movq PCPU(CURPCB),%rdi +4: call handle_ibrs_entry orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) /* always full iret from GPF */ - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) movw %es,TF_ES(%rsp) movw %ds,TF_DS(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) - jz alltraps_pushregs_no_rdi + jz alltraps_pushregs_no_rax sti - jmp alltraps_pushregs_no_rdi + jmp alltraps_pushregs_no_rax + +5: swapgs +6: movq PCPU(CURPCB),%rdi + jmp 4b /* * Fast syscall entry point. We enter here with just our new %cs/%ss set, @@ -352,8 +412,18 @@ prot_addrf: * We do not support invoking this from a custom %cs or %ss (e.g. using * entries from an LDT). */ + SUPERALIGN_TEXT +IDTVEC(fast_syscall_pti) + swapgs + movq %rax,PCPU(SCRATCH_RAX) + movq PCPU(KCR3),%rax + movq %rax,%cr3 + jmp fast_syscall_common + SUPERALIGN_TEXT IDTVEC(fast_syscall) swapgs + movq %rax,PCPU(SCRATCH_RAX) +fast_syscall_common: movq %rsp,PCPU(SCRATCH_RSP) movq PCPU(RSP0),%rsp /* Now emulate a trapframe. Make the 8 byte alignment odd for call. 
*/ @@ -363,10 +433,11 @@ IDTVEC(fast_syscall) movq %rcx,TF_RIP(%rsp) /* %rcx original value is in %r10 */ movq PCPU(SCRATCH_RSP),%r11 /* %r11 already saved */ movq %r11,TF_RSP(%rsp) /* user stack pointer */ - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) + movq PCPU(SCRATCH_RAX),%rax + movq %rax,TF_RAX(%rsp) /* syscall number */ + movq %rdx,TF_RDX(%rsp) /* arg 3 */ + SAVE_SEGS + call handle_ibrs_entry movq PCPU(CURPCB),%r11 andl $~PCB_FULL_IRET,PCB_FLAGS(%r11) sti @@ -375,11 +446,9 @@ IDTVEC(fast_syscall) movq $2,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) /* arg 1 */ movq %rsi,TF_RSI(%rsp) /* arg 2 */ - movq %rdx,TF_RDX(%rsp) /* arg 3 */ movq %r10,TF_RCX(%rsp) /* arg 4 */ movq %r8,TF_R8(%rsp) /* arg 5 */ movq %r9,TF_R9(%rsp) /* arg 6 */ - movq %rax,TF_RAX(%rsp) /* syscall number */ movq %rbx,TF_RBX(%rsp) /* C preserved */ movq %rbp,TF_RBP(%rsp) /* C preserved */ movq %r12,TF_R12(%rsp) /* C preserved */ @@ -398,11 +467,12 @@ IDTVEC(fast_syscall) /* Disable interrupts before testing PCB_FULL_IRET. */ cli testl $PCB_FULL_IRET,PCB_FLAGS(%rax) - jnz 3f + jnz 4f /* Check for and handle AST's on return to userland. */ movq PCPU(CURTHREAD),%rax testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax) - jne 2f + jne 3f + call handle_ibrs_exit /* Restore preserved registers. */ MEXITCOUNT movq TF_RDI(%rsp),%rdi /* bonus; preserve arg 1 */ @@ -412,16 +482,21 @@ IDTVEC(fast_syscall) movq TF_RFLAGS(%rsp),%r11 /* original %rflags */ movq TF_RIP(%rsp),%rcx /* original %rip */ movq TF_RSP(%rsp),%rsp /* user stack pointer */ - swapgs + cmpb $0,pti + je 2f + movq PCPU(UCR3),%r9 + movq %r9,%cr3 + xorl %r9d,%r9d +2: swapgs sysretq -2: /* AST scheduled. */ +3: /* AST scheduled. */ sti movq %rsp,%rdi call ast jmp 1b -3: /* Requested full context restore, use doreti for that. */ +4: /* Requested full context restore, use doreti for that. */ MEXITCOUNT jmp doreti @@ -477,17 +552,15 @@ IDTVEC(nmi) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) + SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) cld xorl %ebx,%ebx testb $SEL_RPL_MASK,TF_CS(%rsp) jnz nmi_fromuserspace /* - * We've interrupted the kernel. Preserve GS.base in %r12. + * We've interrupted the kernel. Preserve GS.base in %r12, + * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d. */ movl $MSR_GSBASE,%ecx rdmsr @@ -499,10 +572,32 @@ IDTVEC(nmi) movl %edx,%eax shrq $32,%rdx wrmsr + movq %cr3,%r13 + movq PCPU(KCR3),%rax + cmpq $~0,%rax + je 1f + movq %rax,%cr3 +1: testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) + je nmi_calltrap + movl $MSR_IA32_SPEC_CTRL,%ecx + rdmsr + movl %eax,%r14d + call handle_ibrs_entry jmp nmi_calltrap nmi_fromuserspace: incl %ebx swapgs + movq %cr3,%r13 + movq PCPU(KCR3),%rax + cmpq $~0,%rax + je 1f + movq %rax,%cr3 +1: call handle_ibrs_entry + movq PCPU(CURPCB),%rdi + testq %rdi,%rdi + jz 3f + orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) +3: /* Note: this label is also used by ddb and gdb: */ nmi_calltrap: FAKE_MCOUNT(TF_RIP(%rsp)) @@ -525,26 +620,29 @@ nmi_calltrap: movq PCPU(CURTHREAD),%rax orq %rax,%rax /* curthread present? */ jz nocallchain - testl $TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */ - jz nocallchain /* - * A user callchain is to be captured, so: - * - Move execution to the regular kernel stack, to allow for - * nested NMI interrupts. - * - Take the processor out of "NMI" mode by faking an "iret". - * - Enable interrupts, so that copyin() can work. 
+ * Move execution to the regular kernel stack, because we + * committed to return through doreti. */ movq %rsp,%rsi /* source stack pointer */ movq $TF_SIZE,%rcx movq PCPU(RSP0),%rdx subq %rcx,%rdx movq %rdx,%rdi /* destination stack pointer */ - shrq $3,%rcx /* trap frame size in long words */ cld rep movsq /* copy trapframe */ + movq %rdx,%rsp /* we are on the regular kstack */ + testl $TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */ + jz nocallchain + /* + * A user callchain is to be captured, so: + * - Take the processor out of "NMI" mode by faking an "iret", + * to allow for nested NMI interrupts. + * - Enable interrupts, so that copyin() can work. + */ movl %ss,%eax pushq %rax /* tf_ss */ pushq %rdx /* tf_rsp (on kernel stack) */ @@ -574,33 +672,139 @@ outofnmi: cli nocallchain: #endif - testl %ebx,%ebx + testl %ebx,%ebx /* %ebx == 0 => return to userland */ jnz doreti_exit -nmi_kernelexit: + /* + * Restore speculation control MSR, if preserved. + */ + testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) + je 1f + movl %r14d,%eax + xorl %edx,%edx + movl $MSR_IA32_SPEC_CTRL,%ecx + wrmsr /* * Put back the preserved MSR_GSBASE value. */ +1: movl $MSR_GSBASE,%ecx + movq %r12,%rdx + movl %edx,%eax + shrq $32,%rdx + wrmsr + movq %r13,%cr3 + RESTORE_REGS + addq $TF_RIP,%rsp + jmp doreti_iret + +/* + * MC# handling is similar to NMI. + * + * As with NMIs, machine check exceptions do not respect RFLAGS.IF and + * can occur at any time with a GS.base value that does not correspond + * to the privilege level in CS. + * + * Machine checks are not unblocked by iretq, but it is best to run + * the handler with interrupts disabled since the exception may have + * interrupted a critical section. + * + * The MC# handler runs on its own stack (tss_ist3). The canonical + * GS.base value for the processor is stored just above the bottom of + * its MC# stack. For exceptions taken from kernel mode, the current + * value in the processor's GS.base is saved at entry to C-preserved + * register %r12, the canonical value for GS.base is then loaded into + * the processor, and the saved value is restored at exit time. For + * exceptions taken from user mode, the cheaper 'SWAPGS' instructions + * are used for swapping GS.base. + */ + +IDTVEC(mchk) + subq $TF_RIP,%rsp + movl $(T_MCHK),TF_TRAPNO(%rsp) + movq $0,TF_ADDR(%rsp) + movq $0,TF_ERR(%rsp) + movq %rdi,TF_RDI(%rsp) + movq %rsi,TF_RSI(%rsp) + movq %rdx,TF_RDX(%rsp) + movq %rcx,TF_RCX(%rsp) + movq %r8,TF_R8(%rsp) + movq %r9,TF_R9(%rsp) + movq %rax,TF_RAX(%rsp) + movq %rbx,TF_RBX(%rsp) + movq %rbp,TF_RBP(%rsp) + movq %r10,TF_R10(%rsp) + movq %r11,TF_R11(%rsp) + movq %r12,TF_R12(%rsp) + movq %r13,TF_R13(%rsp) + movq %r14,TF_R14(%rsp) + movq %r15,TF_R15(%rsp) + SAVE_SEGS + movl $TF_HASSEGS,TF_FLAGS(%rsp) + cld + xorl %ebx,%ebx + testb $SEL_RPL_MASK,TF_CS(%rsp) + jnz mchk_fromuserspace + /* + * We've interrupted the kernel. Preserve GS.base in %r12, + * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d. + */ movl $MSR_GSBASE,%ecx + rdmsr + movq %rax,%r12 + shlq $32,%rdx + orq %rdx,%r12 + /* Retrieve and load the canonical value for GS.base. 
*/ + movq TF_SIZE(%rsp),%rdx + movl %edx,%eax + shrq $32,%rdx + wrmsr + movq %cr3,%r13 + movq PCPU(KCR3),%rax + cmpq $~0,%rax + je 1f + movq %rax,%cr3 +1: testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) + je mchk_calltrap + movl $MSR_IA32_SPEC_CTRL,%ecx + rdmsr + movl %eax,%r14d + call handle_ibrs_entry + jmp mchk_calltrap +mchk_fromuserspace: + incl %ebx + swapgs + movq %cr3,%r13 + movq PCPU(KCR3),%rax + cmpq $~0,%rax + je 1f + movq %rax,%cr3 +1: call handle_ibrs_entry +/* Note: this label is also used by ddb and gdb: */ +mchk_calltrap: + FAKE_MCOUNT(TF_RIP(%rsp)) + movq %rsp,%rdi + call mca_intr + MEXITCOUNT + testl %ebx,%ebx /* %ebx == 0 => return to userland */ + jnz doreti_exit + /* + * Restore speculation control MSR, if preserved. + */ + testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) + je 1f + movl %r14d,%eax + xorl %edx,%edx + movl $MSR_IA32_SPEC_CTRL,%ecx + wrmsr + /* + * Put back the preserved MSR_GSBASE value. + */ +1: movl $MSR_GSBASE,%ecx movq %r12,%rdx movl %edx,%eax shrq $32,%rdx wrmsr -nmi_restoreregs: - movq TF_RDI(%rsp),%rdi - movq TF_RSI(%rsp),%rsi - movq TF_RDX(%rsp),%rdx - movq TF_RCX(%rsp),%rcx - movq TF_R8(%rsp),%r8 - movq TF_R9(%rsp),%r9 - movq TF_RAX(%rsp),%rax - movq TF_RBX(%rsp),%rbx - movq TF_RBP(%rsp),%rbp - movq TF_R10(%rsp),%r10 - movq TF_R11(%rsp),%r11 - movq TF_R12(%rsp),%r12 - movq TF_R13(%rsp),%r13 - movq TF_R14(%rsp),%r14 - movq TF_R15(%rsp),%r15 + movq %r13,%cr3 + RESTORE_REGS addq $TF_RIP,%rsp jmp doreti_iret @@ -767,27 +971,39 @@ ld_es: ld_ds: movw TF_DS(%rsp),%ds ld_regs: - movq TF_RDI(%rsp),%rdi - movq TF_RSI(%rsp),%rsi - movq TF_RDX(%rsp),%rdx - movq TF_RCX(%rsp),%rcx - movq TF_R8(%rsp),%r8 - movq TF_R9(%rsp),%r9 - movq TF_RAX(%rsp),%rax - movq TF_RBX(%rsp),%rbx - movq TF_RBP(%rsp),%rbp - movq TF_R10(%rsp),%r10 - movq TF_R11(%rsp),%r11 - movq TF_R12(%rsp),%r12 - movq TF_R13(%rsp),%r13 - movq TF_R14(%rsp),%r14 - movq TF_R15(%rsp),%r15 + RESTORE_REGS testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? 
*/ - jz 1f /* keep running with kernel GS.base */ + jz 2f /* keep running with kernel GS.base */ cli + call handle_ibrs_exit_rs + cmpb $0,pti + je 1f + pushq %rdx + movq PCPU(PRVSPACE),%rdx + addq $PC_PTI_STACK+PC_PTI_STACK_SZ*8-PTI_SIZE,%rdx + movq %rax,PTI_RAX(%rdx) + popq %rax + movq %rax,PTI_RDX(%rdx) + movq TF_RIP(%rsp),%rax + movq %rax,PTI_RIP(%rdx) + movq TF_CS(%rsp),%rax + movq %rax,PTI_CS(%rdx) + movq TF_RFLAGS(%rsp),%rax + movq %rax,PTI_RFLAGS(%rdx) + movq TF_RSP(%rsp),%rax + movq %rax,PTI_RSP(%rdx) + movq TF_SS(%rsp),%rax + movq %rax,PTI_SS(%rdx) + movq PCPU(UCR3),%rax swapgs -1: - addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */ + movq %rdx,%rsp + movq %rax,%cr3 + popq %rdx + popq %rax + addq $8,%rsp + jmp doreti_iret +1: swapgs +2: addq $TF_RIP,%rsp .globl doreti_iret doreti_iret: iretq @@ -811,22 +1027,20 @@ set_segs: .globl doreti_iret_fault doreti_iret_fault: subq $TF_RIP,%rsp /* space including tf_err, tf_trapno */ - testl $PSL_I,TF_RFLAGS(%rsp) + movq %rax,TF_RAX(%rsp) + movq %rdx,TF_RDX(%rsp) + movq %rcx,TF_RCX(%rsp) + call handle_ibrs_entry + testb $SEL_RPL_MASK,TF_CS(%rsp) jz 1f sti 1: - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) + SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) - movq %rdx,TF_RDX(%rsp) - movq %rcx,TF_RCX(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) - movq %rax,TF_RAX(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) @@ -845,7 +1059,7 @@ doreti_iret_fault: .globl ds_load_fault ds_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) - testl $PSL_I,TF_RFLAGS(%rsp) + testb $SEL_RPL_MASK,TF_CS(%rsp) jz 1f sti 1: diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index d087fdc..94382ca 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -145,6 +145,7 @@ ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt)); ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); +ASSYM(PCB_SAVED_UCR3, offsetof(struct pcb, pcb_saved_ucr3)); ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); ASSYM(PCB_EFER, offsetof(struct pcb, pcb_efer)); @@ -190,6 +191,16 @@ ASSYM(TF_FLAGS, offsetof(struct trapframe, tf_flags)); ASSYM(TF_SIZE, sizeof(struct trapframe)); ASSYM(TF_HASSEGS, TF_HASSEGS); +ASSYM(PTI_RDX, offsetof(struct pti_frame, pti_rdx)); +ASSYM(PTI_RAX, offsetof(struct pti_frame, pti_rax)); +ASSYM(PTI_ERR, offsetof(struct pti_frame, pti_err)); +ASSYM(PTI_RIP, offsetof(struct pti_frame, pti_rip)); +ASSYM(PTI_CS, offsetof(struct pti_frame, pti_cs)); +ASSYM(PTI_RFLAGS, offsetof(struct pti_frame, pti_rflags)); +ASSYM(PTI_RSP, offsetof(struct pti_frame, pti_rsp)); +ASSYM(PTI_SS, offsetof(struct pti_frame, pti_ss)); +ASSYM(PTI_SIZE, sizeof(struct pti_frame)); + ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags)); @@ -206,6 +217,7 @@ ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp)); +ASSYM(PC_SCRATCH_RAX, offsetof(struct pcpu, pc_scratch_rax)); ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp)); ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0)); @@ -215,6 
+227,12 @@ ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt)); ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp)); ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss)); ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt)); +ASSYM(PC_KCR3, offsetof(struct pcpu, pc_kcr3)); +ASSYM(PC_UCR3, offsetof(struct pcpu, pc_ucr3)); +ASSYM(PC_SAVED_UCR3, offsetof(struct pcpu, pc_saved_ucr3)); +ASSYM(PC_PTI_STACK, offsetof(struct pcpu, pc_pti_stack)); +ASSYM(PC_PTI_STACK_SZ, PC_PTI_STACK_SZ); +ASSYM(PC_IBPB_SET, offsetof(struct pcpu, pc_ibpb_set)); ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL); ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL); diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c index 63c0f20..2818111 100644 --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -194,6 +194,7 @@ initializecpu(void) wrmsr(MSR_EFER, msr); pg_nx = PG_NX; } + hw_ibrs_recalculate(); switch (cpu_vendor_id) { case CPU_VENDOR_AMD: init_amd(); diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 2c18af9..dd5bb06 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -114,6 +114,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -149,6 +150,14 @@ __FBSDID("$FreeBSD$"); /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); +/* + * The PTI trampoline stack needs enough space for a hardware trapframe and a + * couple of scratch registers, as well as the trapframe left behind after an + * iret fault. + */ +CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) - + offsetof(struct pti_frame, pti_rip)); + extern u_int64_t hammer_time(u_int64_t, u_int64_t); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) @@ -180,12 +189,6 @@ struct init_ops init_ops = { .msi_init = msi_init, }; -/* - * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its value is - * the physical address at which the kernel is loaded. - */ -extern char kernphys[]; - struct msgbuf *msgbufp; /* @@ -670,7 +673,7 @@ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static char dblfault_stack[PAGE_SIZE] __aligned(16); - +static char mce0_stack[PAGE_SIZE] __aligned(16); static char nmi0_stack[PAGE_SIZE] __aligned(16); CTASSERT(sizeof(struct nmi_pcpu) == 16); @@ -824,13 +827,20 @@ extern inthand_t IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(dblfault), + IDTVEC(div_pti), IDTVEC(dbg_pti), IDTVEC(bpt_pti), + IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti), + IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti), + IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), + IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti), + IDTVEC(xmm_pti), #ifdef KDTRACE_HOOKS - IDTVEC(dtrace_ret), + IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti), #endif #ifdef XENHVM - IDTVEC(xen_intr_upcall), + IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti), #endif - IDTVEC(fast_syscall), IDTVEC(fast_syscall32); + IDTVEC(fast_syscall), IDTVEC(fast_syscall32), + IDTVEC(fast_syscall_pti); #ifdef DDB /* @@ -1523,6 +1533,23 @@ amd64_kdb_init(void) #endif } +/* Set up the fast syscall stuff */ +void +amd64_conf_fast_syscall(void) +{ + uint64_t msr; + + msr = rdmsr(MSR_EFER) | EFER_SCE; + wrmsr(MSR_EFER, msr); + wrmsr(MSR_LSTAR, pti ? 
(u_int64_t)IDTVEC(fast_syscall_pti) : + (u_int64_t)IDTVEC(fast_syscall)); + wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); + msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | + ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); + wrmsr(MSR_STAR, msr); + wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D); +} + u_int64_t hammer_time(u_int64_t modulep, u_int64_t physfree) { @@ -1531,7 +1558,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) struct pcpu *pc; struct nmi_pcpu *np; struct xstate_hdr *xhdr; - u_int64_t msr; + u_int64_t rsp0; char *env; size_t kstack0_sz; int late_console; @@ -1544,6 +1571,8 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) kmdp = init_ops.parse_preload_data(modulep); + identify_cpu1(); + /* Init basic tunables, hz etc */ init_param1(); @@ -1600,34 +1629,55 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exceptions */ + pti = pti_get_default(); + TUNABLE_INT_FETCH("vm.pmap.pti", &pti); + for (x = 0; x < NIDT; x++) - setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0); + setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_DB, pti ? &IDTVEC(dbg_pti) : &IDTVEC(dbg), SDT_SYSIGT, + SEL_KPL, 0); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2); - setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); - setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); + setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT, + SEL_UPL, 0); + setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT, + SEL_KPL, 0); setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); - setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); + setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm), + SDT_SYSIGT, SEL_KPL, 0); + setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing), + SDT_SYSIGT, SEL_KPL, 0); + setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_AC, pti ? 
&IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3); + setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT, + SEL_KPL, 0); #ifdef KDTRACE_HOOKS - setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); + setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) : + &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); #endif #ifdef XENHVM - setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0); + setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) : + &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0); #endif - r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (long) idt; lidt(&r_idt); @@ -1648,7 +1698,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) != NULL) vty_set_preferred(VTY_VT); - identify_cpu(); /* Final stage of CPU initialization */ + finishidentcpu(); /* Final stage of CPU initialization */ initializecpu(); /* Initialize CPU registers */ initializecpucache(); @@ -1663,21 +1713,21 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) np->np_pcpu = (register_t) pc; common_tss[0].tss_ist2 = (long) np; + /* + * MC# stack, runs on ist3. The pcpu pointer is stored just + * above the start of the ist3 stack. + */ + np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1; + np->np_pcpu = (register_t) pc; + common_tss[0].tss_ist3 = (long) np; + /* Set the IO permission bitmap (empty due to tss seg limit) */ common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); - /* Set up the fast syscall stuff */ - msr = rdmsr(MSR_EFER) | EFER_SCE; - wrmsr(MSR_EFER, msr); - wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); - wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); - msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | - ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); - wrmsr(MSR_STAR, msr); - wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); + amd64_conf_fast_syscall(); /* * Temporary forge some valid pointer to PCB, for exception @@ -1749,10 +1799,12 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) xhdr->xstate_bv = xsave_mask; } /* make an initial tss so cpu can get interrupt stack on syscall! */ - common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb; + rsp0 = (vm_offset_t)thread0.td_pcb; /* Ensure the stack is aligned to 16 bytes */ - common_tss[0].tss_rsp0 &= ~0xFul; - PCPU_SET(rsp0, common_tss[0].tss_rsp0); + rsp0 &= ~0xFul; + common_tss[0].tss_rsp0 = pti ? ((vm_offset_t)PCPU_PTR(pti_stack) + + PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : rsp0; + PCPU_SET(rsp0, rsp0); PCPU_SET(curpcb, thread0.td_pcb); /* transfer to user mode */ @@ -1782,6 +1834,8 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) #endif thread0.td_critnest = 0; + TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable); + /* Location of kernel stack for locore */ return ((u_int64_t)thread0.td_pcb); } diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index ce07e0f..450d512 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -85,10 +85,9 @@ extern struct pcpu __pcpu[]; /* Temporary variables for init_secondary() */ char *doublefault_stack; +char *mce_stack; char *nmi_stack; -extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); - /* * Local data and functions. 
*/ @@ -132,33 +131,50 @@ cpu_mp_start(void) /* Install an inter-CPU IPI for TLB invalidation */ if (pmap_pcid_enabled) { if (invpcid_works) { - setidt(IPI_INVLTLB, IDTVEC(invltlb_invpcid), - SDT_SYSIGT, SEL_KPL, 0); - } else { - setidt(IPI_INVLTLB, IDTVEC(invltlb_pcid), SDT_SYSIGT, + setidt(IPI_INVLTLB, pti ? + IDTVEC(invltlb_invpcid_pti_pti) : + IDTVEC(invltlb_invpcid_nopti), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_invpcid_pti) : + IDTVEC(invlpg_invpcid), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_invpcid_pti) : + IDTVEC(invlrng_invpcid), SDT_SYSIGT, SEL_KPL, 0); + } else { + setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pcid_pti) : + IDTVEC(invltlb_pcid), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pcid_pti) : + IDTVEC(invlpg_pcid), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pcid_pti) : + IDTVEC(invlrng_pcid), SDT_SYSIGT, SEL_KPL, 0); } } else { - setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pti) : IDTVEC(invltlb), + SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pti) : IDTVEC(invlpg), + SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pti) : IDTVEC(invlrng), + SDT_SYSIGT, SEL_KPL, 0); } - setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0); - setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for cache invalidation. */ - setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLCACHE, pti ? IDTVEC(invlcache_pti) : IDTVEC(invlcache), + SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for all-CPU rendezvous */ - setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_RENDEZVOUS, pti ? IDTVEC(rendezvous_pti) : + IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0); /* Install generic inter-CPU IPI handler */ - setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), - SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_BITMAP_VECTOR, pti ? IDTVEC(ipi_intr_bitmap_handler_pti) : + IDTVEC(ipi_intr_bitmap_handler), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for CPU stop/restart */ - setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop), + SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for CPU suspend/resume */ - setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend), + SDT_SYSIGT, SEL_KPL, 0); /* Set boot_cpu_id if needed. */ if (boot_cpu_id == -1) { @@ -188,7 +204,7 @@ init_secondary(void) { struct pcpu *pc; struct nmi_pcpu *np; - u_int64_t msr, cr0; + u_int64_t cr0; int cpu, gsel_tss, x; struct region_descriptor ap_gdt; @@ -197,7 +213,6 @@ init_secondary(void) /* Init tss */ common_tss[cpu] = common_tss[0]; - common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */ common_tss[cpu].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE]; @@ -206,6 +221,10 @@ init_secondary(void) np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1; common_tss[cpu].tss_ist2 = (long) np; + /* The MC# stack runs on IST3. 
*/ + np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1; + common_tss[cpu].tss_ist3 = (long) np; + /* Prepare private GDT */ gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu]; for (x = 0; x < NGDT; x++) { @@ -240,8 +259,15 @@ init_secondary(void) pc->pc_curpmap = kernel_pmap; pc->pc_pcid_gen = 1; pc->pc_pcid_next = PMAP_PCID_KERN + 1; + common_tss[cpu].tss_rsp0 = pti ? ((vm_offset_t)&pc->pc_pti_stack + + PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : 0; /* Save the per-cpu pointer for use by the NMI handler. */ + np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1; + np->np_pcpu = (register_t) pc; + + /* Save the per-cpu pointer for use by the MC# handler. */ + np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1; np->np_pcpu = (register_t) pc; wrmsr(MSR_FSBASE, 0); /* User value */ @@ -263,15 +289,7 @@ init_secondary(void) cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); load_cr0(cr0); - /* Set up the fast syscall stuff */ - msr = rdmsr(MSR_EFER) | EFER_SCE; - wrmsr(MSR_EFER, msr); - wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); - wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); - msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | - ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); - wrmsr(MSR_STAR, msr); - wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); + amd64_conf_fast_syscall(); /* signal our startup to the BSP. */ mp_naps++; @@ -346,6 +364,8 @@ native_start_all_aps(void) kstack_pages * PAGE_SIZE, M_WAITOK | M_ZERO); doublefault_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO); + mce_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE, + M_WAITOK | M_ZERO); nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO); dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE, @@ -428,9 +448,43 @@ invltlb_invpcid_handler(void) } void -invltlb_pcid_handler(void) +invltlb_invpcid_pti_handler(void) { + struct invpcid_descr d; uint32_t generation; + +#ifdef COUNT_XINVLTLB_HITS + xhits_gbl[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + generation = smp_tlb_generation; + d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; + d.pad = 0; + d.addr = 0; + if (smp_tlb_pmap == kernel_pmap) { + /* + * This invalidation actually needs to clear kernel + * mappings from the TLB in the current pmap, but + * since we were asked for the flush in the kernel + * pmap, achieve it by performing global flush. + */ + invpcid(&d, INVPCID_CTXGLOB); + } else { + invpcid(&d, INVPCID_CTX); + d.pcid |= PMAP_PCID_USER_PT; + invpcid(&d, INVPCID_CTX); + } + PCPU_SET(smp_tlb_done, generation); +} + +void +invltlb_pcid_handler(void) +{ + uint64_t kcr3, ucr3; + uint32_t generation, pcid; #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; @@ -451,9 +505,132 @@ invltlb_pcid_handler(void) * CPU. 
*/ if (PCPU_GET(curpmap) == smp_tlb_pmap) { - load_cr3(smp_tlb_pmap->pm_cr3 | - smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid); + pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; + kcr3 = smp_tlb_pmap->pm_cr3 | pcid; + ucr3 = smp_tlb_pmap->pm_ucr3; + if (ucr3 != PMAP_NO_CR3) { + ucr3 |= PMAP_PCID_USER_PT | pcid; + pmap_pti_pcid_invalidate(ucr3, kcr3); + } else + load_cr3(kcr3); } } PCPU_SET(smp_tlb_done, generation); } + +void +invlpg_invpcid_handler(void) +{ + struct invpcid_descr d; + uint32_t generation; + +#ifdef COUNT_XINVLTLB_HITS + xhits_pg[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + generation = smp_tlb_generation; /* Overlap with serialization */ + invlpg(smp_tlb_addr1); + if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) { + d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid | + PMAP_PCID_USER_PT; + d.pad = 0; + d.addr = smp_tlb_addr1; + invpcid(&d, INVPCID_ADDR); + } + PCPU_SET(smp_tlb_done, generation); +} + +void +invlpg_pcid_handler(void) +{ + uint64_t kcr3, ucr3; + uint32_t generation; + uint32_t pcid; + +#ifdef COUNT_XINVLTLB_HITS + xhits_pg[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + generation = smp_tlb_generation; /* Overlap with serialization */ + invlpg(smp_tlb_addr1); + if (smp_tlb_pmap == PCPU_GET(curpmap) && + (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) { + pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; + kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE; + ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; + pmap_pti_pcid_invlpg(ucr3, kcr3, smp_tlb_addr1); + } + PCPU_SET(smp_tlb_done, generation); +} + +void +invlrng_invpcid_handler(void) +{ + struct invpcid_descr d; + vm_offset_t addr, addr2; + uint32_t generation; + +#ifdef COUNT_XINVLTLB_HITS + xhits_rng[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + addr = smp_tlb_addr1; + addr2 = smp_tlb_addr2; + generation = smp_tlb_generation; /* Overlap with serialization */ + do { + invlpg(addr); + addr += PAGE_SIZE; + } while (addr < addr2); + if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) { + d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid | + PMAP_PCID_USER_PT; + d.pad = 0; + d.addr = smp_tlb_addr1; + do { + invpcid(&d, INVPCID_ADDR); + d.addr += PAGE_SIZE; + } while (d.addr < addr2); + } + PCPU_SET(smp_tlb_done, generation); +} + +void +invlrng_pcid_handler(void) +{ + vm_offset_t addr, addr2; + uint64_t kcr3, ucr3; + uint32_t generation; + uint32_t pcid; + +#ifdef COUNT_XINVLTLB_HITS + xhits_rng[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + addr = smp_tlb_addr1; + addr2 = smp_tlb_addr2; + generation = smp_tlb_generation; /* Overlap with serialization */ + do { + invlpg(addr); + addr += PAGE_SIZE; + } while (addr < addr2); + if (smp_tlb_pmap == PCPU_GET(curpmap) && + (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) { + pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; + kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE; + ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; + pmap_pti_pcid_invlrng(ucr3, kcr3, smp_tlb_addr1, addr2); + } + PCPU_SET(smp_tlb_done, generation); +} diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index a7ce847..2989eb40 100644 --- a/sys/amd64/amd64/pmap.c +++ 
b/sys/amd64/amd64/pmap.c @@ -9,11 +9,17 @@ * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. + * Copyright (c) 2014-2018 The FreeBSD Foundation + * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * + * Portions of this software were developed by + * Konstantin Belousov under sponsorship from + * the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -147,6 +153,7 @@ __FBSDID("$FreeBSD$"); #ifdef SMP #include #endif +#include static __inline boolean_t pmap_type_guest(pmap_t pmap) @@ -208,6 +215,8 @@ pmap_rw_bit(pmap_t pmap) return (mask); } +static pt_entry_t pg_g; + static __inline pt_entry_t pmap_global_bit(pmap_t pmap) { @@ -215,7 +224,7 @@ pmap_global_bit(pmap_t pmap) switch (pmap->pm_type) { case PT_X86: - mask = X86_PG_G; + mask = pg_g; break; case PT_RVI: case PT_EPT: @@ -405,6 +414,15 @@ int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); +int pti = 0; +SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &pti, 0, + "Page Table Isolation enabled"); +static vm_object_t pti_obj; +static pml4_entry_t *pti_pml4; +static vm_pindex_t pti_pg_idx; +static bool pti_finalized; + static int pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) { @@ -622,6 +640,11 @@ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); +static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, + bool exec); +static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); +static pd_entry_t *pmap_pti_pde(vm_offset_t va); +static void pmap_pti_wire_pte(void *pte); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, @@ -901,7 +924,7 @@ create_pagetables(vm_paddr_t *firstaddr) /* XXX not fully used, underneath 2M pages */ pt_p = (pt_entry_t *)KPTphys; for (i = 0; ptoa(i) < *firstaddr; i++) - pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G; + pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | pg_g; /* Now map the page tables at their location within PTmap */ pd_p = (pd_entry_t *)KPDphys; @@ -912,7 +935,7 @@ create_pagetables(vm_paddr_t *firstaddr) /* This replaces some of the KPTphys entries above */ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS | - X86_PG_G; + pg_g; /* And connect up the PD to the PDP (leaving room for L4 pages) */ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); @@ -932,14 +955,14 @@ create_pagetables(vm_paddr_t *firstaddr) for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ - pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | + pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A; } pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. 
*/ - pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | + pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A; } for (j = 0; i < ndmpdp; i++, j++) { @@ -982,6 +1005,9 @@ pmap_bootstrap(vm_paddr_t *firstaddr) pt_entry_t *pte; int i; + if (!pti) + pg_g = X86_PG_G; + /* * Create an initial set of page tables to run the kernel in. */ @@ -1014,6 +1040,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); kernel_pmap->pm_cr3 = KPML4phys; + kernel_pmap->pm_ucr3 = PMAP_NO_CR3; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_flags = pmap_flags; @@ -1528,6 +1555,9 @@ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t *mask; + struct invpcid_descr d; + uint64_t kcr3, ucr3; + uint32_t pcid; u_int cpuid, i; if (pmap_type_guest(pmap)) { @@ -1544,9 +1574,32 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); - if (pmap == PCPU_GET(curpmap)) + if (pmap == PCPU_GET(curpmap)) { invlpg(va); - else if (pmap_pcid_enabled) + if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { + /* + * Disable context switching. pm_pcid + * is recalculated on switch, which + * might make us use wrong pcid below. + */ + critical_enter(); + pcid = pmap->pm_pcids[cpuid].pm_pcid; + + if (invpcid_works) { + d.pcid = pcid | PMAP_PCID_USER_PT; + d.pad = 0; + d.addr = va; + invpcid(&d, INVPCID_ADDR); + } else { + kcr3 = pmap->pm_cr3 | pcid | + CR3_PCID_SAVE; + ucr3 = pmap->pm_ucr3 | pcid | + PMAP_PCID_USER_PT | CR3_PCID_SAVE; + pmap_pti_pcid_invlpg(ucr3, kcr3, va); + } + critical_exit(); + } + } else if (pmap_pcid_enabled) pmap->pm_pcids[cpuid].pm_gen = 0; if (pmap_pcid_enabled) { CPU_FOREACH(i) { @@ -1556,7 +1609,7 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) } mask = &pmap->pm_active; } - smp_masked_invlpg(*mask, va); + smp_masked_invlpg(*mask, va, pmap); sched_unpin(); } @@ -1567,7 +1620,10 @@ void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask; + struct invpcid_descr d; vm_offset_t addr; + uint64_t kcr3, ucr3; + uint32_t pcid; u_int cpuid, i; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { @@ -1593,6 +1649,26 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); + if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { + critical_enter(); + pcid = pmap->pm_pcids[cpuid].pm_pcid; + if (invpcid_works) { + d.pcid = pcid | PMAP_PCID_USER_PT; + d.pad = 0; + d.addr = sva; + for (; d.addr < eva; d.addr += + PAGE_SIZE) + invpcid(&d, INVPCID_ADDR); + } else { + kcr3 = pmap->pm_cr3 | pcid | + CR3_PCID_SAVE; + ucr3 = pmap->pm_ucr3 | pcid | + PMAP_PCID_USER_PT | CR3_PCID_SAVE; + pmap_pti_pcid_invlrng(ucr3, kcr3, sva, + eva); + } + critical_exit(); + } } else if (pmap_pcid_enabled) { pmap->pm_pcids[cpuid].pm_gen = 0; } @@ -1604,7 +1680,7 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) } mask = &pmap->pm_active; } - smp_masked_invlpg_range(*mask, sva, eva); + smp_masked_invlpg_range(*mask, sva, eva, pmap); sched_unpin(); } @@ -1613,6 +1689,8 @@ pmap_invalidate_all(pmap_t pmap) { cpuset_t *mask; struct invpcid_descr d; + uint64_t kcr3, ucr3; + uint32_t pcid; u_int cpuid, i; if (pmap_type_guest(pmap)) { @@ -1636,15 +1714,29 @@ pmap_invalidate_all(pmap_t pmap) cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if 
(pmap_pcid_enabled) { + critical_enter(); + pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works) { - d.pcid = pmap->pm_pcids[cpuid].pm_pcid; + d.pcid = pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); + if (pmap->pm_ucr3 != PMAP_NO_CR3) { + d.pcid |= PMAP_PCID_USER_PT; + invpcid(&d, INVPCID_CTX); + } } else { - load_cr3(pmap->pm_cr3 | pmap->pm_pcids - [PCPU_GET(cpuid)].pm_pcid); + kcr3 = pmap->pm_cr3 | pcid; + ucr3 = pmap->pm_ucr3; + if (ucr3 != PMAP_NO_CR3) { + ucr3 |= pcid | PMAP_PCID_USER_PT; + pmap_pti_pcid_invalidate(ucr3, + kcr3); + } else { + load_cr3(kcr3); + } } + critical_exit(); } else { invltlb(); } @@ -1749,6 +1841,9 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { + struct invpcid_descr d; + uint64_t kcr3, ucr3; + uint32_t pcid; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; @@ -1757,16 +1852,35 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); - if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) + if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { invlpg(va); - else if (pmap_pcid_enabled) + if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && + pmap->pm_ucr3 != PMAP_NO_CR3) { + critical_enter(); + pcid = pmap->pm_pcids[0].pm_pcid; + if (invpcid_works) { + d.pcid = pcid | PMAP_PCID_USER_PT; + d.pad = 0; + d.addr = va; + invpcid(&d, INVPCID_ADDR); + } else { + kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; + ucr3 = pmap->pm_ucr3 | pcid | + PMAP_PCID_USER_PT | CR3_PCID_SAVE; + pmap_pti_pcid_invlpg(ucr3, kcr3, va); + } + critical_exit(); + } + } else if (pmap_pcid_enabled) pmap->pm_pcids[0].pm_gen = 0; } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { + struct invpcid_descr d; vm_offset_t addr; + uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; @@ -1778,6 +1892,25 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); + if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && + pmap->pm_ucr3 != PMAP_NO_CR3) { + critical_enter(); + if (invpcid_works) { + d.pcid = pmap->pm_pcids[0].pm_pcid | + PMAP_PCID_USER_PT; + d.pad = 0; + d.addr = sva; + for (; d.addr < eva; d.addr += PAGE_SIZE) + invpcid(&d, INVPCID_ADDR); + } else { + kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. + pm_pcid | CR3_PCID_SAVE; + ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. + pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; + pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); + } + critical_exit(); + } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } @@ -1787,6 +1920,7 @@ void pmap_invalidate_all(pmap_t pmap) { struct invpcid_descr d; + uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; @@ -1804,15 +1938,26 @@ pmap_invalidate_all(pmap_t pmap) } } else if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { + critical_enter(); if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); + if (pmap->pm_ucr3 != PMAP_NO_CR3) { + d.pcid |= PMAP_PCID_USER_PT; + invpcid(&d, INVPCID_CTX); + } } else { - load_cr3(pmap->pm_cr3 | pmap->pm_pcids[0]. 
- pm_pcid); + kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; + if (pmap->pm_ucr3 != PMAP_NO_CR3) { + ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ + 0].pm_pcid | PMAP_PCID_USER_PT; + pmap_pti_pcid_invalidate(ucr3, kcr3); + } else + load_cr3(kcr3); } + critical_exit(); } else { invltlb(); } @@ -2094,7 +2239,7 @@ pmap_kenter(vm_offset_t va, vm_paddr_t pa) pt_entry_t *pte; pte = vtopte(va); - pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G); + pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g); } static __inline void @@ -2105,7 +2250,7 @@ pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) pte = vtopte(va); cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); - pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits); + pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits); } /* @@ -2165,7 +2310,7 @@ pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) pa = VM_PAGE_TO_PHYS(m) | cache_bits; if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; - pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V); + pte_store(pte, pa | pg_g | X86_PG_RW | X86_PG_V); } pte++; } @@ -2284,6 +2429,10 @@ _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); *pml4 = 0; + if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) { + pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)]; + *pml4 = 0; + } } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; @@ -2349,7 +2498,10 @@ pmap_pinit0(pmap_t pmap) PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); + pmap->pm_pml4u = NULL; pmap->pm_cr3 = KPML4phys; + /* hack to keep pmap_pti_pcid_invalidate() alive */ + pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); @@ -2358,6 +2510,8 @@ pmap_pinit0(pmap_t pmap) CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; + if (!pti) + __pcpu[i].pc_kcr3 = PMAP_NO_CR3; } PCPU_SET(curpmap, kernel_pmap); pmap_activate(curthread); @@ -2387,6 +2541,17 @@ pmap_pinit_pml4(vm_page_t pml4pg) X86_PG_A | X86_PG_M; } +static void +pmap_pinit_pml4_pti(vm_page_t pml4pg) +{ + pml4_entry_t *pm_pml4; + int i; + + pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); + for (i = 0; i < NPML4EPG; i++) + pm_pml4[i] = pti_pml4[i]; +} + /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. @@ -2394,7 +2559,7 @@ pmap_pinit_pml4(vm_page_t pml4pg) int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { - vm_page_t pml4pg; + vm_page_t pml4pg, pml4pgu; vm_paddr_t pml4phys; int i; @@ -2411,8 +2576,11 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } - pmap->pm_cr3 = ~0; /* initialize to an invalid value */ + pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ + pmap->pm_ucr3 = PMAP_NO_CR3; + pmap->pm_pml4u = NULL; + pmap->pm_type = pm_type; if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); @@ -2420,10 +2588,21 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) * Do not install the host kernel mappings in the nested page * tables. These mappings are meaningless in the guest physical * address space. + * Install minimal kernel mappings in PTI case. 
*/ - if ((pmap->pm_type = pm_type) == PT_X86) { + if (pm_type == PT_X86) { pmap->pm_cr3 = pml4phys; pmap_pinit_pml4(pml4pg); + if (pti) { + while ((pml4pgu = vm_page_alloc(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) + == NULL) + VM_WAIT; + pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP( + VM_PAGE_TO_PHYS(pml4pgu)); + pmap_pinit_pml4_pti(pml4pgu); + pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu); + } } pmap->pm_root.rt_root = 0; @@ -2495,13 +2674,27 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) */ if (ptepindex >= (NUPDE + NUPDPE)) { - pml4_entry_t *pml4; + pml4_entry_t *pml4, *pml4u; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; + if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) { + /* + * PTI: Make all user-space mappings in the + * kernel-mode page table no-execute so that + * we detect any programming errors that leave + * the kernel-mode page table active on return + * to user space. + */ + *pml4 |= pg_nx; + + pml4u = &pmap->pm_pml4u[pml4index]; + *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | + PG_A | PG_M; + } } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; @@ -2702,6 +2895,13 @@ pmap_release(pmap_t pmap) m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); + + if (pmap->pm_pml4u != NULL) { + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u)); + m->wire_count--; + atomic_subtract_int(&vm_cnt.v_wire_count, 1); + vm_page_free(m); + } } static int @@ -6867,13 +7067,15 @@ pmap_pcid_alloc(pmap_t pmap, u_int cpuid) CRITICAL_ASSERT(curthread); gen = PCPU_GET(pcid_gen); - if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN || - pmap->pm_pcids[cpuid].pm_gen == gen) + if (!pti && (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN || + pmap->pm_pcids[cpuid].pm_gen == gen)) return (CR3_PCID_SAVE); pcid_next = PCPU_GET(pcid_next); - KASSERT(pcid_next <= PMAP_PCID_OVERMAX, ("cpu %d pcid_next %#x", - cpuid, pcid_next)); - if (pcid_next == PMAP_PCID_OVERMAX) { + KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || + (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), + ("cpu %d pcid_next %#x", cpuid, pcid_next)); + if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || + (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { new_gen = gen + 1; if (new_gen == 0) new_gen = 1; @@ -6892,7 +7094,8 @@ void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; - uint64_t cached, cr3; + struct invpcid_descr d; + uint64_t cached, cr3, kcr3, ucr3; register_t rflags; u_int cpuid; @@ -6948,11 +7151,41 @@ pmap_activate_sw(struct thread *td) PCPU_INC(pm_save_cnt); } PCPU_SET(curpmap, pmap); + if (pti) { + kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; + ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | + PMAP_PCID_USER_PT; + + /* + * Manually invalidate translations cached + * from the user page table, which are not + * flushed by reload of cr3 with the kernel + * page table pointer above. 
+ */ + if (pmap->pm_ucr3 != PMAP_NO_CR3) { + if (invpcid_works) { + d.pcid = PMAP_PCID_USER_PT | + pmap->pm_pcids[cpuid].pm_pcid; + d.pad = 0; + d.addr = 0; + invpcid(&d, INVPCID_CTX); + } else { + pmap_pti_pcid_invalidate(ucr3, kcr3); + } + } + + PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); + PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); + } if (!invpcid_works) intr_restore(rflags); } else if (cr3 != pmap->pm_cr3) { load_cr3(pmap->pm_cr3); PCPU_SET(curpmap, pmap); + if (pti) { + PCPU_SET(kcr3, pmap->pm_cr3); + PCPU_SET(ucr3, pmap->pm_ucr3); + } } #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); @@ -7271,6 +7504,291 @@ pmap_quick_remove_page(vm_offset_t addr) mtx_unlock_spin(&qframe_mtx); } +static vm_page_t +pmap_pti_alloc_page(void) +{ + vm_page_t m; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY | + VM_ALLOC_WIRED | VM_ALLOC_ZERO); + return (m); +} + +static bool +pmap_pti_free_page(vm_page_t m) +{ + + KASSERT(m->wire_count > 0, ("page %p not wired", m)); + m->wire_count--; + if (m->wire_count != 0) + return (false); + atomic_subtract_int(&vm_cnt.v_wire_count, 1); + vm_page_free_zero(m); + return (true); +} + +static void +pmap_pti_init(void) +{ + vm_page_t pml4_pg; + pdp_entry_t *pdpe; + vm_offset_t va; + int i; + + if (!pti) + return; + pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); + VM_OBJECT_WLOCK(pti_obj); + pml4_pg = pmap_pti_alloc_page(); + pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); + for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && + va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { + pdpe = pmap_pti_pdpe(va); + pmap_pti_wire_pte(pdpe); + } + pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], + (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); + pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt + + sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false); + pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + + sizeof(struct gate_descriptor) * NIDT, false); + pmap_pti_add_kva_locked((vm_offset_t)common_tss, + (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false); + CPU_FOREACH(i) { + /* Doublefault stack IST 1 */ + va = common_tss[i].tss_ist1; + pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); + /* NMI stack IST 2 */ + va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu); + pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); + /* MC# stack IST 3 */ + va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu); + pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); + } + pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, + (vm_offset_t)etext, true); + pti_finalized = true; + VM_OBJECT_WUNLOCK(pti_obj); +} +SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL); + +static pdp_entry_t * +pmap_pti_pdpe(vm_offset_t va) +{ + pml4_entry_t *pml4e; + pdp_entry_t *pdpe; + vm_page_t m; + vm_pindex_t pml4_idx; + vm_paddr_t mphys; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + + pml4_idx = pmap_pml4e_index(va); + pml4e = &pti_pml4[pml4_idx]; + m = NULL; + if (*pml4e == 0) { + if (pti_finalized) + panic("pml4 alloc after finalization\n"); + m = pmap_pti_alloc_page(); + if (*pml4e != 0) { + pmap_pti_free_page(m); + mphys = *pml4e & ~PAGE_MASK; + } else { + mphys = VM_PAGE_TO_PHYS(m); + *pml4e = mphys | X86_PG_RW | X86_PG_V; + } + } else { + mphys = *pml4e & ~PAGE_MASK; + } + pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); + return (pdpe); +} + +static void +pmap_pti_wire_pte(void *pte) +{ + vm_page_t m; + + 
VM_OBJECT_ASSERT_WLOCKED(pti_obj); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); + m->wire_count++; +} + +static void +pmap_pti_unwire_pde(void *pde, bool only_ref) +{ + vm_page_t m; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); + MPASS(m->wire_count > 0); + MPASS(only_ref || m->wire_count > 1); + pmap_pti_free_page(m); +} + +static void +pmap_pti_unwire_pte(void *pte, vm_offset_t va) +{ + vm_page_t m; + pd_entry_t *pde; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); + MPASS(m->wire_count > 0); + if (pmap_pti_free_page(m)) { + pde = pmap_pti_pde(va); + MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); + *pde = 0; + pmap_pti_unwire_pde(pde, false); + } +} + +static pd_entry_t * +pmap_pti_pde(vm_offset_t va) +{ + pdp_entry_t *pdpe; + pd_entry_t *pde; + vm_page_t m; + vm_pindex_t pd_idx; + vm_paddr_t mphys; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + + pdpe = pmap_pti_pdpe(va); + if (*pdpe == 0) { + m = pmap_pti_alloc_page(); + if (*pdpe != 0) { + pmap_pti_free_page(m); + MPASS((*pdpe & X86_PG_PS) == 0); + mphys = *pdpe & ~PAGE_MASK; + } else { + mphys = VM_PAGE_TO_PHYS(m); + *pdpe = mphys | X86_PG_RW | X86_PG_V; + } + } else { + MPASS((*pdpe & X86_PG_PS) == 0); + mphys = *pdpe & ~PAGE_MASK; + } + + pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); + pd_idx = pmap_pde_index(va); + pde += pd_idx; + return (pde); +} + +static pt_entry_t * +pmap_pti_pte(vm_offset_t va, bool *unwire_pde) +{ + pd_entry_t *pde; + pt_entry_t *pte; + vm_page_t m; + vm_paddr_t mphys; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + + pde = pmap_pti_pde(va); + if (unwire_pde != NULL) { + *unwire_pde = true; + pmap_pti_wire_pte(pde); + } + if (*pde == 0) { + m = pmap_pti_alloc_page(); + if (*pde != 0) { + pmap_pti_free_page(m); + MPASS((*pde & X86_PG_PS) == 0); + mphys = *pde & ~(PAGE_MASK | pg_nx); + } else { + mphys = VM_PAGE_TO_PHYS(m); + *pde = mphys | X86_PG_RW | X86_PG_V; + if (unwire_pde != NULL) + *unwire_pde = false; + } + } else { + MPASS((*pde & X86_PG_PS) == 0); + mphys = *pde & ~(PAGE_MASK | pg_nx); + } + + pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); + pte += pmap_pte_index(va); + + return (pte); +} + +static void +pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) +{ + vm_paddr_t pa; + pd_entry_t *pde; + pt_entry_t *pte, ptev; + bool unwire_pde; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + + sva = trunc_page(sva); + MPASS(sva > VM_MAXUSER_ADDRESS); + eva = round_page(eva); + MPASS(sva < eva); + for (; sva < eva; sva += PAGE_SIZE) { + pte = pmap_pti_pte(sva, &unwire_pde); + pa = pmap_kextract(sva); + ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | + (exec ? 
0 : pg_nx) | pmap_cache_bits(kernel_pmap, + VM_MEMATTR_DEFAULT, FALSE); + if (*pte == 0) { + pte_store(pte, ptev); + pmap_pti_wire_pte(pte); + } else { + KASSERT(!pti_finalized, + ("pti overlap after fin %#lx %#lx %#lx", + sva, *pte, ptev)); + KASSERT(*pte == ptev, + ("pti non-identical pte after fin %#lx %#lx %#lx", + sva, *pte, ptev)); + } + if (unwire_pde) { + pde = pmap_pti_pde(sva); + pmap_pti_unwire_pde(pde, true); + } + } +} + +void +pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) +{ + + if (!pti) + return; + VM_OBJECT_WLOCK(pti_obj); + pmap_pti_add_kva_locked(sva, eva, exec); + VM_OBJECT_WUNLOCK(pti_obj); +} + +void +pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) +{ + pt_entry_t *pte; + vm_offset_t va; + + if (!pti) + return; + sva = rounddown2(sva, PAGE_SIZE); + MPASS(sva > VM_MAXUSER_ADDRESS); + eva = roundup2(eva, PAGE_SIZE); + MPASS(sva < eva); + VM_OBJECT_WLOCK(pti_obj); + for (va = sva; va < eva; va += PAGE_SIZE) { + pte = pmap_pti_pte(va, NULL); + KASSERT((*pte & X86_PG_V) != 0, + ("invalid pte va %#lx pte %#lx pt %#lx", va, + (u_long)pte, *pte)); + pte_clear(pte); + pmap_pti_unwire_pte(pte, va); + } + pmap_invalidate_range(kernel_pmap, sva, eva); + VM_OBJECT_WUNLOCK(pti_obj); +} + #include "opt_ddb.h" #ifdef DDB #include diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index e7af5d7..f6be94e 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -33,6 +33,7 @@ #include "opt_ddb.h" #include +#include #include #include "assym.s" @@ -787,3 +788,115 @@ msr_onfault: movl $EFAULT,%eax POP_FRAME_POINTER ret + +/* + * void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3); + * Invalidates address space addressed by ucr3, then returns to kcr3. + * Done in assembler to ensure no other memory accesses happen while + * on ucr3. + */ + ALIGN_TEXT +ENTRY(pmap_pti_pcid_invalidate) + pushfq + cli + movq %rdi,%cr3 /* to user page table */ + movq %rsi,%cr3 /* back to kernel */ + popfq + retq + +/* + * void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va); + * Invalidates virtual address va in address space ucr3, then returns to kcr3. + */ + ALIGN_TEXT +ENTRY(pmap_pti_pcid_invlpg) + pushfq + cli + movq %rdi,%cr3 /* to user page table */ + invlpg (%rdx) + movq %rsi,%cr3 /* back to kernel */ + popfq + retq + +/* + * void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva, + * vm_offset_t eva); + * Invalidates virtual addresses between sva and eva in address space ucr3, + * then returns to kcr3. 
+ */ + ALIGN_TEXT +ENTRY(pmap_pti_pcid_invlrng) + pushfq + cli + movq %rdi,%cr3 /* to user page table */ +1: invlpg (%rdx) + addq $PAGE_SIZE,%rdx + cmpq %rdx,%rcx + ja 1b + movq %rsi,%cr3 /* back to kernel */ + popfq + retq + + .altmacro + .macro ibrs_seq_label l +handle_ibrs_\l: + .endm + .macro ibrs_call_label l + call handle_ibrs_\l + .endm + .macro ibrs_seq count + ll=1 + .rept \count + ibrs_call_label %(ll) + nop + ibrs_seq_label %(ll) + addq $8,%rsp + ll=ll+1 + .endr + .endm + +/* all callers already saved %rax, %rdx, and %rcx */ +ENTRY(handle_ibrs_entry) + cmpb $0,hw_ibrs_active(%rip) + je 1f + movl $MSR_IA32_SPEC_CTRL,%ecx + movl $(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax + movl $(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32,%edx + wrmsr + movb $1,PCPU(IBPB_SET) + testl $CPUID_STDEXT_SMEP,cpu_stdext_feature(%rip) + jne 1f + ibrs_seq 32 +1: ret +END(handle_ibrs_entry) + +ENTRY(handle_ibrs_exit) + cmpb $0,PCPU(IBPB_SET) + je 1f + movl $MSR_IA32_SPEC_CTRL,%ecx + xorl %eax,%eax + xorl %edx,%edx + wrmsr + movb $0,PCPU(IBPB_SET) +1: ret +END(handle_ibrs_exit) + +/* registers-neutral version, but needs stack */ +ENTRY(handle_ibrs_exit_rs) + cmpb $0,PCPU(IBPB_SET) + je 1f + pushq %rax + pushq %rdx + pushq %rcx + movl $MSR_IA32_SPEC_CTRL,%ecx + xorl %eax,%eax + xorl %edx,%edx + wrmsr + popq %rcx + popq %rdx + popq %rax + movb $0,PCPU(IBPB_SET) +1: ret +END(handle_ibrs_exit_rs) + + .noaltmacro diff --git a/sys/amd64/amd64/sys_machdep.c b/sys/amd64/amd64/sys_machdep.c index 24009db..8867aed 100644 --- a/sys/amd64/amd64/sys_machdep.c +++ b/sys/amd64/amd64/sys_machdep.c @@ -357,7 +357,9 @@ amd64_set_ioperm(td, uap) pcb = td->td_pcb; if (pcb->pcb_tssp == NULL) { tssp = (struct amd64tss *)kmem_malloc(kernel_arena, - ctob(IOPAGES+1), M_WAITOK); + ctob(IOPAGES + 1), M_WAITOK); + pmap_pti_add_kva((vm_offset_t)tssp, (vm_offset_t)tssp + + ctob(IOPAGES + 1), false); iomap = (char *)&tssp[1]; memset(iomap, 0xff, IOPERM_BITMAP_SIZE); critical_enter(); @@ -452,6 +454,8 @@ user_ldt_alloc(struct proc *p, int force) struct proc_ldt *pldt, *new_ldt; struct mdproc *mdp; struct soft_segment_descriptor sldt; + vm_offset_t sva; + vm_size_t sz; mtx_assert(&dt_lock, MA_OWNED); mdp = &p->p_md; @@ -459,13 +463,13 @@ user_ldt_alloc(struct proc *p, int force) return (mdp->md_ldt); mtx_unlock(&dt_lock); new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK); - new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena, - max_ldt_segment * sizeof(struct user_segment_descriptor), - M_WAITOK | M_ZERO); + sz = max_ldt_segment * sizeof(struct user_segment_descriptor); + sva = kmem_malloc(kernel_arena, sz, M_WAITOK | M_ZERO); + new_ldt->ldt_base = (caddr_t)sva; + pmap_pti_add_kva(sva, sva + sz, false); new_ldt->ldt_refcnt = 1; - sldt.ssd_base = (uint64_t)new_ldt->ldt_base; - sldt.ssd_limit = max_ldt_segment * - sizeof(struct user_segment_descriptor) - 1; + sldt.ssd_base = sva; + sldt.ssd_limit = sz - 1; sldt.ssd_type = SDT_SYSLDT; sldt.ssd_dpl = SEL_KPL; sldt.ssd_p = 1; @@ -475,8 +479,8 @@ user_ldt_alloc(struct proc *p, int force) mtx_lock(&dt_lock); pldt = mdp->md_ldt; if (pldt != NULL && !force) { - kmem_free(kernel_arena, (vm_offset_t)new_ldt->ldt_base, - max_ldt_segment * sizeof(struct user_segment_descriptor)); + pmap_pti_remove_kva(sva, sva + sz); + kmem_free(kernel_arena, sva, sz); free(new_ldt, M_SUBPROC); return (pldt); } @@ -518,10 +522,14 @@ user_ldt_free(struct thread *td) static void user_ldt_derefl(struct proc_ldt *pldt) { + vm_offset_t sva; + vm_size_t sz; if (--pldt->ldt_refcnt == 0) { - 
kmem_free(kernel_arena, (vm_offset_t)pldt->ldt_base,
-		    max_ldt_segment * sizeof(struct user_segment_descriptor));
+		sva = (vm_offset_t)pldt->ldt_base;
+		sz = max_ldt_segment * sizeof(struct user_segment_descriptor);
+		pmap_pti_remove_kva(sva, sva + sz);
+		kmem_free(kernel_arena, sva, sz);
 		free(pldt, M_SUBPROC);
 	}
 }
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index e779ef2..fccd297 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -218,11 +218,6 @@ trap(struct trapframe *frame)
 #endif
 	}
 
-	if (type == T_MCHK) {
-		mca_intr();
-		goto out;
-	}
-
 	if ((frame->tf_rflags & PSL_I) == 0) {
 		/*
 		 * Buggy application or kernel code has disabled
@@ -452,9 +447,28 @@ trap(struct trapframe *frame)
 			 * problem here and not have to check all the
 			 * selectors and pointers when the user changes
 			 * them.
+			 *
+			 * In case of PTI, the IRETQ faulted while the
+			 * kernel used the PTI stack, and the exception
+			 * frame records the %rsp value pointing to
+			 * that stack.  If we return normally to
+			 * doreti_iret_fault, the trapframe is
+			 * reconstructed on the PTI stack, and
+			 * calltrap() is called on it as well.  Due to
+			 * the very limited PTI stack size, the kernel
+			 * does not survive for long.  Switch to the
+			 * normal thread stack for the trap handling.
+			 *
+			 * Magic '5' is the number of qwords occupied by
+			 * the hardware trap frame.
 			 */
 			if (frame->tf_rip == (long)doreti_iret) {
 				frame->tf_rip = (long)doreti_iret_fault;
+				if (pti && frame->tf_rsp == (uintptr_t)PCPU_PTR(
+				    pti_stack) + (PC_PTI_STACK_SZ - 5) *
+				    sizeof(register_t))
+					frame->tf_rsp = PCPU_GET(rsp0) - 5 *
+					    sizeof(register_t);
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_ds) {
@@ -694,6 +708,17 @@
 	}
 
 	/*
+	 * If nx protection of the usermode portion of the kernel page
+	 * tables caused the trap, panic.
+	 */
+	if (pti && usermode && pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W |
+	    PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) &&
+	    (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK) ==
+	    (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK))
+		panic("PTI: pid %d comm %s tf_err %#lx\n", p->p_pid,
+		    p->p_comm, frame->tf_err);
+
+	/*
 	 * PGEX_I is defined only if the execute disable bit capability is
 	 * supported and enabled.
 	 */
diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c
index 20c7cce..8846eb8 100644
--- a/sys/amd64/amd64/vm_machdep.c
+++ b/sys/amd64/amd64/vm_machdep.c
@@ -339,6 +339,8 @@ cpu_thread_clean(struct thread *td)
 	 * Clean TSS/iomap
 	 */
 	if (pcb->pcb_tssp != NULL) {
+		pmap_pti_remove_kva((vm_offset_t)pcb->pcb_tssp,
+		    (vm_offset_t)pcb->pcb_tssp + ctob(IOPAGES + 1));
 		kmem_free(kernel_arena, (vm_offset_t)pcb->pcb_tssp,
 		    ctob(IOPAGES + 1));
 		pcb->pcb_tssp = NULL;
diff --git a/sys/amd64/ia32/ia32_exception.S b/sys/amd64/ia32/ia32_exception.S
index fe1a676..1f09764 100644
--- a/sys/amd64/ia32/ia32_exception.S
+++ b/sys/amd64/ia32/ia32_exception.S
@@ -40,24 +40,27 @@
 * that it originated in supervisor mode and skip the swapgs.
*/ SUPERALIGN_TEXT +IDTVEC(int0x80_syscall_pti) + PTI_UENTRY has_err=0 + jmp int0x80_syscall_common + SUPERALIGN_TEXT IDTVEC(int0x80_syscall) swapgs +int0x80_syscall_common: pushq $2 /* sizeof "int 0x80" */ subq $TF_ERR,%rsp /* skip over tf_trapno */ movq %rdi,TF_RDI(%rsp) movq PCPU(CURPCB),%rdi andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi) - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) - sti - movq %rsi,TF_RSI(%rsp) + SAVE_SEGS + movq %rax,TF_RAX(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) + call handle_ibrs_entry + sti + movq %rsi,TF_RSI(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) - movq %rax,TF_RAX(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) diff --git a/sys/amd64/ia32/ia32_syscall.c b/sys/amd64/ia32/ia32_syscall.c index 6e96edd..c2bf2fb 100644 --- a/sys/amd64/ia32/ia32_syscall.c +++ b/sys/amd64/ia32/ia32_syscall.c @@ -93,7 +93,8 @@ __FBSDID("$FreeBSD$"); #define IDTVEC(name) __CONCAT(X,name) -extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(rsvd); +extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(int0x80_syscall_pti), + IDTVEC(rsvd), IDTVEC(rsvd_pti); void ia32_syscall(struct trapframe *frame); /* Called from asm code */ @@ -205,14 +206,16 @@ static void ia32_syscall_enable(void *dummy) { - setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0); + setidt(IDT_SYSCALL, pti ? &IDTVEC(int0x80_syscall_pti) : + &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0); } static void ia32_syscall_disable(void *dummy) { - setidt(IDT_SYSCALL, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); + setidt(IDT_SYSCALL, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), + SDT_SYSIGT, SEL_KPL, 0); } SYSINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_enable, NULL); diff --git a/sys/amd64/include/asmacros.h b/sys/amd64/include/asmacros.h index d5652c4..cd7acd8 100644 --- a/sys/amd64/include/asmacros.h +++ b/sys/amd64/include/asmacros.h @@ -1,7 +1,15 @@ +/* -*- mode: asm -*- */ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * + * Copyright (c) 2018 The FreeBSD Foundation + * All rights reserved. + * + * Portions of this software were developed by + * Konstantin Belousov under sponsorship from + * the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -144,70 +152,135 @@ #ifdef LOCORE /* + * Access per-CPU data. + */ +#define PCPU(member) %gs:PC_ ## member +#define PCPU_ADDR(member, reg) \ + movq %gs:PC_PRVSPACE, reg ; \ + addq $PC_ ## member, reg + +/* * Convenience macro for declaring interrupt entry points. */ #define IDTVEC(name) ALIGN_TEXT; .globl __CONCAT(X,name); \ .type __CONCAT(X,name),@function; __CONCAT(X,name): -/* - * Macros to create and destroy a trap frame. - */ -#define PUSH_FRAME \ - subq $TF_RIP,%rsp ; /* skip dummy tf_err and tf_trapno */ \ - testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? 
*/ \ - jz 1f ; /* Yes, dont swapgs again */ \ - swapgs ; \ -1: movq %rdi,TF_RDI(%rsp) ; \ - movq %rsi,TF_RSI(%rsp) ; \ - movq %rdx,TF_RDX(%rsp) ; \ - movq %rcx,TF_RCX(%rsp) ; \ - movq %r8,TF_R8(%rsp) ; \ - movq %r9,TF_R9(%rsp) ; \ - movq %rax,TF_RAX(%rsp) ; \ - movq %rbx,TF_RBX(%rsp) ; \ - movq %rbp,TF_RBP(%rsp) ; \ - movq %r10,TF_R10(%rsp) ; \ - movq %r11,TF_R11(%rsp) ; \ - movq %r12,TF_R12(%rsp) ; \ - movq %r13,TF_R13(%rsp) ; \ - movq %r14,TF_R14(%rsp) ; \ - movq %r15,TF_R15(%rsp) ; \ - movw %fs,TF_FS(%rsp) ; \ - movw %gs,TF_GS(%rsp) ; \ - movw %es,TF_ES(%rsp) ; \ - movw %ds,TF_DS(%rsp) ; \ - movl $TF_HASSEGS,TF_FLAGS(%rsp) ; \ + .macro SAVE_SEGS + movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) + movw %es,TF_ES(%rsp) + movw %ds,TF_DS(%rsp) + .endm + + .macro MOVE_STACKS qw + .L.offset=0 + .rept \qw + movq .L.offset(%rsp),%rdx + movq %rdx,.L.offset(%rax) + .L.offset=.L.offset+8 + .endr + .endm + + .macro PTI_UUENTRY has_err + movq PCPU(KCR3),%rax + movq %rax,%cr3 + movq PCPU(RSP0),%rax + subq $PTI_SIZE,%rax + MOVE_STACKS ((PTI_SIZE / 8) - 1 + \has_err) + movq %rax,%rsp + popq %rdx + popq %rax + .endm + + .macro PTI_UENTRY has_err + swapgs + pushq %rax + pushq %rdx + PTI_UUENTRY \has_err + .endm + + .macro PTI_ENTRY name, cont, has_err=0 + ALIGN_TEXT + .globl X\name\()_pti + .type X\name\()_pti,@function +X\name\()_pti: + /* %rax, %rdx and possibly err not yet pushed */ + testb $SEL_RPL_MASK,PTI_CS-(2+1-\has_err)*8(%rsp) + jz \cont + PTI_UENTRY \has_err + swapgs + jmp \cont + .endm + + .macro PTI_INTRENTRY vec_name + SUPERALIGN_TEXT + .globl X\vec_name\()_pti + .type X\vec_name\()_pti,@function +X\vec_name\()_pti: + testb $SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* err, %rax, %rdx not pushed */ + jz \vec_name\()_u + PTI_UENTRY has_err=0 + jmp \vec_name\()_u + .endm + + .macro INTR_PUSH_FRAME vec_name + SUPERALIGN_TEXT + .globl X\vec_name + .type X\vec_name,@function +X\vec_name: + testb $SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* come from kernel? */ + jz \vec_name\()_u /* Yes, dont swapgs again */ + swapgs +\vec_name\()_u: + subq $TF_RIP,%rsp /* skip dummy tf_err and tf_trapno */ + movq %rdi,TF_RDI(%rsp) + movq %rsi,TF_RSI(%rsp) + movq %rdx,TF_RDX(%rsp) + movq %rcx,TF_RCX(%rsp) + movq %r8,TF_R8(%rsp) + movq %r9,TF_R9(%rsp) + movq %rax,TF_RAX(%rsp) + movq %rbx,TF_RBX(%rsp) + movq %rbp,TF_RBP(%rsp) + movq %r10,TF_R10(%rsp) + movq %r11,TF_R11(%rsp) + movq %r12,TF_R12(%rsp) + movq %r13,TF_R13(%rsp) + movq %r14,TF_R14(%rsp) + movq %r15,TF_R15(%rsp) + SAVE_SEGS + movl $TF_HASSEGS,TF_FLAGS(%rsp) cld + testb $SEL_RPL_MASK,TF_CS(%rsp) /* come from kernel ? */ + jz 1f /* yes, leave PCB_FULL_IRET alone */ + movq PCPU(CURPCB),%r8 + andl $~PCB_FULL_IRET,PCB_FLAGS(%r8) +1: + .endm -#define POP_FRAME \ - movq TF_RDI(%rsp),%rdi ; \ - movq TF_RSI(%rsp),%rsi ; \ - movq TF_RDX(%rsp),%rdx ; \ - movq TF_RCX(%rsp),%rcx ; \ - movq TF_R8(%rsp),%r8 ; \ - movq TF_R9(%rsp),%r9 ; \ - movq TF_RAX(%rsp),%rax ; \ - movq TF_RBX(%rsp),%rbx ; \ - movq TF_RBP(%rsp),%rbp ; \ - movq TF_R10(%rsp),%r10 ; \ - movq TF_R11(%rsp),%r11 ; \ - movq TF_R12(%rsp),%r12 ; \ - movq TF_R13(%rsp),%r13 ; \ - movq TF_R14(%rsp),%r14 ; \ - movq TF_R15(%rsp),%r15 ; \ - testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \ - jz 1f ; /* keep kernel GS.base */ \ - cli ; \ - swapgs ; \ -1: addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */ + .macro INTR_HANDLER vec_name + .text + PTI_INTRENTRY \vec_name + INTR_PUSH_FRAME \vec_name + .endm -/* - * Access per-CPU data. 
- */ -#define PCPU(member) %gs:PC_ ## member -#define PCPU_ADDR(member, reg) \ - movq %gs:PC_PRVSPACE, reg ; \ - addq $PC_ ## member, reg + .macro RESTORE_REGS + movq TF_RDI(%rsp),%rdi + movq TF_RSI(%rsp),%rsi + movq TF_RDX(%rsp),%rdx + movq TF_RCX(%rsp),%rcx + movq TF_R8(%rsp),%r8 + movq TF_R9(%rsp),%r9 + movq TF_RAX(%rsp),%rax + movq TF_RBX(%rsp),%rbx + movq TF_RBP(%rsp),%rbp + movq TF_R10(%rsp),%r10 + movq TF_R11(%rsp),%r11 + movq TF_R12(%rsp),%r12 + movq TF_R13(%rsp),%r13 + movq TF_R14(%rsp),%r14 + movq TF_R15(%rsp),%r15 + .endm #endif /* LOCORE */ diff --git a/sys/amd64/include/frame.h b/sys/amd64/include/frame.h index 0953be7..f0a6fcf 100644 --- a/sys/amd64/include/frame.h +++ b/sys/amd64/include/frame.h @@ -1,6 +1,50 @@ /*- - * This file is in the public domain. + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ */ -/* $FreeBSD$ */ + +#ifndef _AMD64_FRAME_H +#define _AMD64_FRAME_H #include + +struct pti_frame { + register_t pti_rdx; + register_t pti_rax; + register_t pti_err; + register_t pti_rip; + register_t pti_cs; + register_t pti_rflags; + register_t pti_rsp; + register_t pti_ss; +}; + +#endif diff --git a/sys/amd64/include/intr_machdep.h b/sys/amd64/include/intr_machdep.h index e7320e6..29c20b6 100644 --- a/sys/amd64/include/intr_machdep.h +++ b/sys/amd64/include/intr_machdep.h @@ -136,7 +136,7 @@ struct trapframe; /* * The following data structure holds per-cpu data, and is placed just - * above the top of the space used for the NMI stack. + * above the top of the space used for the NMI and MC# stacks. */ struct nmi_pcpu { register_t np_pcpu; diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h index e4c50eb..b81f497 100644 --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -35,9 +35,17 @@ #include extern uint64_t *vm_page_dump; +extern int hw_ibrs_disable; + +/* + * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its + * value is the physical address at which the kernel is loaded. 
+ */ +extern char kernphys[]; struct savefpu; +void amd64_conf_fast_syscall(void); void amd64_db_resume_dbreg(void); void amd64_syscall(struct thread *td, int traced); void doreti_iret(void) __asm(__STRING(doreti_iret)); diff --git a/sys/amd64/include/pcb.h b/sys/amd64/include/pcb.h index 8078073..2b7bb6e 100644 --- a/sys/amd64/include/pcb.h +++ b/sys/amd64/include/pcb.h @@ -90,7 +90,7 @@ struct pcb { /* copyin/out fault recovery */ caddr_t pcb_onfault; - uint64_t pcb_pad0; + uint64_t pcb_saved_ucr3; /* local tss, with i/o bitmap; NULL for common */ struct amd64tss *pcb_tssp; diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h index a4f4e1d..e40c521 100644 --- a/sys/amd64/include/pcpu.h +++ b/sys/amd64/include/pcpu.h @@ -33,6 +33,7 @@ #error "sys/cdefs.h is a prerequisite for this file" #endif +#define PC_PTI_STACK_SZ 16 /* * The SMP parts are setup in pmap.c and locore.s for the BSP, and * mp_machdep.c sets up the data for the AP's to "see" when they awake. @@ -46,8 +47,12 @@ struct pmap *pc_curpmap; \ struct amd64tss *pc_tssp; /* TSS segment active on CPU */ \ struct amd64tss *pc_commontssp;/* Common TSS for the CPU */ \ + uint64_t pc_kcr3; \ + uint64_t pc_ucr3; \ + uint64_t pc_saved_ucr3; \ register_t pc_rsp0; \ register_t pc_scratch_rsp; /* User %rsp in syscall */ \ + register_t pc_scratch_rax; \ u_int pc_apic_id; \ u_int pc_acpi_id; /* ACPI CPU id */ \ /* Pointer to the CPU %fs descriptor */ \ @@ -61,12 +66,14 @@ uint64_t pc_pm_save_cnt; \ u_int pc_cmci_mask; /* MCx banks for CMCI */ \ uint64_t pc_dbreg[16]; /* ddb debugging regs */ \ + uint64_t pc_pti_stack[PC_PTI_STACK_SZ]; \ int pc_dbreg_cmd; /* ddb debugging reg cmd */ \ u_int pc_vcpu_id; /* Xen vCPU ID */ \ uint32_t pc_pcid_next; \ uint32_t pc_pcid_gen; \ uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \ - char __pad[145] /* be divisor of PAGE_SIZE \ + uint32_t pc_ibpb_set; \ + char __pad[96] /* be divisor of PAGE_SIZE \ after cache alignment */ #define PC_DBREG_CMD_NONE 0 diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index a0b8ee3..acf0301 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -223,6 +223,10 @@ #define PMAP_PCID_NONE 0xffffffff #define PMAP_PCID_KERN 0 #define PMAP_PCID_OVERMAX 0x1000 +#define PMAP_PCID_OVERMAX_KERN 0x800 +#define PMAP_PCID_USER_PT 0x800 + +#define PMAP_NO_CR3 (~0UL) #ifndef LOCORE @@ -313,7 +317,9 @@ struct pmap_pcids { struct pmap { struct mtx pm_mtx; pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ + pml4_entry_t *pm_pml4u; /* KVA of user l4 page table */ uint64_t pm_cr3; + uint64_t pm_ucr3; TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ cpuset_t pm_active; /* active on cpus */ enum pmap_type pm_type; /* regular or nested tables */ @@ -419,6 +425,12 @@ void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +void pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec); +void pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva); +void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3); +void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va); +void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva, + vm_offset_t eva); #endif /* _KERNEL */ /* Return various clipped indexes for a given VA */ diff --git a/sys/amd64/include/smp.h 
b/sys/amd64/include/smp.h index d97c730..64135bc 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -28,12 +28,36 @@ extern u_int32_t mptramp_pagetables; /* IPI handlers */ inthand_t + IDTVEC(justreturn), /* interrupt CPU with minimum overhead */ + IDTVEC(justreturn1_pti), + IDTVEC(invltlb_pti), + IDTVEC(invltlb_pcid_pti), IDTVEC(invltlb_pcid), /* TLB shootdowns - global, pcid */ - IDTVEC(invltlb_invpcid),/* TLB shootdowns - global, invpcid */ - IDTVEC(justreturn); /* interrupt CPU with minimum overhead */ + IDTVEC(invltlb_invpcid_pti_pti), + IDTVEC(invltlb_invpcid_nopti), + IDTVEC(invlpg_pti), + IDTVEC(invlpg_invpcid_pti), + IDTVEC(invlpg_invpcid), + IDTVEC(invlpg_pcid_pti), + IDTVEC(invlpg_pcid), + IDTVEC(invlrng_pti), + IDTVEC(invlrng_invpcid_pti), + IDTVEC(invlrng_invpcid), + IDTVEC(invlrng_pcid_pti), + IDTVEC(invlrng_pcid), + IDTVEC(invlcache_pti), + IDTVEC(ipi_intr_bitmap_handler_pti), + IDTVEC(cpustop_pti), + IDTVEC(cpususpend_pti), + IDTVEC(rendezvous_pti); void invltlb_pcid_handler(void); void invltlb_invpcid_handler(void); +void invltlb_invpcid_pti_handler(void); +void invlpg_invpcid_handler(void); +void invlpg_pcid_handler(void); +void invlrng_invpcid_handler(void); +void invlrng_pcid_handler(void); int native_start_all_aps(void); #endif /* !LOCORE */ diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 517a374..0edfe51 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -693,7 +693,8 @@ vmx_init(int ipinum) MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, &tmp); if (error == 0) { - pirvec = lapic_ipi_alloc(&IDTVEC(justreturn)); + pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); if (pirvec < 0) { if (bootverbose) { printf("vmx_init: unable to allocate " diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 537454a..2118c13 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -325,7 +326,8 @@ vmm_init(void) vmm_host_state_init(); - vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn)); + vmm_ipinum = lapic_ipi_alloc(pti ? 
&IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); if (vmm_ipinum < 0) vmm_ipinum = IPI_AST; diff --git a/sys/conf/Makefile.amd64 b/sys/conf/Makefile.amd64 index 696ef55..9c10c77 100644 --- a/sys/conf/Makefile.amd64 +++ b/sys/conf/Makefile.amd64 @@ -39,6 +39,7 @@ CFLAGS+= -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer ASM_CFLAGS.acpi_wakecode.S= ${CLANG_NO_IAS34} ASM_CFLAGS.mpboot.S= ${CLANG_NO_IAS34} +ASM_CFLAGS.support.S= ${CLANG_NO_IAS} %BEFORE_DEPEND diff --git a/sys/conf/newvers.sh b/sys/conf/newvers.sh index 0a0893e..2341b62 100644 --- a/sys/conf/newvers.sh +++ b/sys/conf/newvers.sh @@ -44,7 +44,7 @@ TYPE="FreeBSD" REVISION="11.1" -BRANCH="RELEASE-p7" +BRANCH="RELEASE-p8" if [ -n "${BRANCH_OVERRIDE}" ]; then BRANCH=${BRANCH_OVERRIDE} fi diff --git a/sys/dev/cpuctl/cpuctl.c b/sys/dev/cpuctl/cpuctl.c index 5351d8ed..a841400 100644 --- a/sys/dev/cpuctl/cpuctl.c +++ b/sys/dev/cpuctl/cpuctl.c @@ -71,6 +71,7 @@ static int cpuctl_do_cpuid(int cpu, cpuctl_cpuid_args_t *data, struct thread *td); static int cpuctl_do_cpuid_count(int cpu, cpuctl_cpuid_count_args_t *data, struct thread *td); +static int cpuctl_do_eval_cpu_features(int cpu, struct thread *td); static int cpuctl_do_update(int cpu, cpuctl_update_args_t *data, struct thread *td); static int update_intel(int cpu, cpuctl_update_args_t *args, @@ -157,7 +158,8 @@ cpuctl_ioctl(struct cdev *dev, u_long cmd, caddr_t data, } /* Require write flag for "write" requests. */ if ((cmd == CPUCTL_MSRCBIT || cmd == CPUCTL_MSRSBIT || - cmd == CPUCTL_UPDATE || cmd == CPUCTL_WRMSR) && + cmd == CPUCTL_UPDATE || cmd == CPUCTL_WRMSR || + cmd == CPUCTL_EVAL_CPU_FEATURES) && (flags & FWRITE) == 0) return (EPERM); switch (cmd) { @@ -185,6 +187,9 @@ cpuctl_ioctl(struct cdev *dev, u_long cmd, caddr_t data, ret = cpuctl_do_cpuid_count(cpu, (cpuctl_cpuid_count_args_t *)data, td); break; + case CPUCTL_EVAL_CPU_FEATURES: + ret = cpuctl_do_eval_cpu_features(cpu, td); + break; default: ret = EINVAL; break; @@ -502,6 +507,30 @@ fail: return (ret); } +static int +cpuctl_do_eval_cpu_features(int cpu, struct thread *td) +{ + int is_bound = 0; + int oldcpu; + + KASSERT(cpu >= 0 && cpu <= mp_maxid, + ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu)); + +#ifdef __i386__ + if (cpu_id == 0) + return (ENODEV); +#endif + oldcpu = td->td_oncpu; + is_bound = cpu_sched_is_bound(td); + set_cpu(cpu, td); + identify_cpu1(); + identify_cpu2(); + hw_ibrs_recalculate(); + restore_cpu(oldcpu, is_bound, td); + printcpuinfo(); + return (0); +} + int cpuctl_open(struct cdev *dev, int flags, int fmt __unused, struct thread *td) { diff --git a/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S b/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S index 8d09e24..6e396f3 100644 --- a/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S +++ b/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S @@ -26,19 +26,18 @@ * $FreeBSD$ */ +#include "assym.s" + #include #include -#include "assym.s" - /* * This is the Hyper-V vmbus channel direct callback interrupt. * Only used when it is running on Hyper-V. 
*/ .text SUPERALIGN_TEXT -IDTVEC(vmbus_isr) - PUSH_FRAME + INTR_HANDLER vmbus_isr FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi call vmbus_handle_intr diff --git a/sys/dev/hyperv/vmbus/i386/vmbus_vector.S b/sys/dev/hyperv/vmbus/i386/vmbus_vector.S index b9ea849..9e28ef6 100644 --- a/sys/dev/hyperv/vmbus/i386/vmbus_vector.S +++ b/sys/dev/hyperv/vmbus/i386/vmbus_vector.S @@ -37,6 +37,7 @@ */ .text SUPERALIGN_TEXT +IDTVEC(vmbus_isr_pti) IDTVEC(vmbus_isr) PUSH_FRAME SET_KERNEL_SREGS diff --git a/sys/dev/hyperv/vmbus/vmbus.c b/sys/dev/hyperv/vmbus/vmbus.c index 9999901..c0faada 100644 --- a/sys/dev/hyperv/vmbus/vmbus.c +++ b/sys/dev/hyperv/vmbus/vmbus.c @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -128,7 +129,7 @@ static void vmbus_event_proc_dummy(struct vmbus_softc *, static struct vmbus_softc *vmbus_sc; -extern inthand_t IDTVEC(vmbus_isr); +extern inthand_t IDTVEC(vmbus_isr), IDTVEC(vmbus_isr_pti); static const uint32_t vmbus_version[] = { VMBUS_VERSION_WIN8_1, @@ -928,7 +929,8 @@ vmbus_intr_setup(struct vmbus_softc *sc) * All Hyper-V ISR required resources are setup, now let's find a * free IDT vector for Hyper-V ISR and set it up. */ - sc->vmbus_idtvec = lapic_ipi_alloc(IDTVEC(vmbus_isr)); + sc->vmbus_idtvec = lapic_ipi_alloc(pti ? IDTVEC(vmbus_isr_pti) : + IDTVEC(vmbus_isr)); if (sc->vmbus_idtvec < 0) { device_printf(sc->vmbus_dev, "cannot find free IDT vector\n"); return ENXIO; diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s index 9d56b93..944a236 100644 --- a/sys/i386/i386/apic_vector.s +++ b/sys/i386/i386/apic_vector.s @@ -70,6 +70,7 @@ as_lapic_eoi: #define ISR_VEC(index, vec_name) \ .text ; \ SUPERALIGN_TEXT ; \ +IDTVEC(vec_name ## _pti) ; \ IDTVEC(vec_name) ; \ PUSH_FRAME ; \ SET_KERNEL_SREGS ; \ @@ -123,6 +124,7 @@ IDTVEC(spuriousint) */ .text SUPERALIGN_TEXT +IDTVEC(timerint_pti) IDTVEC(timerint) PUSH_FRAME SET_KERNEL_SREGS @@ -139,6 +141,7 @@ IDTVEC(timerint) */ .text SUPERALIGN_TEXT +IDTVEC(cmcint_pti) IDTVEC(cmcint) PUSH_FRAME SET_KERNEL_SREGS @@ -153,6 +156,7 @@ IDTVEC(cmcint) */ .text SUPERALIGN_TEXT +IDTVEC(errorint_pti) IDTVEC(errorint) PUSH_FRAME SET_KERNEL_SREGS diff --git a/sys/i386/i386/atpic_vector.s b/sys/i386/i386/atpic_vector.s index a477aee..a7b8894 100644 --- a/sys/i386/i386/atpic_vector.s +++ b/sys/i386/i386/atpic_vector.s @@ -46,6 +46,7 @@ #define INTR(irq_num, vec_name) \ .text ; \ SUPERALIGN_TEXT ; \ +IDTVEC(vec_name ##_pti) ; \ IDTVEC(vec_name) ; \ PUSH_FRAME ; \ SET_KERNEL_SREGS ; \ diff --git a/sys/i386/i386/exception.s b/sys/i386/i386/exception.s index 73c67fe..362aa2c 100644 --- a/sys/i386/i386/exception.s +++ b/sys/i386/i386/exception.s @@ -133,6 +133,7 @@ IDTVEC(page) TRAP(T_PAGEFLT) IDTVEC(mchk) pushl $0; TRAP(T_MCHK) +IDTVEC(rsvd_pti) IDTVEC(rsvd) pushl $0; TRAP(T_RESERVED) IDTVEC(fpu) diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index fcc804c..24c1fe8 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -2577,7 +2577,7 @@ init386(int first) GSEL(GCODE_SEL, SEL_KPL)); #endif #ifdef XENHVM - setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_UPL, + setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 94b0d17..0d3fc91 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -283,6 +283,8 @@ SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx 
PMAP2mutex; +int pti; + static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); @@ -1043,7 +1045,7 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } - smp_masked_invlpg(*mask, va); + smp_masked_invlpg(*mask, va, pmap); sched_unpin(); } @@ -1077,7 +1079,7 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } - smp_masked_invlpg_range(*mask, sva, eva); + smp_masked_invlpg_range(*mask, sva, eva, pmap); sched_unpin(); } diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s index d569970..feffc15 100644 --- a/sys/i386/i386/support.s +++ b/sys/i386/i386/support.s @@ -830,3 +830,11 @@ msr_onfault: movl $0,PCB_ONFAULT(%ecx) movl $EFAULT,%eax ret + +ENTRY(handle_ibrs_entry) + ret +END(handle_ibrs_entry) + +ENTRY(handle_ibrs_exit) + ret +END(handle_ibrs_exit) diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c index 2b35f5c..64577c1 100644 --- a/sys/i386/i386/vm_machdep.c +++ b/sys/i386/i386/vm_machdep.c @@ -795,7 +795,7 @@ sf_buf_shootdown(struct sf_buf *sf, int flags) CPU_NAND(&other_cpus, &sf->cpumask); if (!CPU_EMPTY(&other_cpus)) { CPU_OR(&sf->cpumask, &other_cpus); - smp_masked_invlpg(other_cpus, sf->kva); + smp_masked_invlpg(other_cpus, sf->kva, kernel_pmap); } } sched_unpin(); diff --git a/sys/sys/cpuctl.h b/sys/sys/cpuctl.h index 30af524..65556ec 100644 --- a/sys/sys/cpuctl.h +++ b/sys/sys/cpuctl.h @@ -57,5 +57,6 @@ typedef struct { #define CPUCTL_MSRSBIT _IOWR('c', 5, cpuctl_msr_args_t) #define CPUCTL_MSRCBIT _IOWR('c', 6, cpuctl_msr_args_t) #define CPUCTL_CPUID_COUNT _IOWR('c', 7, cpuctl_cpuid_count_args_t) +#define CPUCTL_EVAL_CPU_FEATURES _IO('c', 8) #endif /* _CPUCTL_H_ */ diff --git a/sys/x86/include/apicvar.h b/sys/x86/include/apicvar.h index ba3a237..cd94ed5 100644 --- a/sys/x86/include/apicvar.h +++ b/sys/x86/include/apicvar.h @@ -179,7 +179,11 @@ inthand_t IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3), IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6), IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint), - IDTVEC(spuriousint), IDTVEC(timerint); + IDTVEC(spuriousint), IDTVEC(timerint), + IDTVEC(apic_isr1_pti), IDTVEC(apic_isr2_pti), IDTVEC(apic_isr3_pti), + IDTVEC(apic_isr4_pti), IDTVEC(apic_isr5_pti), IDTVEC(apic_isr6_pti), + IDTVEC(apic_isr7_pti), IDTVEC(cmcint_pti), IDTVEC(errorint_pti), + IDTVEC(spuriousint_pti), IDTVEC(timerint_pti); extern vm_paddr_t lapic_paddr; extern int apic_cpuids[]; diff --git a/sys/x86/include/specialreg.h b/sys/x86/include/specialreg.h index 10bc4e7b..04b2489 100644 --- a/sys/x86/include/specialreg.h +++ b/sys/x86/include/specialreg.h @@ -374,6 +374,17 @@ #define CPUID_STDEXT2_SGXLC 0x40000000 /* + * CPUID instruction 7 Structured Extended Features, leaf 0 edx info + */ +#define CPUID_STDEXT3_IBPB 0x04000000 +#define CPUID_STDEXT3_STIBP 0x08000000 +#define CPUID_STDEXT3_ARCH_CAP 0x20000000 + +/* MSR IA32_ARCH_CAP(ABILITIES) bits */ +#define IA32_ARCH_CAP_RDCL_NO 0x00000001 +#define IA32_ARCH_CAP_IBRS_ALL 0x00000002 + +/* * CPUID manufacturers identifiers */ #define AMD_VENDOR_ID "AuthenticAMD" @@ -401,6 +412,8 @@ #define MSR_EBL_CR_POWERON 0x02a #define MSR_TEST_CTL 0x033 #define MSR_IA32_FEATURE_CONTROL 0x03a +#define MSR_IA32_SPEC_CTRL 0x048 +#define MSR_IA32_PRED_CMD 0x049 #define MSR_BIOS_UPDT_TRIG 0x079 #define MSR_BBL_CR_D0 0x088 #define MSR_BBL_CR_D1 0x089 
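The three CPUID_STDEXT3_* bits defined above live in CPUID.(EAX=7,ECX=0) %edx; identcpu.c later in this patch stores that register in cpu_stdext_feature3, and hw_ibrs_recalculate() keys off CPUID_STDEXT3_IBPB. As a minimal illustration (not part of the patch, and assuming a GCC/Clang toolchain that provides __get_cpuid_count() in <cpuid.h>), the same bits can be decoded from userland:

#include <cpuid.h>
#include <stdio.h>

/* Bit values copied from the CPUID_STDEXT3_* defines added above. */
#define STDEXT3_IBPB		0x04000000	/* IBRS and IBPB supported */
#define STDEXT3_STIBP		0x08000000	/* STIBP supported */
#define STDEXT3_ARCH_CAP	0x20000000	/* IA32_ARCH_CAP MSR present */

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 7, subleaf 0: structured extended features. */
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
		printf("CPUID leaf 7 not supported\n");
		return (1);
	}
	printf("IBRS/IBPB: %s\n", (edx & STDEXT3_IBPB) ? "yes" : "no");
	printf("STIBP:     %s\n", (edx & STDEXT3_STIBP) ? "yes" : "no");
	printf("ARCH_CAP:  %s\n", (edx & STDEXT3_ARCH_CAP) ? "yes" : "no");
	return (0);
}

On a kernel with this change applied, the same information appears in the boot-time "Structured Extended Features3=..." line printed by printcpuinfo().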
@@ -413,6 +426,7 @@ #define MSR_APERF 0x0e8 #define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */ #define MSR_MTRRcap 0x0fe +#define MSR_IA32_ARCH_CAP 0x10a #define MSR_BBL_CR_ADDR 0x116 #define MSR_BBL_CR_DECC 0x118 #define MSR_BBL_CR_CTL 0x119 @@ -556,6 +570,17 @@ #define IA32_MISC_EN_XDD 0x0000000400000000ULL /* + * IA32_SPEC_CTRL and IA32_PRED_CMD MSRs are described in the Intel' + * document 336996-001 Speculative Execution Side Channel Mitigations. + */ +/* MSR IA32_SPEC_CTRL */ +#define IA32_SPEC_CTRL_IBRS 0x00000001 +#define IA32_SPEC_CTRL_STIBP 0x00000002 + +/* MSR IA32_PRED_CMD */ +#define IA32_PRED_CMD_IBPB_BARRIER 0x0000000000000001ULL + +/* * PAT modes. */ #define PAT_UNCACHEABLE 0x00 diff --git a/sys/x86/include/x86_smp.h b/sys/x86/include/x86_smp.h index 84a0eba..8d5980c 100644 --- a/sys/x86/include/x86_smp.h +++ b/sys/x86/include/x86_smp.h @@ -37,6 +37,7 @@ extern int cpu_logical; extern int cpu_cores; extern volatile uint32_t smp_tlb_generation; extern struct pmap *smp_tlb_pmap; +extern vm_offset_t smp_tlb_addr1, smp_tlb_addr2; extern u_int xhits_gbl[]; extern u_int xhits_pg[]; extern u_int xhits_rng[]; @@ -95,9 +96,9 @@ void ipi_selected(cpuset_t cpus, u_int ipi); u_int mp_bootaddress(u_int); void set_interrupt_apic_ids(void); void smp_cache_flush(void); -void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr); +void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, struct pmap *pmap); void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva, - vm_offset_t endva); + vm_offset_t endva, struct pmap *pmap); void smp_masked_invltlb(cpuset_t mask, struct pmap *pmap); void mem_range_AP_init(void); void topo_probe(void); diff --git a/sys/x86/include/x86_var.h b/sys/x86/include/x86_var.h index 92c9f1d..dc7d424 100644 --- a/sys/x86/include/x86_var.h +++ b/sys/x86/include/x86_var.h @@ -50,6 +50,8 @@ extern u_int via_feature_xcrypt; extern u_int cpu_clflush_line_size; extern u_int cpu_stdext_feature; extern u_int cpu_stdext_feature2; +extern u_int cpu_stdext_feature3; +extern uint64_t cpu_ia32_arch_caps; extern u_int cpu_fxsr; extern u_int cpu_high; extern u_int cpu_id; @@ -78,6 +80,7 @@ extern int _ufssel; extern int _ugssel; extern int use_xsave; extern uint64_t xsave_mask; +extern int pti; struct pcb; struct thread; @@ -115,18 +118,24 @@ void cpu_probe_amdc1e(void); void cpu_setregs(void); void dump_add_page(vm_paddr_t); void dump_drop_page(vm_paddr_t); -void identify_cpu(void); +void finishidentcpu(void); +void identify_cpu1(void); +void identify_cpu2(void); void initializecpu(void); void initializecpucache(void); bool fix_cpuid(void); void fillw(int /*u_short*/ pat, void *base, size_t cnt); int is_physical_memory(vm_paddr_t addr); int isa_nmi(int cd); +void handle_ibrs_entry(void); +void handle_ibrs_exit(void); +void hw_ibrs_recalculate(void); void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame); void nmi_call_kdb_smp(u_int type, struct trapframe *frame); void nmi_handle_intr(u_int type, struct trapframe *frame); void pagecopy(void *from, void *to); void printcpuinfo(void); +int pti_get_default(void); int user_dbreg_trap(void); int minidumpsys(struct dumperinfo *); struct pcb *get_pcb_td(struct thread *td); diff --git a/sys/x86/isa/atpic.c b/sys/x86/isa/atpic.c index 43504e7..0364919 100644 --- a/sys/x86/isa/atpic.c +++ b/sys/x86/isa/atpic.c @@ -86,6 +86,16 @@ inthand_t IDTVEC(atpic_intr9), IDTVEC(atpic_intr10), IDTVEC(atpic_intr11), IDTVEC(atpic_intr12), IDTVEC(atpic_intr13), IDTVEC(atpic_intr14), IDTVEC(atpic_intr15); +/* XXXKIB i386 
+inthand_t
+ IDTVEC(atpic_intr0_pti), IDTVEC(atpic_intr1_pti),
+ IDTVEC(atpic_intr2_pti), IDTVEC(atpic_intr3_pti),
+ IDTVEC(atpic_intr4_pti), IDTVEC(atpic_intr5_pti),
+ IDTVEC(atpic_intr6_pti), IDTVEC(atpic_intr7_pti),
+ IDTVEC(atpic_intr8_pti), IDTVEC(atpic_intr9_pti),
+ IDTVEC(atpic_intr10_pti), IDTVEC(atpic_intr11_pti),
+ IDTVEC(atpic_intr12_pti), IDTVEC(atpic_intr13_pti),
+ IDTVEC(atpic_intr14_pti), IDTVEC(atpic_intr15_pti);

 #define IRQ(ap, ai) ((ap)->at_irqbase + (ai)->at_irq)

@@ -98,7 +108,7 @@ inthand_t

 #define INTSRC(irq) \
 { { &atpics[(irq) / 8].at_pic }, IDTVEC(atpic_intr ## irq ), \
- (irq) % 8 }
+ IDTVEC(atpic_intr ## irq ## _pti), (irq) % 8 }

 struct atpic {
 struct pic at_pic;
@@ -110,7 +120,7 @@ struct atpic {

 struct atpic_intsrc {
 struct intsrc at_intsrc;
- inthand_t *at_intr;
+ inthand_t *at_intr, *at_intr_pti;
 int at_irq; /* Relative to PIC base. */
 enum intr_trigger at_trigger;
 u_long at_count;
@@ -435,7 +445,8 @@ atpic_startup(void)
 ai->at_intsrc.is_count = &ai->at_count;
 ai->at_intsrc.is_straycount = &ai->at_straycount;
 setidt(((struct atpic *)ai->at_intsrc.is_pic)->at_intbase +
- ai->at_irq, ai->at_intr, SDT_ATPIC, SEL_KPL, GSEL_ATPIC);
+ ai->at_irq, pti ? ai->at_intr_pti : ai->at_intr, SDT_ATPIC,
+ SEL_KPL, GSEL_ATPIC);
 }

 #ifdef DEV_MCA
diff --git a/sys/x86/x86/cpu_machdep.c b/sys/x86/x86/cpu_machdep.c
index c2d42a9..9449d3e 100644
--- a/sys/x86/x86/cpu_machdep.c
+++ b/sys/x86/x86/cpu_machdep.c
@@ -139,6 +139,12 @@ acpi_cpu_idle_mwait(uint32_t mwait_hint)
 int *state;

 /*
+ * A comment in a Linux patch claims that 'CPUs run faster with
+ * speculation protection disabled. All CPU threads in a core
+ * must disable speculation protection for it to be
+ * disabled. Disable it while we are idle so the other
+ * hyperthread can run fast.'
+ *
 * XXXKIB. Software coordination mode should be supported,
 * but all Intel CPUs provide hardware coordination.
 */
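The handle_ibrs_entry()/handle_ibrs_exit() pair wired into the idle loop below is implemented in assembly on amd64 (sys/amd64/amd64/support.S in this patch; the i386 versions shown earlier are still empty stubs). A rough, illustrative C-level model of the pair, under the assumption that it simply toggles the IBRS bit defined above whenever the mitigation is active; the committed assembly is the authority here:

	/*
	 * Kernel-context sketch only: needs <machine/cpufunc.h> for
	 * rdmsr()/wrmsr() and <machine/specialreg.h> for the MSR bits
	 * added by this patch.
	 */
	extern int hw_ibrs_active;

	static inline void
	ibrs_entry_model(void)
	{
		/* Restrict indirect-branch speculation in the kernel. */
		if (hw_ibrs_active)
			wrmsr(MSR_IA32_SPEC_CTRL,
			    rdmsr(MSR_IA32_SPEC_CTRL) | IA32_SPEC_CTRL_IBRS);
	}

	static inline void
	ibrs_exit_model(void)
	{
		/* Lift the restriction again when leaving the kernel. */
		if (hw_ibrs_active)
			wrmsr(MSR_IA32_SPEC_CTRL,
			    rdmsr(MSR_IA32_SPEC_CTRL) &
			    ~(uint64_t)IA32_SPEC_CTRL_IBRS);
	}

Note that on CPUs reporting IBRS_ALL, hw_ibrs_recalculate() further down writes the MSR once and leaves hw_ibrs_active clear, so this per-entry toggling is skipped entirely in that case.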
@@ -147,9 +153,11 @@ acpi_cpu_idle_mwait(uint32_t mwait_hint)
 KASSERT(*state == STATE_SLEEPING,
 ("cpu_mwait_cx: wrong monitorbuf state"));
 *state = STATE_MWAIT;
+ handle_ibrs_entry();
 cpu_monitor(state, 0, 0);
 if (*state == STATE_MWAIT)
 cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
+ handle_ibrs_exit();

 /*
 * We should exit on any event that interrupts mwait, because
@@ -578,3 +586,47 @@ nmi_handle_intr(u_int type, struct trapframe *frame)
 nmi_call_kdb(PCPU_GET(cpuid), type, frame);
 #endif
 }
+
+int hw_ibrs_active;
+int hw_ibrs_disable = 1;
+
+SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
+ "Indirect Branch Restricted Speculation active");
+
+void
+hw_ibrs_recalculate(void)
+{
+ uint64_t v;
+
+ if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
+ if (hw_ibrs_disable) {
+ v = rdmsr(MSR_IA32_SPEC_CTRL);
+ v &= ~(uint64_t)IA32_SPEC_CTRL_IBRS;
+ wrmsr(MSR_IA32_SPEC_CTRL, v);
+ } else {
+ v = rdmsr(MSR_IA32_SPEC_CTRL);
+ v |= IA32_SPEC_CTRL_IBRS;
+ wrmsr(MSR_IA32_SPEC_CTRL, v);
+ }
+ return;
+ }
+ hw_ibrs_active = (cpu_stdext_feature3 & CPUID_STDEXT3_IBPB) != 0 &&
+ !hw_ibrs_disable;
+}
+
+static int
+hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = hw_ibrs_disable;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ hw_ibrs_disable = val != 0;
+ hw_ibrs_recalculate();
+ return (0);
+}
+SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
+ CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
+ "Disable Indirect Branch Restricted Speculation");
diff --git a/sys/x86/x86/identcpu.c b/sys/x86/x86/identcpu.c
index 22f3966..dd58037 100644
--- a/sys/x86/x86/identcpu.c
+++ b/sys/x86/x86/identcpu.c
@@ -104,8 +104,10 @@ u_int cpu_vendor_id; /* CPU vendor ID */
 u_int cpu_fxsr; /* SSE enabled */
 u_int cpu_mxcsr_mask; /* Valid bits in mxcsr */
 u_int cpu_clflush_line_size = 32;
-u_int cpu_stdext_feature;
-u_int cpu_stdext_feature2;
+u_int cpu_stdext_feature; /* %ebx */
+u_int cpu_stdext_feature2; /* %ecx */
+u_int cpu_stdext_feature3; /* %edx */
+uint64_t cpu_ia32_arch_caps;
 u_int cpu_max_ext_state_size;
 u_int cpu_mon_mwait_flags; /* MONITOR/MWAIT flags (CPUID.05H.ECX) */
 u_int cpu_mon_min_size; /* MONITOR minimum range size, bytes */
@@ -978,6 +980,16 @@ printcpuinfo(void)
 );
 }

+ if (cpu_stdext_feature3 != 0) {
+ printf("\n Structured Extended Features3=0x%b",
+ cpu_stdext_feature3,
+ "\020"
+ "\033IBPB"
+ "\034STIBP"
+ "\036ARCH_CAP"
+ );
+ }
+
 if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
 cpuid_count(0xd, 0x1, regs);
 if (regs[0] != 0) {
@@ -991,6 +1003,15 @@ printcpuinfo(void)
 }
 }

+ if (cpu_ia32_arch_caps != 0) {
+ printf("\n IA32_ARCH_CAPS=0x%b",
+ (u_int)cpu_ia32_arch_caps,
+ "\020"
+ "\001RDCL_NO"
+ "\002IBRS_ALL"
+ );
+ }
+
 if (via_feature_rng != 0 || via_feature_xcrypt != 0)
 print_via_padlock_info();
@@ -1370,23 +1391,11 @@ fix_cpuid(void)
 return (false);
 }
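Before the identification refactor that follows, a note on observability: since hw.ibrs_active and hw.ibrs_disable are plain integer OIDs, the mitigation state can be checked programmatically with sysctlbyname(3) as well as with sysctl(8). A small illustrative reader, assuming a kernel built with this patch:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		int active, disable;
		size_t len;

		len = sizeof(active);
		if (sysctlbyname("hw.ibrs_active", &active, &len,
		    NULL, 0) == -1)
			return (1);	/* kernel without this patch */
		len = sizeof(disable);
		if (sysctlbyname("hw.ibrs_disable", &disable, &len,
		    NULL, 0) == -1)
			return (1);
		printf("hw.ibrs_active=%d hw.ibrs_disable=%d\n",
		    active, disable);
		return (0);
	}

Because the OID is declared with CTLFLAG_RWTUN, the same hw.ibrs_disable knob can also be preset from loader.conf before boot.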
-/*
- * Final stage of CPU identification.
- */
-#ifdef __i386__
 void
-finishidentcpu(void)
-#else
-void
-identify_cpu(void)
-#endif
+identify_cpu1(void)
 {
- u_int regs[4], cpu_stdext_disable;
-#ifdef __i386__
- u_char ccr3;
-#endif
+ u_int regs[4];

-#ifdef __amd64__
 do_cpuid(0, regs);
 cpu_high = regs[0];
 ((u_int *)&cpu_vendor)[0] = regs[1];
@@ -1399,6 +1408,44 @@ identify_cpu(void)
 cpu_procinfo = regs[1];
 cpu_feature = regs[3];
 cpu_feature2 = regs[2];
+}
+
+void
+identify_cpu2(void)
+{
+ u_int regs[4], cpu_stdext_disable;
+
+ if (cpu_high >= 7) {
+ cpuid_count(7, 0, regs);
+ cpu_stdext_feature = regs[1];
+
+ /*
+ * Some hypervisors fail to filter out unsupported
+ * extended features. Allow the user to disable any
+ * extensions whose activation requires setting a bit
+ * in CR4, and which VM monitors do not support.
+ */
+ cpu_stdext_disable = 0;
+ TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable);
+ cpu_stdext_feature &= ~cpu_stdext_disable;
+
+ cpu_stdext_feature2 = regs[2];
+ cpu_stdext_feature3 = regs[3];
+
+ if ((cpu_stdext_feature3 & CPUID_STDEXT3_ARCH_CAP) != 0)
+ cpu_ia32_arch_caps = rdmsr(MSR_IA32_ARCH_CAP);
+ }
+}
+
+/*
+ * Final stage of CPU identification.
+ */
+void
+finishidentcpu(void)
+{
+ u_int regs[4];
+#ifdef __i386__
+ u_char ccr3;
 #endif

 identify_hypervisor();
@@ -1416,25 +1463,7 @@ identify_cpu(void)
 cpu_mon_max_size = regs[1] & CPUID5_MON_MAX_SIZE;
 }

- if (cpu_high >= 7) {
- cpuid_count(7, 0, regs);
- cpu_stdext_feature = regs[1];
-
- /*
- * Some hypervisors fail to filter out unsupported
- * extended features. For now, disable the
- * extensions, activation of which requires setting a
- * bit in CR4, and which VM monitors do not support.
- */
- if (cpu_feature2 & CPUID2_HV) {
- cpu_stdext_disable = CPUID_STDEXT_FSGSBASE |
- CPUID_STDEXT_SMEP;
- } else
- cpu_stdext_disable = 0;
- TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable);
- cpu_stdext_feature &= ~cpu_stdext_disable;
- cpu_stdext_feature2 = regs[2];
- }
+ identify_cpu2();

 #ifdef __i386__
 if (cpu_high > 0 &&
@@ -1563,6 +1592,17 @@ identify_cpu(void)
 #endif
 }

+int
+pti_get_default(void)
+{
+
+ if (strcmp(cpu_vendor, AMD_VENDOR_ID) == 0)
+ return (0);
+ if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) != 0)
+ return (0);
+ return (1);
+}
+
 static u_int
 find_cpu_vendor_id(void)
 {
diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c
index 11041d4..085a28f 100644
--- a/sys/x86/x86/local_apic.c
+++ b/sys/x86/x86/local_apic.c
@@ -166,13 +166,23 @@ static inthand_t *ioint_handlers[] = {
 IDTVEC(apic_isr7), /* 224 - 255 */
 };

+static inthand_t *ioint_pti_handlers[] = {
+ NULL, /* 0 - 31 */
+ IDTVEC(apic_isr1_pti), /* 32 - 63 */
+ IDTVEC(apic_isr2_pti), /* 64 - 95 */
+ IDTVEC(apic_isr3_pti), /* 96 - 127 */
+ IDTVEC(apic_isr4_pti), /* 128 - 159 */
+ IDTVEC(apic_isr5_pti), /* 160 - 191 */
+ IDTVEC(apic_isr6_pti), /* 192 - 223 */
+ IDTVEC(apic_isr7_pti), /* 224 - 255 */
+};

 static u_int32_t lapic_timer_divisors[] = {
 APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
 APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
 };

-extern inthand_t IDTVEC(rsvd);
+extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd);

 volatile char *lapic_map;
 vm_paddr_t lapic_paddr;
@@ -489,15 +499,18 @@ native_lapic_init(vm_paddr_t addr)
 PCPU_SET(apic_id, lapic_id());

 /* Local APIC timer interrupt. */
- setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC);
+ setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint),
+ SDT_APIC, SEL_KPL, GSEL_APIC);

 /* Local APIC error interrupt.
*/ - setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC); + setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint), + SDT_APIC, SEL_KPL, GSEL_APIC); /* XXX: Thermal interrupt */ /* Local APIC CMCI. */ - setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC); + setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint), + SDT_APICT, SEL_KPL, GSEL_APIC); if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) { arat = 0; @@ -1561,8 +1574,8 @@ native_apic_enable_vector(u_int apic_id, u_int vector) KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif - setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL, - GSEL_APIC); + setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32], + SDT_APIC, SEL_KPL, GSEL_APIC); } static void @@ -1581,7 +1594,8 @@ native_apic_disable_vector(u_int apic_id, u_int vector) * We can not currently clear the idt entry because other cpus * may have a valid vector at this offset. */ - setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC); + setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT, + SEL_KPL, GSEL_APIC); #endif } @@ -2084,14 +2098,16 @@ native_lapic_ipi_alloc(inthand_t *ipifunc) long func; int idx, vector; - KASSERT(ipifunc != &IDTVEC(rsvd), ("invalid ipifunc %p", ipifunc)); + KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti), + ("invalid ipifunc %p", ipifunc)); vector = -1; mtx_lock_spin(&icu_lock); for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) { ip = &idt[idx]; func = (ip->gd_hioffset << 16) | ip->gd_looffset; - if (func == (uintptr_t)&IDTVEC(rsvd)) { + if ((!pti && func == (uintptr_t)&IDTVEC(rsvd)) || + (pti && func == (uintptr_t)&IDTVEC(rsvd_pti))) { vector = idx; setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC); break; @@ -2113,8 +2129,10 @@ native_lapic_ipi_free(int vector) mtx_lock_spin(&icu_lock); ip = &idt[vector]; func = (ip->gd_hioffset << 16) | ip->gd_looffset; - KASSERT(func != (uintptr_t)&IDTVEC(rsvd), + KASSERT(func != (uintptr_t)&IDTVEC(rsvd) && + func != (uintptr_t)&IDTVEC(rsvd_pti), ("invalid idtfunc %#lx", func)); - setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC); + setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT, + SEL_KPL, GSEL_APIC); mtx_unlock_spin(&icu_lock); } diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c index 7cc02d6..cd10782 100644 --- a/sys/x86/x86/mp_x86.c +++ b/sys/x86/x86/mp_x86.c @@ -1436,7 +1436,7 @@ SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); */ /* Variables needed for SMP tlb shootdown. 
*/
-static vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
+vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 pmap_t smp_tlb_pmap;
 volatile uint32_t smp_tlb_generation;
@@ -1509,11 +1509,11 @@ smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
 }

 void
-smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
+smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap)
 {

 if (smp_started) {
- smp_targeted_tlb_shootdown(mask, IPI_INVLPG, NULL, addr, 0);
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
 #ifdef COUNT_XINVLTLB_HITS
 ipi_page++;
 #endif
 }
 }
@@ -1521,11 +1521,12 @@ smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)

 void
-smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
+smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
+ pmap_t pmap)
 {

 if (smp_started) {
- smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, NULL,
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap,
 addr1, addr2);
 #ifdef COUNT_XINVLTLB_HITS
 ipi_range++;
diff --git a/sys/x86/xen/pv.c b/sys/x86/xen/pv.c
index 9ad9aa9..f4b68f0 100644
--- a/sys/x86/xen/pv.c
+++ b/sys/x86/xen/pv.c
@@ -97,6 +97,7 @@ static int xen_pv_start_all_aps(void);
 #ifdef SMP
 /* Variables used by amd64 mp_machdep to start APs */
 extern char *doublefault_stack;
+extern char *mce_stack;
 extern char *nmi_stack;
 #endif
@@ -217,6 +218,8 @@ start_xen_ap(int cpu)
 (void *)kmem_malloc(kernel_arena, stacksize, M_WAITOK | M_ZERO);
 doublefault_stack =
 (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
+ mce_stack =
+ (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
 nmi_stack =
 (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
 dpcpu =
diff --git a/usr.sbin/cpucontrol/cpucontrol.8 b/usr.sbin/cpucontrol/cpucontrol.8
index 91946d3..4af1273 100644
--- a/usr.sbin/cpucontrol/cpucontrol.8
+++ b/usr.sbin/cpucontrol/cpucontrol.8
@@ -24,7 +24,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd June 30, 2009
+.Dd January 5, 2018
 .Dt CPUCONTROL 8
 .Os
 .Sh NAME
@@ -36,44 +36,48 @@ device
 .Nm
 .Op Fl vh
 .Fl m Ar msr
-.Bk
 .Ar device
 .Ek
+.Bk
 .Nm
 .Op Fl vh
 .Fl m Ar msr Ns = Ns Ar value
-.Bk
 .Ar device
 .Ek
+.Bk
 .Nm
 .Op Fl vh
 .Fl m Ar msr Ns &= Ns Ar mask
-.Bk
 .Ar device
 .Ek
+.Bk
 .Nm
 .Op Fl vh
 .Fl m Ar msr Ns |= Ns Ar mask
-.Bk
 .Ar device
 .Ek
+.Bk
 .Nm
 .Op Fl vh
 .Fl i Ar level
-.Bk
 .Ar device
 .Ek
+.Bk
 .Nm
 .Op Fl vh
 .Fl i Ar level,level_type
-.Bk
 .Ar device
 .Ek
+.Bk
 .Nm
 .Op Fl vh
 .Op Fl d Ar datadir
 .Fl u
+.Ar device
+.Ek
 .Bk
+.Nm
+.Fl e
 .Ar device
 .Ek
 .Sh DESCRIPTION
@@ -129,6 +133,21 @@ The
 .Nm
 utility will walk through the configured data directories
 and apply all firmware updates available for this CPU.
+.It Fl e
+Re-evaluate the kernel flags indicating the present CPU features.
+This command is typically executed after applying a firmware update
+that changes the information reported by the
+.Dv CPUID
+instruction.
+An example of the complete update sequence is shown below.
+.Pp
+.Bf -symbolic
+Only execute the
+.Fl e
+command after the microcode update has been applied to all CPUs in the system.
+The kernel does not operate correctly if the CPU features are not
+identical across all processors.
+.Ef
 .It Fl v
 Increase the verbosity level.
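Returning to the -e workflow just described: a typical post-update session, with one cpuctl node per CPU, would look like this (illustrative command lines only):

	# cpucontrol -u /dev/cpuctl0
	# cpucontrol -u /dev/cpuctl1
	(repeat -u for each remaining /dev/cpuctl* node)
	# cpucontrol -e /dev/cpuctl0

The -e pass comes last, after every CPU has been updated, so that the re-evaluated feature flags reflect the final microcode revision everywhere.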
.It Fl h diff --git a/usr.sbin/cpucontrol/cpucontrol.c b/usr.sbin/cpucontrol/cpucontrol.c index 48e12e7..5d7153a 100644 --- a/usr.sbin/cpucontrol/cpucontrol.c +++ b/usr.sbin/cpucontrol/cpucontrol.c @@ -60,6 +60,7 @@ int verbosity_level = 0; #define FLAG_I 0x01 #define FLAG_M 0x02 #define FLAG_U 0x04 +#define FLAG_E 0x10 #define OP_INVAL 0x00 #define OP_READ 0x01 @@ -114,7 +115,7 @@ usage(void) if (name == NULL) name = "cpuctl"; fprintf(stderr, "Usage: %s [-vh] [-d datadir] [-m msr[=value] | " - "-i level | -i level,level_type | -u] device\n", name); + "-i level | -i level,level_type | -e | -u] device\n", name); exit(EX_USAGE); } @@ -338,6 +339,25 @@ do_msr(const char *cmdarg, const char *dev) } static int +do_eval_cpu_features(const char *dev) +{ + int fd, error; + + assert(dev != NULL); + + fd = open(dev, O_RDWR); + if (fd < 0) { + WARN(0, "error opening %s for writing", dev); + return (1); + } + error = ioctl(fd, CPUCTL_EVAL_CPU_FEATURES, NULL); + if (error < 0) + WARN(0, "ioctl(%s, CPUCTL_EVAL_CPU_FEATURES)", dev); + close(fd); + return (error); +} + +static int do_update(const char *dev) { int fd; @@ -431,11 +451,14 @@ main(int argc, char *argv[]) * Add all default data dirs to the list first. */ datadir_add(DEFAULT_DATADIR); - while ((c = getopt(argc, argv, "d:hi:m:uv")) != -1) { + while ((c = getopt(argc, argv, "d:ehi:m:uv")) != -1) { switch (c) { case 'd': datadir_add(optarg); break; + case 'e': + flags |= FLAG_E; + break; case 'i': flags |= FLAG_I; cmdarg = optarg; @@ -464,22 +487,25 @@ main(int argc, char *argv[]) /* NOTREACHED */ } dev = argv[0]; - c = flags & (FLAG_I | FLAG_M | FLAG_U); + c = flags & (FLAG_E | FLAG_I | FLAG_M | FLAG_U); switch (c) { - case FLAG_I: - if (strstr(cmdarg, ",") != NULL) - error = do_cpuid_count(cmdarg, dev); - else - error = do_cpuid(cmdarg, dev); - break; - case FLAG_M: - error = do_msr(cmdarg, dev); - break; - case FLAG_U: - error = do_update(dev); - break; - default: - usage(); /* Only one command can be selected. */ + case FLAG_I: + if (strstr(cmdarg, ",") != NULL) + error = do_cpuid_count(cmdarg, dev); + else + error = do_cpuid(cmdarg, dev); + break; + case FLAG_M: + error = do_msr(cmdarg, dev); + break; + case FLAG_U: + error = do_update(dev); + break; + case FLAG_E: + error = do_eval_cpu_features(dev); + break; + default: + usage(); /* Only one command can be selected. */ } SLIST_FREE(&datadirs, next, free); return (error == 0 ? 0 : 1); -- cgit v1.1
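The kernel half of the new CPUCTL_EVAL_CPU_FEATURES ioctl lives in sys/dev/cpuctl/cpuctl.c, which is in the diffstat above but outside this excerpt. Judging only from the interfaces added elsewhere in this patch, its handler plausibly has the following shape (hypothetical sketch, not the committed code):

	/*
	 * Hypothetical sketch of a CPUCTL_EVAL_CPU_FEATURES handler:
	 * re-run the CPUID-derived identification on the target CPU and
	 * recompute the IBRS state, so that the kernel's flags reflect
	 * freshly loaded microcode (which may add IBPB/STIBP/ARCH_CAP).
	 */
	static int
	cpuctl_do_eval_cpu_features_sketch(int cpu, struct thread *td)
	{
		/* Bind to the CPU whose features are being re-read. */
		thread_lock(td);
		sched_bind(td, cpu);
		thread_unlock(td);

		identify_cpu1();	/* basic CPUID leaves */
		identify_cpu2();	/* leaf 7 and IA32_ARCH_CAPABILITIES */
		hw_ibrs_recalculate();

		thread_lock(td);
		sched_unbind(td);
		thread_unlock(td);
		return (0);
	}

Whatever the exact committed shape, the userland half shown above is a single ioctl per invocation, which is why the manual page insists that microcode be updated on every CPU before -e is run.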