path: root/sys/amd64
author     jhb <jhb@FreeBSD.org>  2014-06-12 19:58:12 +0000
committer  jhb <jhb@FreeBSD.org>  2014-06-12 19:58:12 +0000
commit     3e1f2ae835422b06a1812ad150561de8d78e7998 (patch)
tree       a88a56bf8fd9c614d7ab81310cc3f2e44f3a1606 /sys/amd64
parent     572e5ac4530c8a38b1fb79a4c94be7f76df567f4 (diff)
download   FreeBSD-src-3e1f2ae835422b06a1812ad150561de8d78e7998.zip
           FreeBSD-src-3e1f2ae835422b06a1812ad150561de8d78e7998.tar.gz
MFC 261638,262144,262506,266765:
Add virtualized XSAVE support to bhyve which permits guests to use XSAVE and
XSAVE-enabled features like AVX.
- Store a per-cpu guest xcr0 register and handle xsetbv VM exits by
  emulating the instruction.
- Only expose XSAVE to guests if XSAVE is enabled in the host.  Only expose
  a subset of XSAVE features currently supported by the guest and for which
  the proper emulation of xsetbv is known.  Currently this includes X87,
  SSE, AVX, AVX-512, and Intel MPX.
- Add support for injecting hardware exceptions into the guest and use this
  to trigger exceptions in the guest for invalid xsetbv operations instead
  of potentially faulting in the host.
- Queue pending exceptions in the 'struct vcpu' instead of directly updating
  the processor-specific VMCS or VMCB.  The pending exception will be
  delivered right before entering the guest.
- Rename the unused ioctl VM_INJECT_EVENT to VM_INJECT_EXCEPTION and
  restrict it to only deliver x86 hardware exceptions.  This new ioctl is
  now used to inject a protection fault when the guest accesses an
  unimplemented MSR.
- Expose a subset of known-safe features from leaf 0 of the structured
  extended features to guests if they are supported on the host including
  RDFSBASE/RDGSBASE, BMI1/2, AVX2, AVX-512, HLE, ERMS, and RTM.  Aside from
  AVX-512, these features are all new instructions available for use in
  ring 3 with no additional hypervisor changes needed.
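[Editor's sketch, not part of the commit: a userspace consumer could drive the
renamed ioctl as below.  The struct vm_exception layout and VM_INJECT_EXCEPTION
request come from this change; the helper name, fd handling, and includes are
hypothetical.]

    /* Inject a general protection fault into a vcpu of an open VM device. */
    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <machine/vmm.h>        /* required before vmm_dev.h */
    #include <machine/vmm_dev.h>    /* struct vm_exception, VM_INJECT_EXCEPTION */
    #include <machine/segments.h>   /* IDT_GP (vector 13) */

    static int
    inject_gp(int vmfd, int vcpu)   /* vmfd: fd for /dev/vmm/<name> */
    {
            struct vm_exception exc = {
                    .cpuid = vcpu,
                    .vector = IDT_GP,
                    .error_code = 0,
                    .error_code_valid = 1,
            };

            /* Fails with EBUSY if an exception is already pending. */
            return (ioctl(vmfd, VM_INJECT_EXCEPTION, &exc));
    }

In-kernel callers should instead prefer the vm_inject_gp()/vm_inject_ud()
wrappers added below, which also zero the exit's inst_length so the faulting
instruction is restarted.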
Diffstat (limited to 'sys/amd64')
-rw-r--r--  sys/amd64/include/vmm.h      |  48
-rw-r--r--  sys/amd64/include/vmm_dev.h  |   9
-rw-r--r--  sys/amd64/vmm/amd/amdv.c     |  10
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h   |   2
-rw-r--r--  sys/amd64/vmm/intel/vmx.c    | 155
-rw-r--r--  sys/amd64/vmm/vmm.c          | 100
-rw-r--r--  sys/amd64/vmm/vmm_dev.c      |  13
-rw-r--r--  sys/amd64/vmm/vmm_host.c     |  39
-rw-r--r--  sys/amd64/vmm/vmm_host.h     |   8
-rw-r--r--  sys/amd64/vmm/x86.c          | 102
10 files changed, 377 insertions, 109 deletions
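[Editor's sketch, not part of the diff: for context, a minimal guest-side view
of what this change permits, assuming ring 0 in the guest.  The CR4_XSAVE and
XFEATURE_ENABLED_* names follow <machine/specialreg.h>; the function name is
hypothetical.]

    #include <sys/types.h>
    #include <machine/cpufunc.h>     /* rcr4(), load_cr4(), load_xcr() */
    #include <machine/specialreg.h>  /* CR4_XSAVE, XFEATURE_ENABLED_* */

    static void
    guest_enable_avx(void)
    {

            /* Setting CR4.OSXSAVE also surfaces CPUID2_OSXSAVE (see x86.c). */
            load_cr4(rcr4() | CR4_XSAVE);

            /*
             * xsetbv exits to the hypervisor; bhyve emulates it and injects
             * #GP if a requested feature is outside xcr0_allowed.
             */
            load_xcr(0, XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE |
                XFEATURE_ENABLED_AVX);
    }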
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 8b6933a..e9a3db9 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -34,6 +34,7 @@
#define VM_MAX_NAMELEN 32
struct vm;
+struct vm_exception;
struct vm_memory_segment;
struct seg_desc;
struct vm_exit;
@@ -62,9 +63,6 @@ typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
struct seg_desc *desc);
typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
struct seg_desc *desc);
-typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
- int type, int vector,
- uint32_t code, int code_valid);
typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
@@ -84,7 +82,6 @@ struct vmm_ops {
vmi_set_register_t vmsetreg;
vmi_get_desc_t vmgetdesc;
vmi_set_desc_t vmsetdesc;
- vmi_inject_event_t vminject;
vmi_get_cap_t vmgetcap;
vmi_set_cap_t vmsetcap;
vmi_vmspace_alloc vmspace_alloc;
@@ -117,8 +114,6 @@ int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
struct seg_desc *desc);
int vm_run(struct vm *vm, struct vm_run *vmrun);
-int vm_inject_event(struct vm *vm, int vcpu, int type,
- int vector, uint32_t error_code, int error_code_valid);
int vm_inject_nmi(struct vm *vm, int vcpu);
int vm_nmi_pending(struct vm *vm, int vcpuid);
void vm_nmi_clear(struct vm *vm, int vcpuid);
@@ -192,6 +187,33 @@ void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
struct vmspace *vm_get_vmspace(struct vm *vm);
int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
+
+/*
+ * Inject exception 'vme' into the guest vcpu. This function returns 0 on
+ * success and non-zero on failure.
+ *
+ * Wrapper functions like 'vm_inject_gp()' should be preferred to calling
+ * this function directly because they enforce the trap-like or fault-like
+ * behavior of an exception.
+ *
+ * This function should only be called in the context of the thread that is
+ * executing this vcpu.
+ */
+int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);
+
+/*
+ * Returns 0 if there is no exception pending for this vcpu. Returns 1 if an
+ * exception is pending and also updates 'vme'. The pending exception is
+ * cleared when this function returns.
+ *
+ * This function should only be called in the context of the thread that is
+ * executing this vcpu.
+ */
+int vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *vme);
+
+void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */
+void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */
+
#endif /* KERNEL */
#include <machine/vmm_instruction_emul.h>
@@ -199,20 +221,6 @@ int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
#define VM_MAXCPU 16 /* maximum virtual cpus */
/*
- * Identifiers for events that can be injected into the VM
- */
-enum vm_event_type {
- VM_EVENT_NONE,
- VM_HW_INTR,
- VM_NMI,
- VM_HW_EXCEPTION,
- VM_SW_INTR,
- VM_PRIV_SW_EXCEPTION,
- VM_SW_EXCEPTION,
- VM_EVENT_MAX
-};
-
-/*
* Identifiers for architecturally defined registers.
*/
enum vm_reg_name {
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index bacbebc..c75f5cf 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -58,9 +58,8 @@ struct vm_run {
struct vm_exit vm_exit;
};
-struct vm_event {
+struct vm_exception {
int cpuid;
- enum vm_event_type type;
int vector;
uint32_t error_code;
int error_code_valid;
@@ -174,7 +173,7 @@ enum {
IOCNUM_GET_SEGMENT_DESCRIPTOR = 23,
/* interrupt injection */
- IOCNUM_INJECT_EVENT = 30,
+ IOCNUM_INJECT_EXCEPTION = 30,
IOCNUM_LAPIC_IRQ = 31,
IOCNUM_INJECT_NMI = 32,
IOCNUM_IOAPIC_ASSERT_IRQ = 33,
@@ -215,8 +214,8 @@ enum {
_IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
#define VM_GET_SEGMENT_DESCRIPTOR \
_IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
-#define VM_INJECT_EVENT \
- _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event)
+#define VM_INJECT_EXCEPTION \
+ _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception)
#define VM_LAPIC_IRQ \
_IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
#define VM_LAPIC_LOCAL_IRQ \
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
index 00484c7..39f0ef7 100644
--- a/sys/amd64/vmm/amd/amdv.c
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -115,15 +115,6 @@ amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
}
static int
-amdv_inject_event(void *vmi, int vcpu, int type, int vector,
- uint32_t error_code, int error_code_valid)
-{
-
- printf("amdv_inject_event: not implemented\n");
- return (EINVAL);
-}
-
-static int
amdv_getcap(void *arg, int vcpu, int type, int *retval)
{
@@ -180,7 +171,6 @@ struct vmm_ops vmm_ops_amd = {
amdv_setreg,
amdv_getdesc,
amdv_setdesc,
- amdv_inject_event,
amdv_getcap,
amdv_setcap,
amdv_vmspace_alloc,
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index 08e07e7..9cde999 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -345,6 +345,8 @@ vmcs_write(uint32_t encoding, uint64_t val)
#define VMCS_INTR_T_MASK 0x700 /* Interruption-info type */
#define VMCS_INTR_T_HWINTR (0 << 8)
#define VMCS_INTR_T_NMI (2 << 8)
+#define VMCS_INTR_T_HWEXCEPTION (3 << 8)
+#define VMCS_INTR_DEL_ERRCODE (1 << 11)
/*
* VMCS IDT-Vectoring information fields
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 28d6504..2d4f376 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
#include "vmm_host.h"
#include "vmm_ipi.h"
#include "vmm_msr.h"
@@ -1090,10 +1091,27 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu)
static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
{
+ struct vm_exception exc;
int vector, need_nmi_exiting;
uint64_t rflags;
uint32_t gi, info;
+ if (vm_exception_pending(vmx->vm, vcpu, &exc)) {
+ KASSERT(exc.vector >= 0 && exc.vector < 32,
+ ("%s: invalid exception vector %d", __func__, exc.vector));
+
+ info = vmcs_read(VMCS_ENTRY_INTR_INFO);
+ KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
+ "pending exception %d: %#x", __func__, exc.vector, info));
+
+ info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID;
+ if (exc.error_code_valid) {
+ info |= VMCS_INTR_DEL_ERRCODE;
+ vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code);
+ }
+ vmcs_write(VMCS_ENTRY_INTR_INFO, info);
+ }
+
if (vm_nmi_pending(vmx->vm, vcpu)) {
/*
* If there are no conditions blocking NMI injection then
@@ -1169,6 +1187,7 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
* This is expected and could happen for multiple reasons:
* - A vectoring VM-entry was aborted due to astpending
* - A VM-exit happened during event injection.
+ * - An exception was injected above.
* - An NMI was injected above or after "NMI window exiting"
*/
VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
@@ -1228,6 +1247,82 @@ vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
}
static int
+vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+ struct vmxctx *vmxctx;
+ uint64_t xcrval;
+ const struct xsave_limits *limits;
+
+ vmxctx = &vmx->ctx[vcpu];
+ limits = vmm_get_xsave_limits();
+
+ /*
+ * Note that the processor raises a GP# fault on its own if
+ * xsetbv is executed for CPL != 0, so we do not have to
+ * emulate that fault here.
+ */
+
+ /* Only xcr0 is supported. */
+ if (vmxctx->guest_rcx != 0) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ /* We only handle xcr0 if both the host and guest have XSAVE enabled. */
+ if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
+ vm_inject_ud(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
+ if ((xcrval & ~limits->xcr0_allowed) != 0) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ if (!(xcrval & XFEATURE_ENABLED_X87)) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ /* AVX (YMM_Hi128) requires SSE. */
+ if (xcrval & XFEATURE_ENABLED_AVX &&
+ (xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ /*
+ * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
+ * ZMM_Hi256, and Hi16_ZMM.
+ */
+ if (xcrval & XFEATURE_AVX512 &&
+ (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
+ (XFEATURE_AVX512 | XFEATURE_AVX)) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ /*
+ * Intel MPX requires both bound register state flags to be
+ * set.
+ */
+ if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
+ ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ /*
+ * This runs "inside" vmrun() with the guest's FPU state, so
+ * modifying xcr0 directly modifies the guest's xcr0, not the
+ * host's.
+ */
+ load_xcr(0, xcrval);
+ return (HANDLED);
+}
+
+static int
vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
int cr, vmcs_guest_cr, vmcs_shadow_cr;
@@ -1413,7 +1508,7 @@ vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual)
if (!virtual_interrupt_delivery)
return (UNHANDLED);
- handled = 1;
+ handled = HANDLED;
offset = APIC_WRITE_OFFSET(qual);
switch (offset) {
case APIC_OFFSET_ID:
@@ -1435,7 +1530,7 @@ vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual)
retu = false;
error = vlapic_icrlo_write_handler(vlapic, &retu);
if (error != 0 || retu)
- handled = 0;
+ handled = UNHANDLED;
break;
case APIC_OFFSET_CMCI_LVT:
case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
@@ -1448,7 +1543,7 @@ vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual)
vlapic_dcr_write_handler(vlapic);
break;
default:
- handled = 0;
+ handled = UNHANDLED;
break;
}
return (handled);
@@ -1548,7 +1643,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
- handled = 0;
+ handled = UNHANDLED;
vmxctx = &vmx->ctx[vcpu];
qual = vmexit->u.vmx.exit_qualification;
@@ -1611,7 +1706,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vmexit->exitcode = VM_EXITCODE_RDMSR;
vmexit->u.msr.code = ecx;
} else if (!retu) {
- handled = 1;
+ handled = HANDLED;
} else {
/* Return to userspace with a valid exitcode */
KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
@@ -1631,7 +1726,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vmexit->u.msr.code = ecx;
vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
} else if (!retu) {
- handled = 1;
+ handled = HANDLED;
} else {
/* Return to userspace with a valid exitcode */
KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
@@ -1773,6 +1868,9 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vlapic = vm_lapic(vmx->vm, vcpu);
handled = vmx_handle_apic_write(vlapic, qual);
break;
+ case EXIT_REASON_XSETBV:
+ handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
+ break;
default:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
break;
@@ -2198,50 +2296,6 @@ vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
}
static int
-vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
- int code_valid)
-{
- int error;
- uint64_t info;
- struct vmx *vmx = arg;
- struct vmcs *vmcs = &vmx->vmcs[vcpu];
-
- static uint32_t type_map[VM_EVENT_MAX] = {
- 0x1, /* VM_EVENT_NONE */
- 0x0, /* VM_HW_INTR */
- 0x2, /* VM_NMI */
- 0x3, /* VM_HW_EXCEPTION */
- 0x4, /* VM_SW_INTR */
- 0x5, /* VM_PRIV_SW_EXCEPTION */
- 0x6, /* VM_SW_EXCEPTION */
- };
-
- /*
- * If there is already an exception pending to be delivered to the
- * vcpu then just return.
- */
- error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
- if (error)
- return (error);
-
- if (info & VMCS_INTR_VALID)
- return (EAGAIN);
-
- info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
- info |= VMCS_INTR_VALID;
- error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
- if (error != 0)
- return (error);
-
- if (code_valid) {
- error = vmcs_setreg(vmcs, 0,
- VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
- code);
- }
- return (error);
-}
-
-static int
vmx_getcap(void *arg, int vcpu, int type, int *retval)
{
struct vmx *vmx = arg;
@@ -2643,7 +2697,6 @@ struct vmm_ops vmm_ops_intel = {
vmx_setreg,
vmx_getdesc,
vmx_setdesc,
- vmx_inject,
vmx_getcap,
vmx_setcap,
ept_vmspace_alloc,
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 8fc3df3..6bf69d7 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -89,10 +89,13 @@ struct vcpu {
struct vlapic *vlapic;
int vcpuid;
struct savefpu *guestfpu; /* guest fpu state */
+ uint64_t guest_xcr0;
void *stats;
struct vm_exit exitinfo;
enum x2apic_state x2apic_state;
int nmi_pending;
+ struct vm_exception exception;
+ int exception_pending;
};
#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
@@ -156,8 +159,6 @@ static struct vmm_ops *ops;
(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define VMSETDESC(vmi, vcpu, num, desc) \
(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
-#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
- (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define VMGETCAP(vmi, vcpu, num, retval) \
(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETCAP(vmi, vcpu, num, val) \
@@ -206,6 +207,7 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id)
vcpu->vcpuid = vcpu_id;
vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
+ vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
vcpu->guestfpu = fpu_save_area_alloc();
fpu_save_area_reset(vcpu->guestfpu);
vcpu->stats = vmm_stat_alloc();
@@ -815,6 +817,10 @@ restore_guest_fpustate(struct vcpu *vcpu)
fpu_stop_emulating();
fpurestore(vcpu->guestfpu);
+ /* restore guest XCR0 if XSAVE is enabled in the host */
+ if (rcr4() & CR4_XSAVE)
+ load_xcr(0, vcpu->guest_xcr0);
+
/*
* The FPU is now "dirty" with the guest's state so turn on emulation
* to trap any access to the FPU by the host.
@@ -829,6 +835,12 @@ save_guest_fpustate(struct vcpu *vcpu)
if ((rcr0() & CR0_TS) == 0)
panic("fpu emulation not enabled in host!");
+ /* save guest XCR0 and restore host XCR0 */
+ if (rcr4() & CR4_XSAVE) {
+ vcpu->guest_xcr0 = rxcr(0);
+ load_xcr(0, vmm_get_host_xcr0());
+ }
+
/* save guest FPU state */
fpu_stop_emulating();
fpusave(vcpu->guestfpu);
@@ -1214,19 +1226,91 @@ restart:
}
int
-vm_inject_event(struct vm *vm, int vcpuid, int type,
- int vector, uint32_t code, int code_valid)
+vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
+ struct vcpu *vcpu;
+
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
- if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
+ if (exception->vector < 0 || exception->vector >= 32)
return (EINVAL);
- if (vector < 0 || vector > 255)
- return (EINVAL);
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (vcpu->exception_pending) {
+ VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
+ "pending exception %d", exception->vector,
+ vcpu->exception.vector);
+ return (EBUSY);
+ }
+
+ vcpu->exception_pending = 1;
+ vcpu->exception = *exception;
+ VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
+ return (0);
+}
+
+int
+vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
+{
+ struct vcpu *vcpu;
+ int pending;
+
+ KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
+
+ vcpu = &vm->vcpu[vcpuid];
+ pending = vcpu->exception_pending;
+ if (pending) {
+ vcpu->exception_pending = 0;
+ *exception = vcpu->exception;
+ VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
+ exception->vector);
+ }
+ return (pending);
+}
+
+static void
+vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
+{
+ struct vm_exit *vmexit;
+ int error;
+
+ error = vm_inject_exception(vm, vcpuid, exception);
+ KASSERT(error == 0, ("vm_inject_exception error %d", error));
+
+ /*
+ * A fault-like exception allows the instruction to be restarted
+ * after the exception handler returns.
+ *
+ * By setting the inst_length to 0 we ensure that the instruction
+ * pointer remains at the faulting instruction.
+ */
+ vmexit = vm_exitinfo(vm, vcpuid);
+ vmexit->inst_length = 0;
+}
+
+void
+vm_inject_gp(struct vm *vm, int vcpuid)
+{
+ struct vm_exception gpf = {
+ .vector = IDT_GP,
+ .error_code_valid = 1,
+ .error_code = 0
+ };
+
+ vm_inject_fault(vm, vcpuid, &gpf);
+}
+
+void
+vm_inject_ud(struct vm *vm, int vcpuid)
+{
+ struct vm_exception udf = {
+ .vector = IDT_UD,
+ .error_code_valid = 0
+ };
- return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
+ vm_inject_fault(vm, vcpuid, &udf);
}
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index e91f4a1..6db2b98 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -150,7 +150,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_register *vmreg;
struct vm_seg_desc *vmsegdesc;
struct vm_run *vmrun;
- struct vm_event *vmevent;
+ struct vm_exception *vmexc;
struct vm_lapic_irq *vmirq;
struct vm_lapic_msi *vmmsi;
struct vm_ioapic_irq *ioapic_irq;
@@ -181,7 +181,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_SET_REGISTER:
case VM_GET_SEGMENT_DESCRIPTOR:
case VM_SET_SEGMENT_DESCRIPTOR:
- case VM_INJECT_EVENT:
+ case VM_INJECT_EXCEPTION:
case VM_GET_CAPABILITY:
case VM_SET_CAPABILITY:
case VM_PPTDEV_MSI:
@@ -282,12 +282,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
break;
- case VM_INJECT_EVENT:
- vmevent = (struct vm_event *)data;
- error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
- vmevent->vector,
- vmevent->error_code,
- vmevent->error_code_valid);
+ case VM_INJECT_EXCEPTION:
+ vmexc = (struct vm_exception *)data;
+ error = vm_inject_exception(sc->vm, vmexc->cpuid, vmexc);
break;
case VM_INJECT_NMI:
vmnmi = (struct vm_nmi *)data;
diff --git a/sys/amd64/vmm/vmm_host.c b/sys/amd64/vmm/vmm_host.c
index 8dfef73..9e5b966 100644
--- a/sys/amd64/vmm/vmm_host.c
+++ b/sys/amd64/vmm/vmm_host.c
@@ -38,11 +38,14 @@ __FBSDID("$FreeBSD$");
#include "vmm_host.h"
-static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4;
+static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4,
+ vmm_host_xcr0;
+static struct xsave_limits vmm_xsave_limits;
void
vmm_host_state_init(void)
{
+ int regs[4];
vmm_host_efer = rdmsr(MSR_EFER);
vmm_host_pat = rdmsr(MSR_PAT);
@@ -57,6 +60,26 @@ vmm_host_state_init(void)
vmm_host_cr0 = rcr0() | CR0_TS;
vmm_host_cr4 = rcr4();
+
+ /*
+ * Only permit a guest to use XSAVE if the host is using
+ * XSAVE. Only permit a guest to use XSAVE features supported
+ * by the host. This ensures that the FPU state used by the
+ * guest is always a subset of the saved guest FPU state.
+ *
+ * In addition, only permit known XSAVE features where the
+ * rules for which features depend on other features is known
+ * to properly emulate xsetbv.
+ */
+ if (vmm_host_cr4 & CR4_XSAVE) {
+ vmm_xsave_limits.xsave_enabled = 1;
+ vmm_host_xcr0 = rxcr(0);
+ vmm_xsave_limits.xcr0_allowed = vmm_host_xcr0 &
+ (XFEATURE_AVX | XFEATURE_MPX | XFEATURE_AVX512);
+
+ cpuid_count(0xd, 0x0, regs);
+ vmm_xsave_limits.xsave_max_size = regs[1];
+ }
}
uint64_t
@@ -88,6 +111,13 @@ vmm_get_host_cr4(void)
}
uint64_t
+vmm_get_host_xcr0(void)
+{
+
+ return (vmm_host_xcr0);
+}
+
+uint64_t
vmm_get_host_datasel(void)
{
@@ -122,3 +152,10 @@ vmm_get_host_idtrbase(void)
return (r_idt.rd_base);
}
+
+const struct xsave_limits *
+vmm_get_xsave_limits(void)
+{
+
+ return (&vmm_xsave_limits);
+}
diff --git a/sys/amd64/vmm/vmm_host.h b/sys/amd64/vmm/vmm_host.h
index 839f54a..95618ff 100644
--- a/sys/amd64/vmm/vmm_host.h
+++ b/sys/amd64/vmm/vmm_host.h
@@ -33,17 +33,25 @@
#error "no user-servicable parts inside"
#endif
+struct xsave_limits {
+ int xsave_enabled;
+ uint64_t xcr0_allowed;
+ uint32_t xsave_max_size;
+};
+
void vmm_host_state_init(void);
uint64_t vmm_get_host_pat(void);
uint64_t vmm_get_host_efer(void);
uint64_t vmm_get_host_cr0(void);
uint64_t vmm_get_host_cr4(void);
+uint64_t vmm_get_host_xcr0(void);
uint64_t vmm_get_host_datasel(void);
uint64_t vmm_get_host_codesel(void);
uint64_t vmm_get_host_tsssel(void);
uint64_t vmm_get_host_fsbase(void);
uint64_t vmm_get_host_idtrbase(void);
+const struct xsave_limits *vmm_get_xsave_limits(void);
/*
* Inline access to host state that is used on every VM entry
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
index c1fc006e..455d865 100644
--- a/sys/amd64/vmm/x86.c
+++ b/sys/amd64/vmm/x86.c
@@ -30,17 +30,19 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#include <sys/types.h>
+#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <machine/clock.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
+#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
+#include "vmm_host.h"
#include "x86.h"
#define CPUID_VM_HIGH 0x40000000
@@ -53,6 +55,8 @@ int
x86_emulate_cpuid(struct vm *vm, int vcpu_id,
uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
+ const struct xsave_limits *limits;
+ uint64_t cr4;
int error, enable_invpcid;
unsigned int func, regs[4];
enum x2apic_state x2apic_state;
@@ -147,11 +151,27 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
regs[2] |= CPUID2_X2APIC;
/*
- * Hide xsave/osxsave/avx until the FPU save/restore
- * issues are resolved
+ * Only advertise CPUID2_XSAVE in the guest if
+ * the host is using XSAVE.
*/
- regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE |
- CPUID2_AVX);
+ if (!(regs[2] & CPUID2_OSXSAVE))
+ regs[2] &= ~CPUID2_XSAVE;
+
+ /*
+ * If CPUID2_XSAVE is being advertised and the
+ * guest has set CR4_XSAVE, set
+ * CPUID2_OSXSAVE.
+ */
+ regs[2] &= ~CPUID2_OSXSAVE;
+ if (regs[2] & CPUID2_XSAVE) {
+ error = vm_get_register(vm, vcpu_id,
+ VM_REG_GUEST_CR4, &cr4);
+ if (error)
+ panic("x86_emulate_cpuid: error %d "
+ "fetching %%cr4", error);
+ if (cr4 & CR4_XSAVE)
+ regs[2] |= CPUID2_OSXSAVE;
+ }
/*
* Hide monitor/mwait until we know how to deal with
@@ -210,6 +230,26 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
/* leaf 0 */
if (*ecx == 0) {
+ cpuid_count(*eax, *ecx, regs);
+
+ /* Only leaf 0 is supported */
+ regs[0] = 0;
+
+ /*
+ * Expose known-safe features.
+ */
+ regs[1] &= (CPUID_STDEXT_FSGSBASE |
+ CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
+ CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
+ CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
+ CPUID_STDEXT_AVX512F |
+ CPUID_STDEXT_AVX512PF |
+ CPUID_STDEXT_AVX512ER |
+ CPUID_STDEXT_AVX512CD);
+ regs[2] = 0;
+ regs[3] = 0;
+
+ /* Advertise INVPCID if it is enabled. */
error = vm_get_capability(vm, vcpu_id,
VM_CAP_ENABLE_INVPCID, &enable_invpcid);
if (error == 0 && enable_invpcid)
@@ -219,7 +259,6 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
case CPUID_0000_0006:
case CPUID_0000_000A:
- case CPUID_0000_000D:
/*
* Handle the access, but report 0 for
* all options
@@ -240,6 +279,57 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
regs[3] = vcpu_id;
break;
+ case CPUID_0000_000D:
+ limits = vmm_get_xsave_limits();
+ if (!limits->xsave_enabled) {
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = 0;
+ regs[3] = 0;
+ break;
+ }
+
+ cpuid_count(*eax, *ecx, regs);
+ switch (*ecx) {
+ case 0:
+ /*
+ * Only permit the guest to use bits
+ * that are active in the host in
+ * %xcr0. Also, claim that the
+ * maximum save area size is
+ * equivalent to the host's current
+ * save area size. Since this runs
+ * "inside" of vmrun(), it runs with
+ * the guest's xcr0, so the current
+ * save area size is correct as-is.
+ */
+ regs[0] &= limits->xcr0_allowed;
+ regs[2] = limits->xsave_max_size;
+ regs[3] &= (limits->xcr0_allowed >> 32);
+ break;
+ case 1:
+ /* Only permit XSAVEOPT. */
+ regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
+ regs[1] = 0;
+ regs[2] = 0;
+ regs[3] = 0;
+ break;
+ default:
+ /*
+ * If the leaf is for a permitted feature,
+ * pass through as-is, otherwise return
+ * all zeroes.
+ */
+ if (!(limits->xcr0_allowed & (1ul << *ecx))) {
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = 0;
+ regs[3] = 0;
+ }
+ break;
+ }
+ break;
+
case 0x40000000:
regs[0] = CPUID_VM_HIGH;
bcopy(bhyve_id, &regs[1], 4);