path: root/sys/amd64
author     jhb <jhb@FreeBSD.org>  2014-06-12 19:58:12 +0000
committer  jhb <jhb@FreeBSD.org>  2014-06-12 19:58:12 +0000
commit     3e1f2ae835422b06a1812ad150561de8d78e7998 (patch)
tree       a88a56bf8fd9c614d7ab81310cc3f2e44f3a1606 /sys/amd64
parent     572e5ac4530c8a38b1fb79a4c94be7f76df567f4 (diff)
download   FreeBSD-src-3e1f2ae835422b06a1812ad150561de8d78e7998.zip
           FreeBSD-src-3e1f2ae835422b06a1812ad150561de8d78e7998.tar.gz
MFC 261638,262144,262506,266765:
Add virtualized XSAVE support to bhyve which permits guests to use XSAVE and
XSAVE-enabled features like AVX.
- Store a per-cpu guest xcr0 register and handle xsetbv VM exits by
  emulating the instruction.
- Only expose XSAVE to guests if XSAVE is enabled in the host.  Only expose
  a subset of XSAVE features currently supported by the guest and for which
  the proper emulation of xsetbv is known.  Currently this includes X87,
  SSE, AVX, AVX-512, and Intel MPX.
- Add support for injecting hardware exceptions into the guest and use this
  to trigger exceptions in the guest for invalid xsetbv operations instead
  of potentially faulting in the host.
- Queue pending exceptions in the 'struct vcpu' instead of directly updating
  the processor-specific VMCS or VMCB.  The pending exception will be
  delivered right before entering the guest.
- Rename the unused ioctl VM_INJECT_EVENT to VM_INJECT_EXCEPTION and
  restrict it to only deliver x86 hardware exceptions.  This new ioctl is
  now used to inject a protection fault when the guest accesses an
  unimplemented MSR.
- Expose a subset of known-safe features from leaf 0 of the structured
  extended features to guests if they are supported on the host including
  RDFSBASE/RDGSBASE, BMI1/2, AVX2, AVX-512, HLE, ERMS, and RTM.  Aside from
  AVX-512, these features are all new instructions available for use in
  ring 3 with no additional hypervisor changes needed.
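[Editor's sketch, not part of the commit: a userspace consumer could drive the
renamed ioctl as below.  The struct vm_exception layout and VM_INJECT_EXCEPTION
request come from this change; the helper name, fd handling, and includes are
hypothetical.]

    /* Inject a general protection fault into a vcpu of an open VM device. */
    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <machine/vmm.h>        /* required before vmm_dev.h */
    #include <machine/vmm_dev.h>    /* struct vm_exception, VM_INJECT_EXCEPTION */
    #include <machine/segments.h>   /* IDT_GP (vector 13) */

    static int
    inject_gp(int vmfd, int vcpu)   /* vmfd: fd for /dev/vmm/<name> */
    {
            struct vm_exception exc = {
                    .cpuid = vcpu,
                    .vector = IDT_GP,
                    .error_code = 0,
                    .error_code_valid = 1,
            };

            /* Fails with EBUSY if an exception is already pending. */
            return (ioctl(vmfd, VM_INJECT_EXCEPTION, &exc));
    }

In-kernel callers should instead prefer the vm_inject_gp()/vm_inject_ud()
wrappers added below, which also zero the exit's inst_length so the faulting
instruction is restarted.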
Diffstat (limited to 'sys/amd64')
-rw-r--r--  sys/amd64/include/vmm.h      |  48
-rw-r--r--  sys/amd64/include/vmm_dev.h  |   9
-rw-r--r--  sys/amd64/vmm/amd/amdv.c     |  10
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h   |   2
-rw-r--r--  sys/amd64/vmm/intel/vmx.c    | 155
-rw-r--r--  sys/amd64/vmm/vmm.c          | 100
-rw-r--r--  sys/amd64/vmm/vmm_dev.c      |  13
-rw-r--r--  sys/amd64/vmm/vmm_host.c     |  39
-rw-r--r--  sys/amd64/vmm/vmm_host.h     |   8
-rw-r--r--  sys/amd64/vmm/x86.c          | 102
10 files changed, 377 insertions, 109 deletions
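[Editor's sketch, not part of the diff: for context, a minimal guest-side view
of what this change permits, assuming ring 0 in the guest.  The CR4_XSAVE and
XFEATURE_ENABLED_* names follow <machine/specialreg.h>; the function name is
hypothetical.]

    #include <sys/types.h>
    #include <machine/cpufunc.h>     /* rcr4(), load_cr4(), load_xcr() */
    #include <machine/specialreg.h>  /* CR4_XSAVE, XFEATURE_ENABLED_* */

    static void
    guest_enable_avx(void)
    {

            /* Setting CR4.OSXSAVE also surfaces CPUID2_OSXSAVE (see x86.c). */
            load_cr4(rcr4() | CR4_XSAVE);

            /*
             * xsetbv exits to the hypervisor; bhyve emulates it and injects
             * #GP if a requested feature is outside xcr0_allowed.
             */
            load_xcr(0, XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE |
                XFEATURE_ENABLED_AVX);
    }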
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 8b6933a..e9a3db9 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -34,6 +34,7 @@
#define VM_MAX_NAMELEN 32
struct vm;
+struct vm_exception;
struct vm_memory_segment;
struct seg_desc;
struct vm_exit;
@@ -62,9 +63,6 @@ typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
struct seg_desc *desc);
typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
struct seg_desc *desc);
-typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
- int type, int vector,
- uint32_t code, int code_valid);
typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
@@ -84,7 +82,6 @@ struct vmm_ops {
vmi_set_register_t vmsetreg;
vmi_get_desc_t vmgetdesc;
vmi_set_desc_t vmsetdesc;
- vmi_inject_event_t vminject;
vmi_get_cap_t vmgetcap;
vmi_set_cap_t vmsetcap;
vmi_vmspace_alloc vmspace_alloc;
@@ -117,8 +114,6 @@ int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
struct seg_desc *desc);
int vm_run(struct vm *vm, struct vm_run *vmrun);
-int vm_inject_event(struct vm *vm, int vcpu, int type,
- int vector, uint32_t error_code, int error_code_valid);
int vm_inject_nmi(struct vm *vm, int vcpu);
int vm_nmi_pending(struct vm *vm, int vcpuid);
void vm_nmi_clear(struct vm *vm, int vcpuid);
@@ -192,6 +187,33 @@ void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
struct vmspace *vm_get_vmspace(struct vm *vm);
int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
+
+/*
+ * Inject exception 'vme' into the guest vcpu. This function returns 0 on
+ * success and non-zero on failure.
+ *
+ * Wrapper functions like 'vm_inject_gp()' should be preferred to calling
+ * this function directly because they enforce the trap-like or fault-like
+ * behavior of an exception.
+ *
+ * This function should only be called in the context of the thread that is
+ * executing this vcpu.
+ */
+int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);
+
+/*
+ * Returns 0 if there is no exception pending for this vcpu. Returns 1 if an
+ * exception is pending and also updates 'vme'. The pending exception is
+ * cleared when this function returns.
+ *
+ * This function should only be called in the context of the thread that is
+ * executing this vcpu.
+ */
+int vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *vme);
+
+void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */
+void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */
+
#endif /* KERNEL */
#include <machine/vmm_instruction_emul.h>
@@ -199,20 +221,6 @@ int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
#define VM_MAXCPU 16 /* maximum virtual cpus */
/*
- * Identifiers for events that can be injected into the VM
- */
-enum vm_event_type {
- VM_EVENT_NONE,
- VM_HW_INTR,
- VM_NMI,
- VM_HW_EXCEPTION,
- VM_SW_INTR,
- VM_PRIV_SW_EXCEPTION,
- VM_SW_EXCEPTION,
- VM_EVENT_MAX
-};
-
-/*
* Identifiers for architecturally defined registers.
*/
enum vm_reg_name {
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index bacbebc..c75f5cf 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -58,9 +58,8 @@ struct vm_run {
struct vm_exit vm_exit;
};
-struct vm_event {
+struct vm_exception {
int cpuid;
- enum vm_event_type type;
int vector;
uint32_t error_code;
int error_code_valid;
@@ -174,7 +173,7 @@ enum {
IOCNUM_GET_SEGMENT_DESCRIPTOR = 23,
/* interrupt injection */
- IOCNUM_INJECT_EVENT = 30,
+ IOCNUM_INJECT_EXCEPTION = 30,
IOCNUM_LAPIC_IRQ = 31,
IOCNUM_INJECT_NMI = 32,
IOCNUM_IOAPIC_ASSERT_IRQ = 33,
@@ -215,8 +214,8 @@ enum {
_IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
#define VM_GET_SEGMENT_DESCRIPTOR \
_IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
-#define VM_INJECT_EVENT \
- _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event)
+#define VM_INJECT_EXCEPTION \
+ _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception)
#define VM_LAPIC_IRQ \
_IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
#define VM_LAPIC_LOCAL_IRQ \
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
index 00484c7..39f0ef7 100644
--- a/sys/amd64/vmm/amd/amdv.c
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -115,15 +115,6 @@ amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
}
static int
-amdv_inject_event(void *vmi, int vcpu, int type, int vector,
- uint32_t error_code, int error_code_valid)
-{
-
- printf("amdv_inject_event: not implemented\n");
- return (EINVAL);
-}
-
-static int
amdv_getcap(void *arg, int vcpu, int type, int *retval)
{
@@ -180,7 +171,6 @@ struct vmm_ops vmm_ops_amd = {
amdv_setreg,
amdv_getdesc,
amdv_setdesc,
- amdv_inject_event,
amdv_getcap,
amdv_setcap,
amdv_vmspace_alloc,
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index 08e07e7..9cde999 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -345,6 +345,8 @@ vmcs_write(uint32_t encoding, uint64_t val)
#define VMCS_INTR_T_MASK 0x700 /* Interruption-info type */
#define VMCS_INTR_T_HWINTR (0 << 8)
#define VMCS_INTR_T_NMI (2 << 8)
+#define VMCS_INTR_T_HWEXCEPTION (3 << 8)
+#define VMCS_INTR_DEL_ERRCODE (1 << 11)
/*
* VMCS IDT-Vectoring information fields
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 28d6504..2d4f376 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
#include "vmm_host.h"
#include "vmm_ipi.h"
#include "vmm_msr.h"
@@ -1090,10 +1091,27 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu)
static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
{
+ struct vm_exception exc;
int vector, need_nmi_exiting;
uint64_t rflags;
uint32_t gi, info;
+ if (vm_exception_pending(vmx->vm, vcpu, &exc)) {
+ KASSERT(exc.vector >= 0 && exc.vector < 32,
+ ("%s: invalid exception vector %d", __func__, exc.vector));
+
+ info = vmcs_read(VMCS_ENTRY_INTR_INFO);
+ KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
+ "pending exception %d: %#x", __func__, exc.vector, info));
+
+ info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID;
+ if (exc.error_code_valid) {
+ info |= VMCS_INTR_DEL_ERRCODE;
+ vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code);
+ }
+ vmcs_write(VMCS_ENTRY_INTR_INFO, info);
+ }
+
if (vm_nmi_pending(vmx->vm, vcpu)) {
/*
* If there are no conditions blocking NMI injection then
@@ -1169,6 +1187,7 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
* This is expected and could happen for multiple reasons:
* - A vectoring VM-entry was aborted due to astpending
* - A VM-exit happened during event injection.
+ * - An exception was injected above.
* - An NMI was injected above or after "NMI window exiting"
*/
VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
@@ -1228,6 +1247,82 @@ vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
}
static int
+vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+ struct vmxctx *vmxctx;
+ uint64_t xcrval;
+ const struct xsave_limits *limits;
+
+ vmxctx = &vmx->ctx[vcpu];
+ limits = vmm_get_xsave_limits();
+
+ /*
+ * Note that the processor raises a GP# fault on its own if
+ * xsetbv is executed for CPL != 0, so we do not have to
+ * emulate that fault here.
+ */
+
+ /* Only xcr0 is supported. */
+ if (vmxctx->guest_rcx != 0) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ /* We only handle xcr0 if both the host and guest have XSAVE enabled. */
+ if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
+ vm_inject_ud(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
+ if ((xcrval & ~limits->xcr0_allowed) != 0) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ if (!(xcrval & XFEATURE_ENABLED_X87)) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ /* AVX (YMM_Hi128) requires SSE. */
+ if (xcrval & XFEATURE_ENABLED_AVX &&
+ (xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ /*
+ * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
+ * ZMM_Hi256, and Hi16_ZMM.
+ */
+ if (xcrval & XFEATURE_AVX512 &&
+ (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
+ (XFEATURE_AVX512 | XFEATURE_AVX)) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ /*
+ * Intel MPX requires both bound register state flags to be
+ * set.
+ */
+ if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
+ ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ /*
+ * This runs "inside" vmrun() with the guest's FPU state, so
+ * modifying xcr0 directly modifies the guest's xcr0, not the
+ * host's.
+ */
+ load_xcr(0, xcrval);
+ return (HANDLED);
+}
+
+static int
vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
int cr, vmcs_guest_cr, vmcs_shadow_cr;
@@ -1413,7 +1508,7 @@ vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual)
if (!virtual_interrupt_delivery)
return (UNHANDLED);
- handled = 1;
+ handled = HANDLED;
offset = APIC_WRITE_OFFSET(qual);
switch (offset) {
case APIC_OFFSET_ID:
@@ -1435,7 +1530,7 @@ vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual)
retu = false;
error = vlapic_icrlo_write_handler(vlapic, &retu);
if (error != 0 || retu)
- handled = 0;
+ handled = UNHANDLED;
break;
case APIC_OFFSET_CMCI_LVT:
case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
@@ -1448,7 +1543,7 @@ vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual)
vlapic_dcr_write_handler(vlapic);
break;
default:
- handled = 0;
+ handled = UNHANDLED;
break;
}
return (handled);
@@ -1548,7 +1643,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
- handled = 0;
+ handled = UNHANDLED;
vmxctx = &vmx->ctx[vcpu];
qual = vmexit->u.vmx.exit_qualification;
@@ -1611,7 +1706,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vmexit->exitcode = VM_EXITCODE_RDMSR;
vmexit->u.msr.code = ecx;
} else if (!retu) {
- handled = 1;
+ handled = HANDLED;
} else {
/* Return to userspace with a valid exitcode */
KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
@@ -1631,7 +1726,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vmexit->u.msr.code = ecx;
vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
} else if (!retu) {
- handled = 1;
+ handled = HANDLED;
} else {
/* Return to userspace with a valid exitcode */
KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
@@ -1773,6 +1868,9 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vlapic = vm_lapic(vmx->vm, vcpu);
handled = vmx_handle_apic_write(vlapic, qual);
break;
+ case EXIT_REASON_XSETBV:
+ handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
+ break;
default:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
break;
@@ -2198,50 +2296,6 @@ vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
}
static int
-vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
- int code_valid)
-{
- int error;
- uint64_t info;
- struct vmx *vmx = arg;
- struct vmcs *vmcs = &vmx->vmcs[vcpu];
-
- static uint32_t type_map[VM_EVENT_MAX] = {
- 0x1, /* VM_EVENT_NONE */
- 0x0, /* VM_HW_INTR */
- 0x2, /* VM_NMI */
- 0x3, /* VM_HW_EXCEPTION */
- 0x4, /* VM_SW_INTR */
- 0x5, /* VM_PRIV_SW_EXCEPTION */
- 0x6, /* VM_SW_EXCEPTION */
- };
-
- /*
- * If there is already an exception pending to be delivered to the
- * vcpu then just return.
- */
- error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
- if (error)
- return (error);
-
- if (info & VMCS_INTR_VALID)
- return (EAGAIN);
-
- info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
- info |= VMCS_INTR_VALID;
- error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
- if (error != 0)
- return (error);
-
- if (code_valid) {
- error = vmcs_setreg(vmcs, 0,
- VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
- code);
- }
- return (error);
-}
-
-static int
vmx_getcap(void *arg, int vcpu, int type, int *retval)
{
struct vmx *vmx = arg;
@@ -2643,7 +2697,6 @@ struct vmm_ops vmm_ops_intel = {
vmx_setreg,
vmx_getdesc,
vmx_setdesc,
- vmx_inject,
vmx_getcap,
vmx_setcap,
ept_vmspace_alloc,
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 8fc3df3..6bf69d7 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -89,10 +89,13 @@ struct vcpu {
struct vlapic *vlapic;
int vcpuid;
struct savefpu *guestfpu; /* guest fpu state */
+ uint64_t guest_xcr0;
void *stats;
struct vm_exit exitinfo;
enum x2apic_state x2apic_state;
int nmi_pending;
+ struct vm_exception exception;
+ int exception_pending;
};
#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
@@ -156,8 +159,6 @@ static struct vmm_ops *ops;
(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define VMSETDESC(vmi, vcpu, num, desc) \
(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
-#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
- (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define VMGETCAP(vmi, vcpu, num, retval) \
(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETCAP(vmi, vcpu, num, val) \
@@ -206,6 +207,7 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id)
vcpu->vcpuid = vcpu_id;
vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
+ vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
vcpu->guestfpu = fpu_save_area_alloc();
fpu_save_area_reset(vcpu->guestfpu);
vcpu->stats = vmm_stat_alloc();
@@ -815,6 +817,10 @@ restore_guest_fpustate(struct vcpu *vcpu)
fpu_stop_emulating();
fpurestore(vcpu->guestfpu);
+ /* restore guest XCR0 if XSAVE is enabled in the host */
+ if (rcr4() & CR4_XSAVE)
+ load_xcr(0, vcpu->guest_xcr0);
+
/*
* The FPU is now "dirty" with the guest's state so turn on emulation
* to trap any access to the FPU by the host.
@@ -829,6 +835,12 @@ save_guest_fpustate(struct vcpu *vcpu)
if ((rcr0() & CR0_TS) == 0)
panic("fpu emulation not enabled in host!");
+ /* save guest XCR0 and restore host XCR0 */
+ if (rcr4() & CR4_XSAVE) {
+ vcpu->guest_xcr0 = rxcr(0);
+ load_xcr(0, vmm_get_host_xcr0());
+ }
+
/* save guest FPU state */
fpu_stop_emulating();
fpusave(vcpu->guestfpu);
@@ -1214,19 +1226,91 @@ restart:
}
int
-vm_inject_event(struct vm *vm, int vcpuid, int type,
- int vector, uint32_t code, int code_valid)
+vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
+ struct vcpu *vcpu;
+
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
- if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
+ if (exception->vector < 0 || exception->vector >= 32)
return (EINVAL);
- if (vector < 0 || vector > 255)
- return (EINVAL);
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (vcpu->exception_pending) {
+ VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
+ "pending exception %d", exception->vector,
+ vcpu->exception.vector);
+ return (EBUSY);
+ }
+
+ vcpu->exception_pending = 1;
+ vcpu->exception = *exception;
+ VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
+ return (0);
+}
+
+int
+vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
+{
+ struct vcpu *vcpu;
+ int pending;
+
+ KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
+
+ vcpu = &vm->vcpu[vcpuid];
+ pending = vcpu->exception_pending;
+ if (pending) {
+ vcpu->exception_pending = 0;
+ *exception = vcpu->exception;
+ VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
+ exception->vector);
+ }
+ return (pending);
+}
+
+static void
+vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
+{
+ struct vm_exit *vmexit;
+ int error;
+
+ error = vm_inject_exception(vm, vcpuid, exception);
+ KASSERT(error == 0, ("vm_inject_exception error %d", error));
+
+ /*
+ * A fault-like exception allows the instruction to be restarted
+ * after the exception handler returns.
+ *
+ * By setting the inst_length to 0 we ensure that the instruction
+ * pointer remains at the faulting instruction.
+ */
+ vmexit = vm_exitinfo(vm, vcpuid);
+ vmexit->inst_length = 0;
+}
+
+void
+vm_inject_gp(struct vm *vm, int vcpuid)
+{
+ struct vm_exception gpf = {
+ .vector = IDT_GP,
+ .error_code_valid = 1,
+ .error_code = 0
+ };
+
+ vm_inject_fault(vm, vcpuid, &gpf);
+}
+
+void
+vm_inject_ud(struct vm *vm, int vcpuid)
+{
+ struct vm_exception udf = {
+ .vector = IDT_UD,
+ .error_code_valid = 0
+ };
- return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
+ vm_inject_fault(vm, vcpuid, &udf);
}
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index e91f4a1..6db2b98 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -150,7 +150,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_register *vmreg;
struct vm_seg_desc *vmsegdesc;
struct vm_run *vmrun;
- struct vm_event *vmevent;
+ struct vm_exception *vmexc;
struct vm_lapic_irq *vmirq;
struct vm_lapic_msi *vmmsi;
struct vm_ioapic_irq *ioapic_irq;
@@ -181,7 +181,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_SET_REGISTER:
case VM_GET_SEGMENT_DESCRIPTOR:
case VM_SET_SEGMENT_DESCRIPTOR:
- case VM_INJECT_EVENT:
+ case VM_INJECT_EXCEPTION:
case VM_GET_CAPABILITY:
case VM_SET_CAPABILITY:
case VM_PPTDEV_MSI:
@@ -282,12 +282,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
break;
- case VM_INJECT_EVENT:
- vmevent = (struct vm_event *)data;
- error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
- vmevent->vector,
- vmevent->error_code,
- vmevent->error_code_valid);
+ case VM_INJECT_EXCEPTION:
+ vmexc = (struct vm_exception *)data;
+ error = vm_inject_exception(sc->vm, vmexc->cpuid, vmexc);
break;
case VM_INJECT_NMI:
vmnmi = (struct vm_nmi *)data;
diff --git a/sys/amd64/vmm/vmm_host.c b/sys/amd64/vmm/vmm_host.c
index 8dfef73..9e5b966 100644
--- a/sys/amd64/vmm/vmm_host.c
+++ b/sys/amd64/vmm/vmm_host.c
@@ -38,11 +38,14 @@ __FBSDID("$FreeBSD$");
#include "vmm_host.h"
-static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4;
+static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4,
+ vmm_host_xcr0;
+static struct xsave_limits vmm_xsave_limits;
void
vmm_host_state_init(void)
{
+ int regs[4];
vmm_host_efer = rdmsr(MSR_EFER);
vmm_host_pat = rdmsr(MSR_PAT);
@@ -57,6 +60,26 @@ vmm_host_state_init(void)
vmm_host_cr0 = rcr0() | CR0_TS;
vmm_host_cr4 = rcr4();
+
+ /*
+ * Only permit a guest to use XSAVE if the host is using
+ * XSAVE. Only permit a guest to use XSAVE features supported
+ * by the host. This ensures that the FPU state used by the
+ * guest is always a subset of the saved guest FPU state.
+ *
+ * In addition, only permit known XSAVE features where the
+ * rules for which features depend on other features is known
+ * to properly emulate xsetbv.
+ */
+ if (vmm_host_cr4 & CR4_XSAVE) {
+ vmm_xsave_limits.xsave_enabled = 1;
+ vmm_host_xcr0 = rxcr(0);
+ vmm_xsave_limits.xcr0_allowed = vmm_host_xcr0 &
+ (XFEATURE_AVX | XFEATURE_MPX | XFEATURE_AVX512);
+
+ cpuid_count(0xd, 0x0, regs);
+ vmm_xsave_limits.xsave_max_size = regs[1];
+ }
}
uint64_t
@@ -88,6 +111,13 @@ vmm_get_host_cr4(void)
}
uint64_t
+vmm_get_host_xcr0(void)
+{
+
+ return (vmm_host_xcr0);
+}
+
+uint64_t
vmm_get_host_datasel(void)
{
@@ -122,3 +152,10 @@ vmm_get_host_idtrbase(void)
return (r_idt.rd_base);
}
+
+const struct xsave_limits *
+vmm_get_xsave_limits(void)
+{
+
+ return (&vmm_xsave_limits);
+}
diff --git a/sys/amd64/vmm/vmm_host.h b/sys/amd64/vmm/vmm_host.h
index 839f54a..95618ff 100644
--- a/sys/amd64/vmm/vmm_host.h
+++ b/sys/amd64/vmm/vmm_host.h
@@ -33,17 +33,25 @@
#error "no user-servicable parts inside"
#endif
+struct xsave_limits {
+ int xsave_enabled;
+ uint64_t xcr0_allowed;
+ uint32_t xsave_max_size;
+};
+
void vmm_host_state_init(void);
uint64_t vmm_get_host_pat(void);
uint64_t vmm_get_host_efer(void);
uint64_t vmm_get_host_cr0(void);
uint64_t vmm_get_host_cr4(void);
+uint64_t vmm_get_host_xcr0(void);
uint64_t vmm_get_host_datasel(void);
uint64_t vmm_get_host_codesel(void);
uint64_t vmm_get_host_tsssel(void);
uint64_t vmm_get_host_fsbase(void);
uint64_t vmm_get_host_idtrbase(void);
+const struct xsave_limits *vmm_get_xsave_limits(void);
/*
* Inline access to host state that is used on every VM entry
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
index c1fc006e..455d865 100644
--- a/sys/amd64/vmm/x86.c
+++ b/sys/amd64/vmm/x86.c
@@ -30,17 +30,19 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#include <sys/types.h>
+#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <machine/clock.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
+#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
+#include "vmm_host.h"
#include "x86.h"
#define CPUID_VM_HIGH 0x40000000
@@ -53,6 +55,8 @@ int
x86_emulate_cpuid(struct vm *vm, int vcpu_id,
uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
+ const struct xsave_limits *limits;
+ uint64_t cr4;
int error, enable_invpcid;
unsigned int func, regs[4];
enum x2apic_state x2apic_state;
@@ -147,11 +151,27 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
regs[2] |= CPUID2_X2APIC;
/*
- * Hide xsave/osxsave/avx until the FPU save/restore
- * issues are resolved
+ * Only advertise CPUID2_XSAVE in the guest if
+ * the host is using XSAVE.
*/
- regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE |
- CPUID2_AVX);
+ if (!(regs[2] & CPUID2_OSXSAVE))
+ regs[2] &= ~CPUID2_XSAVE;
+
+ /*
+ * If CPUID2_XSAVE is being advertised and the
+ * guest has set CR4_XSAVE, set
+ * CPUID2_OSXSAVE.
+ */
+ regs[2] &= ~CPUID2_OSXSAVE;
+ if (regs[2] & CPUID2_XSAVE) {
+ error = vm_get_register(vm, vcpu_id,
+ VM_REG_GUEST_CR4, &cr4);
+ if (error)
+ panic("x86_emulate_cpuid: error %d "
+ "fetching %%cr4", error);
+ if (cr4 & CR4_XSAVE)
+ regs[2] |= CPUID2_OSXSAVE;
+ }
/*
* Hide monitor/mwait until we know how to deal with
@@ -210,6 +230,26 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
/* leaf 0 */
if (*ecx == 0) {
+ cpuid_count(*eax, *ecx, regs);
+
+ /* Only leaf 0 is supported */
+ regs[0] = 0;
+
+ /*
+ * Expose known-safe features.
+ */
+ regs[1] &= (CPUID_STDEXT_FSGSBASE |
+ CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
+ CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
+ CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
+ CPUID_STDEXT_AVX512F |
+ CPUID_STDEXT_AVX512PF |
+ CPUID_STDEXT_AVX512ER |
+ CPUID_STDEXT_AVX512CD);
+ regs[2] = 0;
+ regs[3] = 0;
+
+ /* Advertise INVPCID if it is enabled. */
error = vm_get_capability(vm, vcpu_id,
VM_CAP_ENABLE_INVPCID, &enable_invpcid);
if (error == 0 && enable_invpcid)
@@ -219,7 +259,6 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
case CPUID_0000_0006:
case CPUID_0000_000A:
- case CPUID_0000_000D:
/*
* Handle the access, but report 0 for
* all options
@@ -240,6 +279,57 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
regs[3] = vcpu_id;
break;
+ case CPUID_0000_000D:
+ limits = vmm_get_xsave_limits();
+ if (!limits->xsave_enabled) {
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = 0;
+ regs[3] = 0;
+ break;
+ }
+
+ cpuid_count(*eax, *ecx, regs);
+ switch (*ecx) {
+ case 0:
+ /*
+ * Only permit the guest to use bits
+ * that are active in the host in
+ * %xcr0. Also, claim that the
+ * maximum save area size is
+ * equivalent to the host's current
+ * save area size. Since this runs
+ * "inside" of vmrun(), it runs with
+ * the guest's xcr0, so the current
+ * save area size is correct as-is.
+ */
+ regs[0] &= limits->xcr0_allowed;
+ regs[2] = limits->xsave_max_size;
+ regs[3] &= (limits->xcr0_allowed >> 32);
+ break;
+ case 1:
+ /* Only permit XSAVEOPT. */
+ regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
+ regs[1] = 0;
+ regs[2] = 0;
+ regs[3] = 0;
+ break;
+ default:
+ /*
+ * If the leaf is for a permitted feature,
+ * pass through as-is, otherwise return
+ * all zeroes.
+ */
+ if (!(limits->xcr0_allowed & (1ul << *ecx))) {
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = 0;
+ regs[3] = 0;
+ }
+ break;
+ }
+ break;
+
case 0x40000000:
regs[0] = CPUID_VM_HIGH;
bcopy(bhyve_id, &regs[1], 4);