author     neel <neel@FreeBSD.org>  2015-06-28 03:22:26 +0000
committer  neel <neel@FreeBSD.org>  2015-06-28 03:22:26 +0000
commit     79b96fdbcb5c0722d7886ea1fe175b7b7e4d751f
tree       8cff46bd40a19bfc6a7b9831b2f3db23c5c757c0
parent     c85aee0195bf01e2e4666927083768c4f182c0c0
MFC r282209:
Emulate the 'bit test' instruction.
MFC r282259:
Re-implement RTC current time calculation to eliminate the possibility of losing time.
MFC r282281:
Advertise the MTRR feature via CPUID and emulate the minimal set of MTRR MSRs.
MFC r282284:
When an instruction cannot be decoded just return to userspace so bhyve(8) can dump the instruction bytes.
MFC r282287:
Don't require <sys/cpuset.h> to be always included before <machine/vmm.h>.
MFC r282296:
Emulate MSR_SYSCFG which is accessed by Linux on AMD cpus when MTRRs are enabled.
MFC r282301:
Relax limits when transitioning a vector from the IRR to the ISR and also when extinguishing it from the ISR in response to an EOI.
MFC r282335:
Advertise an additional memory BAR in the "dummy" device emulation.
MFC r282336:
Emulate machine check related MSRs to allow guest OSes like Windows to boot.
MFC r282351:
Don't advertise the Intel SMX capability to the guest.
MFC r282407:
Emulate the 'CMP r/m8, imm8' instruction.
MFC r282519:
Add macros for AMD-specific bits in MSR_EFER: LMSLE, FFXSR and TCE.
MFC r282520:
Emulate guest writes to EFER_MSR properly.
MFC r282558:
Deprecate the 3-way return values from vm_gla2gpa() and vm_copy_setup().
MFC r282571:
Check 'td_owepreempt' and yield the vcpu thread if it is set.
MFC r282595:
Allow byte reads of AHCI registers.
MFC r282784:
Handling indirect descriptors is a capability of the host and not one that needs to be negotiated. Use the host capabilities field and not the negotiated field when verifying that indirect descriptors are supported.
MFC r282788:
Allow configuration of the sector size advertised to the guest.
MFC r282865:
Set the subvendor field in config space to the vendor ID. This is required by the Windows virtio drivers to correctly match a device.
MFC r282922:
Bump the size of the blockif scatter-gather list to 67.
MFC r283075:
Fix an off-by-one in an array index bounds check: bhyveload would allow you to create 33 entries in an array that has only 32 slots.
MFC r283168:
Temporarily revert r282922 which bumped the max descriptors.
MFC r283255:
Emulate the "CMP r/m, reg" instruction (opcode 39H).
MFC r283256:
Add an option "--get-vmcs-exit-inst-length" to display the instruction length of the instruction that caused the VM-exit.
MFC r283264:
Change the header type of the emulated host-bridge from type 1 to type 0.
MFC r283293:
Don't rely on the 'VM-exit instruction length' field in the VMCS to always have an accurate length on an EPT violation.
MFC r283299:
Remove bogus verification of instruction length after instruction decode.
MFC r283308:
Exceptions don't deliver an error code in real mode.
MFC r283657:
Fix non-deterministic delays when accessing a vcpu that was in "running" or "sleeping" state.
MFC r283973:
Use tunable 'hw.vmm.svm.features' to disable specific SVM features even though they might be available in hardware. Use tunable 'hw.vmm.svm.num_asids' to limit the number of ASIDs used by the hypervisor.
MFC r284046:
Fix a regression in 'verify_gla()' with the RIP-relative addressing mode.
MFC r284174:
Support guest writes to the TSC by enabling the "use TSC offsetting" execution control.
-rw-r--r--  lib/libvmmapi/vmmapi.c                    |  34
-rw-r--r--  lib/libvmmapi/vmmapi.h                    |  11
-rw-r--r--  sys/amd64/include/vmm.h                   |  49
-rw-r--r--  sys/amd64/include/vmm_instruction_emul.h  |  12
-rw-r--r--  sys/amd64/vmm/amd/amdv.c                  |   1
-rw-r--r--  sys/amd64/vmm/amd/svm.c                   | 125
-rw-r--r--  sys/amd64/vmm/amd/svm_msr.c               |  31
-rw-r--r--  sys/amd64/vmm/amd/vmcb.c                  |   1
-rw-r--r--  sys/amd64/vmm/intel/vmx.c                 |  38
-rw-r--r--  sys/amd64/vmm/intel/vmx.h                 |   2
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.c             |  26
-rw-r--r--  sys/amd64/vmm/io/vatpic.c                 |   1
-rw-r--r--  sys/amd64/vmm/io/vatpit.c                 |   1
-rw-r--r--  sys/amd64/vmm/io/vhpet.c                  |   1
-rw-r--r--  sys/amd64/vmm/io/vioapic.c                |   1
-rw-r--r--  sys/amd64/vmm/io/vlapic.c                 |  20
-rw-r--r--  sys/amd64/vmm/io/vpmtmr.c                 |   1
-rw-r--r--  sys/amd64/vmm/io/vrtc.c                   |  54
-rw-r--r--  sys/amd64/vmm/vmm.c                       | 175
-rw-r--r--  sys/amd64/vmm/vmm_dev.c                   |  14
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c      | 237
-rw-r--r--  sys/amd64/vmm/vmm_ioport.c                |   6
-rw-r--r--  sys/amd64/vmm/vmm_stat.c                  |   2
-rw-r--r--  sys/amd64/vmm/vmm_stat.h                  |   1
-rw-r--r--  sys/amd64/vmm/x86.c                       |  53
-rw-r--r--  sys/amd64/vmm/x86.h                       |  13
-rw-r--r--  sys/x86/include/specialreg.h              |   5
-rw-r--r--  usr.sbin/bhyve/bhyve.8                    |  21
-rw-r--r--  usr.sbin/bhyve/bhyverun.c                 |  14
-rw-r--r--  usr.sbin/bhyve/block_if.c                 |  74
-rw-r--r--  usr.sbin/bhyve/inout.c                    |   8
-rw-r--r--  usr.sbin/bhyve/pci_ahci.c                 |  13
-rw-r--r--  usr.sbin/bhyve/pci_emul.c                 |  38
-rw-r--r--  usr.sbin/bhyve/pci_hostbridge.c           |   2
-rw-r--r--  usr.sbin/bhyve/pci_virtio_block.c         |   1
-rw-r--r--  usr.sbin/bhyve/pci_virtio_net.c           |   1
-rw-r--r--  usr.sbin/bhyve/pci_virtio_rnd.c           |   1
-rw-r--r--  usr.sbin/bhyve/task_switch.c              | 151
-rw-r--r--  usr.sbin/bhyve/virtio.c                   |   2
-rw-r--r--  usr.sbin/bhyvectl/bhyvectl.c              |  13
-rw-r--r--  usr.sbin/bhyveload/bhyveload.c            |   2
41 files changed, 851 insertions, 405 deletions
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 0c15845..1e6e627 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$");
#include <machine/specialreg.h>
#include <machine/param.h>
+#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
@@ -958,9 +959,9 @@ vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities)
return (error);
}
-static int
-gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
- uint64_t gla, int prot, int *fault, uint64_t *gpa)
+int
+vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
struct vm_gla2gpa gg;
int error;
@@ -979,29 +980,18 @@ gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
return (error);
}
-int
-vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
- uint64_t gla, int prot, uint64_t *gpa)
-{
- int error, fault;
-
- error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, gpa);
- if (fault)
- error = fault;
- return (error);
-}
-
#ifndef min
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif
int
vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
- uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt)
+ uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
+ int *fault)
{
void *va;
uint64_t gpa;
- int error, fault, i, n, off;
+ int error, i, n, off;
for (i = 0; i < iovcnt; i++) {
iov[i].iov_base = 0;
@@ -1010,18 +1000,16 @@ vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
while (len) {
assert(iovcnt > 0);
- error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, &gpa);
- if (error)
- return (-1);
- if (fault)
- return (1);
+ error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault);
+ if (error || *fault)
+ return (error);
off = gpa & PAGE_MASK;
n = min(len, PAGE_SIZE - off);
va = vm_map_gpa(ctx, gpa, n);
if (va == NULL)
- return (-1);
+ return (EFAULT);
iov->iov_base = va;
iov->iov_len = n;
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index d001cd8..d3ecdc4 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -64,7 +64,7 @@ int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging,
- uint64_t gla, int prot, uint64_t *gpa);
+ uint64_t gla, int prot, uint64_t *gpa, int *fault);
uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
void vm_set_memflags(struct vmctx *ctx, int flags);
@@ -131,10 +131,15 @@ int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);
/*
* Translate the GLA range [gla,gla+len) into GPA segments in 'iov'.
* The 'iovcnt' should be big enough to accommodate all GPA segments.
- * Returns 0 on success, 1 on a guest fault condition and -1 otherwise.
+ *
+ * retval fault Interpretation
+ * 0 0 Success
+ * 0 1 An exception was injected into the guest
+ * EFAULT N/A Error
*/
int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg,
- uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt);
+ uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
+ int *fault);
void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
void *host_dst, size_t len);
void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
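The new contract is easiest to see from a caller's perspective. Below is a minimal sketch (hypothetical code, not part of this commit; read_guest_mem and its parameters are illustrative) of how the separated 'error' and 'fault' results are meant to be consumed:

	#include <sys/param.h>	/* nitems() */
	#include <sys/uio.h>
	#include <sys/mman.h>	/* PROT_READ */
	#include <vmmapi.h>

	static int
	read_guest_mem(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
	    uint64_t gla, void *buf, size_t len)
	{
		struct iovec iov[8];
		int error, fault;

		error = vm_copy_setup(ctx, vcpu, paging, gla, len, PROT_READ,
		    iov, nitems(iov), &fault);
		if (error)
			return (error);		/* EFAULT: unrecoverable error */
		if (fault)
			return (0);		/* exception injected; resume the guest */
		/* iov[] is valid only when error == 0 && fault == 0 */
		vm_copyin(ctx, vcpu, iov, buf, len);
		return (0);
	}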
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 52294bd..1a4e5ab 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -120,13 +120,18 @@ struct vm_object;
struct vm_guest_paging;
struct pmap;
+struct vm_eventinfo {
+ void *rptr; /* rendezvous cookie */
+ int *sptr; /* suspend cookie */
+ int *iptr; /* reqidle cookie */
+};
+
typedef int (*vmm_init_func_t)(int ipinum);
typedef int (*vmm_cleanup_func_t)(void);
typedef void (*vmm_resume_func_t)(void);
typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
- struct pmap *pmap, void *rendezvous_cookie,
- void *suspend_cookie);
+ struct pmap *pmap, struct vm_eventinfo *info);
typedef void (*vmi_cleanup_func_t)(void *vmi);
typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
uint64_t *retval);
@@ -204,13 +209,13 @@ int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
int vm_apicid2vcpuid(struct vm *vm, int apicid);
int vm_activate_cpu(struct vm *vm, int vcpu);
-cpuset_t vm_active_cpus(struct vm *vm);
-cpuset_t vm_suspended_cpus(struct vm *vm);
struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
+void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip);
+#ifdef _SYS__CPUSET_H_
/*
* Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'.
* The rendezvous 'func(arg)' is not allowed to do anything that will
@@ -228,19 +233,29 @@ void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg);
void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
vm_rendezvous_func_t func, void *arg);
+cpuset_t vm_active_cpus(struct vm *vm);
+cpuset_t vm_suspended_cpus(struct vm *vm);
+#endif /* _SYS__CPUSET_H_ */
static __inline int
-vcpu_rendezvous_pending(void *rendezvous_cookie)
+vcpu_rendezvous_pending(struct vm_eventinfo *info)
{
- return (*(uintptr_t *)rendezvous_cookie != 0);
+ return (*((uintptr_t *)(info->rptr)) != 0);
}
static __inline int
-vcpu_suspended(void *suspend_cookie)
+vcpu_suspended(struct vm_eventinfo *info)
{
- return (*(int *)suspend_cookie);
+ return (*info->sptr);
+}
+
+static __inline int
+vcpu_reqidle(struct vm_eventinfo *info)
+{
+
+ return (*info->iptr);
}
/*
@@ -274,7 +289,13 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
static int __inline
vcpu_should_yield(struct vm *vm, int vcpu)
{
- return (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED));
+
+ if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED))
+ return (1);
+ else if (curthread->td_owepreempt)
+ return (1);
+ else
+ return (0);
}
#endif
@@ -343,9 +364,10 @@ struct vm_copyinfo {
* at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
* a copyin or PROT_WRITE for a copyout.
*
- * Returns 0 on success.
- * Returns 1 if an exception was injected into the guest.
- * Returns -1 otherwise.
+ * retval is_fault Interpretation
+ * 0 0 Success
+ * 0 1 An exception was injected into the guest
+ * EFAULT N/A Unrecoverable error
*
* The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
* the return value is 0. The 'copyinfo[]' resources should be freed by calling
@@ -353,7 +375,7 @@ struct vm_copyinfo {
*/
int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
- int num_copyinfo);
+ int num_copyinfo, int *is_fault);
void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
int num_copyinfo);
void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
@@ -497,6 +519,7 @@ enum vm_exitcode {
VM_EXITCODE_MONITOR,
VM_EXITCODE_MWAIT,
VM_EXITCODE_SVM,
+ VM_EXITCODE_REQIDLE,
VM_EXITCODE_MAX
};
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
index 651b3b3..5e7127f 100644
--- a/sys/amd64/include/vmm_instruction_emul.h
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -81,17 +81,19 @@ int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
*/
int vmm_fetch_instruction(struct vm *vm, int cpuid,
struct vm_guest_paging *guest_paging,
- uint64_t rip, int inst_length, struct vie *vie);
+ uint64_t rip, int inst_length, struct vie *vie,
+ int *is_fault);
/*
* Translate the guest linear address 'gla' to a guest physical address.
*
- * Returns 0 on success and '*gpa' contains the result of the translation.
- * Returns 1 if an exception was injected into the guest.
- * Returns -1 otherwise.
+ * retval is_fault Interpretation
+ * 0 0 'gpa' contains result of the translation
+ * 0 1 An exception was injected into the guest
+ * EFAULT N/A An unrecoverable hypervisor error occurred
*/
int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
- uint64_t gla, int prot, uint64_t *gpa);
+ uint64_t gla, int prot, uint64_t *gpa, int *is_fault);
void vie_init(struct vie *vie, const char *inst_bytes, int inst_length);
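Every in-kernel caller in this diff adopts the same consumption idiom for the new out-parameter; as a sketch (illustrative fragment, not taken from the commit):

	uint64_t gpa;
	int error, fault;

	error = vm_gla2gpa(vm, vcpuid, paging, gla, PROT_READ, &gpa, &fault);
	if (error || fault)
		return (error);	/* EFAULT, or 0 with the exception already injected */
	/* only on this path does 'gpa' hold a valid translation */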
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
index acb3a3d..3157e21 100644
--- a/sys/amd64/vmm/amd/amdv.c
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
-#include <sys/smp.h>
#include <machine/vmm.h>
#include "io/iommu.h"
diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c
index 7cc13ca..b25d69d 100644
--- a/sys/amd64/vmm/amd/svm.c
+++ b/sys/amd64/vmm/amd/svm.c
@@ -102,8 +102,8 @@ static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");
/* Per-CPU context area. */
extern struct pcpu __pcpu[];
-static uint32_t svm_feature; /* AMD SVM features. */
-SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RD, &svm_feature, 0,
+static uint32_t svm_feature = ~0U; /* AMD SVM features. */
+SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0,
"SVM features advertised by CPUID.8000000AH:EDX");
static int disable_npf_assist;
@@ -112,7 +112,7 @@ SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN,
/* Maximum ASIDs supported by the processor */
static uint32_t nasid;
-SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RD, &nasid, 0,
+SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0,
"Number of ASIDs supported by this processor");
/* Current ASID generation for each host cpu */
@@ -174,9 +174,14 @@ check_svm_features(void)
/* CPUID Fn8000_000A is for SVM */
do_cpuid(0x8000000A, regs);
- svm_feature = regs[3];
+ svm_feature &= regs[3];
- nasid = regs[1];
+ /*
+ * The number of ASIDs can be configured to be less than what is
+ * supported by the hardware but not more.
+ */
+ if (nasid == 0 || nasid > regs[1])
+ nasid = regs[1];
KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid));
/* bhyve requires the Nested Paging feature */
@@ -564,6 +569,19 @@ svm_vminit(struct vm *vm, pmap_t pmap)
return (svm_sc);
}
+/*
+ * Collateral for a generic SVM VM-exit.
+ */
+static void
+vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
+{
+
+ vme->exitcode = VM_EXITCODE_SVM;
+ vme->u.svm.exitcode = code;
+ vme->u.svm.exitinfo1 = info1;
+ vme->u.svm.exitinfo2 = info2;
+}
+
static int
svm_cpl(struct vmcb_state *state)
{
@@ -1080,6 +1098,76 @@ clear_nmi_blocking(struct svm_softc *sc, int vcpu)
KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error));
}
+#define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL
+
+static int
+svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu)
+{
+ struct vm_exit *vme;
+ struct vmcb_state *state;
+ uint64_t changed, lma, oldval;
+ int error;
+
+ state = svm_get_vmcb_state(sc, vcpu);
+
+ oldval = state->efer;
+ VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval);
+
+ newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */
+ changed = oldval ^ newval;
+
+ if (newval & EFER_MBZ_BITS)
+ goto gpf;
+
+ /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
+ if (changed & EFER_LME) {
+ if (state->cr0 & CR0_PG)
+ goto gpf;
+ }
+
+ /* EFER.LMA = EFER.LME & CR0.PG */
+ if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0)
+ lma = EFER_LMA;
+ else
+ lma = 0;
+
+ if ((newval & EFER_LMA) != lma)
+ goto gpf;
+
+ if (newval & EFER_NXE) {
+ if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE))
+ goto gpf;
+ }
+
+ /*
+ * XXX bhyve does not enforce segment limits in 64-bit mode. Until
+ * this is fixed, flag a guest attempt to set EFER_LMSLE as an error.
+ */
+ if (newval & EFER_LMSLE) {
+ vme = vm_exitinfo(sc->vm, vcpu);
+ vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0);
+ *retu = true;
+ return (0);
+ }
+
+ if (newval & EFER_FFXSR) {
+ if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR))
+ goto gpf;
+ }
+
+ if (newval & EFER_TCE) {
+ if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE))
+ goto gpf;
+ }
+
+ error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
+ KASSERT(error == 0, ("%s: error %d updating efer", __func__, error));
+ return (0);
+gpf:
+ vm_inject_gp(sc->vm, vcpu);
+ return (0);
+}
+
static int
emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val,
bool *retu)
@@ -1089,7 +1177,7 @@ emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val,
if (lapic_msr(num))
error = lapic_wrmsr(sc->vm, vcpu, num, val, retu);
else if (num == MSR_EFER)
- error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, val);
+ error = svm_write_efer(sc, vcpu, val, retu);
else
error = svm_wrmsr(sc, vcpu, num, val, retu);
@@ -1189,19 +1277,6 @@ nrip_valid(uint64_t exitcode)
}
}
-/*
- * Collateral for a generic SVM VM-exit.
- */
-static void
-vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
-{
-
- vme->exitcode = VM_EXITCODE_SVM;
- vme->u.svm.exitcode = code;
- vme->u.svm.exitinfo1 = info1;
- vme->u.svm.exitinfo2 = info2;
-}
-
static int
svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
@@ -1830,7 +1905,7 @@ enable_gintr(void)
*/
static int
svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
- void *rend_cookie, void *suspended_cookie)
+ struct vm_eventinfo *evinfo)
{
struct svm_regctx *gctx;
struct svm_softc *svm_sc;
@@ -1905,18 +1980,24 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
*/
disable_gintr();
- if (vcpu_suspended(suspended_cookie)) {
+ if (vcpu_suspended(evinfo)) {
enable_gintr();
vm_exit_suspended(vm, vcpu, state->rip);
break;
}
- if (vcpu_rendezvous_pending(rend_cookie)) {
+ if (vcpu_rendezvous_pending(evinfo)) {
enable_gintr();
vm_exit_rendezvous(vm, vcpu, state->rip);
break;
}
+ if (vcpu_reqidle(evinfo)) {
+ enable_gintr();
+ vm_exit_reqidle(vm, vcpu, state->rip);
+ break;
+ }
+
/* We are asked to give the cpu by scheduler. */
if (vcpu_should_yield(vm, vcpu)) {
enable_gintr();
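To make the consistency checks in svm_write_efer() concrete, here is an illustrative failure case (the guest state values are hypothetical, not from the commit):

	/*
	 * Guest state: CR0.PG == 1, EFER.LME == 0.
	 * Guest executes WRMSR(MSR_EFER, oldval | EFER_LME):
	 *   'changed & EFER_LME' is set while 'state->cr0 & CR0_PG' is set,
	 *   violating APMv2 Table 14-5, so the write is answered with a
	 *   #GP via vm_inject_gp() and never reaches the VMCB.  The same
	 *   happens for any EFER_MBZ_BITS bit, or for EFER.NXE/FFXSR/TCE
	 *   on a vcpu whose CPUID does not advertise the capability.
	 *   EFER_LMSLE alone is bounced to userspace as a synthetic
	 *   VMCB_EXIT_MSR so bhyve(8) can flag the attempt.
	 */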
diff --git a/sys/amd64/vmm/amd/svm_msr.c b/sys/amd64/vmm/amd/svm_msr.c
index 100af4b..088751a 100644
--- a/sys/amd64/vmm/amd/svm_msr.c
+++ b/sys/amd64/vmm/amd/svm_msr.c
@@ -27,12 +27,17 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <sys/types.h>
+#include <sys/param.h>
#include <sys/errno.h>
+#include <sys/systm.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
+#include <machine/vmm.h>
+#include "svm.h"
+#include "vmcb.h"
+#include "svm_softc.h"
#include "svm_msr.h"
#ifndef MSR_AMDK8_IPM
@@ -105,6 +110,18 @@ svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result,
int error = 0;
switch (num) {
+ case MSR_MCG_CAP:
+ case MSR_MCG_STATUS:
+ *result = 0;
+ break;
+ case MSR_MTRRcap:
+ case MSR_MTRRdefType:
+ case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
+ case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+ case MSR_MTRR64kBase:
+ case MSR_SYSCFG:
+ *result = 0;
+ break;
case MSR_AMDK8_IPM:
*result = 0;
break;
@@ -122,6 +139,18 @@ svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu)
int error = 0;
switch (num) {
+ case MSR_MCG_CAP:
+ case MSR_MCG_STATUS:
+ break; /* ignore writes */
+ case MSR_MTRRcap:
+ vm_inject_gp(sc->vm, vcpu);
+ break;
+ case MSR_MTRRdefType:
+ case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
+ case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+ case MSR_MTRR64kBase:
+ case MSR_SYSCFG:
+ break; /* Ignore writes */
case MSR_AMDK8_IPM:
/*
* Ignore writes to the "Interrupt Pending Message" MSR.
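Summarized, the guest-visible behavior of the MSR stubs above is as follows (a summary of the switch cases, not text from the commit; the Intel versions in vmx_msr.c further down behave identically except that MSR_SYSCFG is AMD-only):

	MSR                               rdmsr   wrmsr
	MSR_MCG_CAP, MSR_MCG_STATUS       0       ignored
	MSR_MTRRcap                       0       #GP (architecturally read-only)
	MSR_MTRRdefType, MSR_MTRR*Base    0       ignored
	MSR_SYSCFG                        0       ignored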
diff --git a/sys/amd64/vmm/amd/vmcb.c b/sys/amd64/vmm/amd/vmcb.c
index fb4b2c8..d860169 100644
--- a/sys/amd64/vmm/amd/vmcb.c
+++ b/sys/amd64/vmm/amd/vmcb.c
@@ -29,7 +29,6 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/cpuset.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 9aa55e2..5bbfe99 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -857,10 +857,11 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
* VM exit and entry respectively. It is also restored from the
* host VMCS area on a VM exit.
*
- * The TSC MSR is exposed read-only. Writes are disallowed as that
- * will impact the host TSC.
- * XXX Writes would be implemented with a wrmsr trap, and
- * then modifying the TSC offset in the VMCS.
+ * The TSC MSR is exposed read-only. Writes are disallowed as
+ * that will impact the host TSC. If the guest does a write
+ * the "use TSC offsetting" execution control is enabled and the
+ * difference between the host TSC and the guest TSC is written
+ * into the TSC offset in the VMCS.
*/
if (guest_msr_rw(vmx, MSR_GSBASE) ||
guest_msr_rw(vmx, MSR_FSBASE) ||
@@ -1131,6 +1132,22 @@ vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
}
+int
+vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
+{
+ int error;
+
+ if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) {
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET;
+ vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting");
+ }
+
+ error = vmwrite(VMCS_TSC_OFFSET, offset);
+
+ return (error);
+}
+
#define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \
VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
#define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \
@@ -1781,6 +1798,7 @@ vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
paging = &vmexit->u.inst_emul.paging;
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
+ vmexit->inst_length = 0;
vmexit->u.inst_emul.gpa = gpa;
vmexit->u.inst_emul.gla = gla;
vmx_paging_info(paging);
@@ -2554,7 +2572,7 @@ vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
static int
vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
- void *rendezvous_cookie, void *suspend_cookie)
+ struct vm_eventinfo *evinfo)
{
int rc, handled, launched;
struct vmx *vmx;
@@ -2623,18 +2641,24 @@ vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
* vmx_inject_interrupts() can suspend the vcpu due to a
* triple fault.
*/
- if (vcpu_suspended(suspend_cookie)) {
+ if (vcpu_suspended(evinfo)) {
enable_intr();
vm_exit_suspended(vmx->vm, vcpu, rip);
break;
}
- if (vcpu_rendezvous_pending(rendezvous_cookie)) {
+ if (vcpu_rendezvous_pending(evinfo)) {
enable_intr();
vm_exit_rendezvous(vmx->vm, vcpu, rip);
break;
}
+ if (vcpu_reqidle(evinfo)) {
+ enable_intr();
+ vm_exit_reqidle(vmx->vm, vcpu, rip);
+ break;
+ }
+
if (vcpu_should_yield(vm, vcpu)) {
enable_intr();
vm_exit_astpending(vmx->vm, vcpu, rip);
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
index bc48861..57f5b28 100644
--- a/sys/amd64/vmm/intel/vmx.h
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -135,6 +135,8 @@ void vmx_call_isr(uintptr_t entry);
u_long vmx_fix_cr0(u_long cr0);
u_long vmx_fix_cr4(u_long cr4);
+int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset);
+
extern char vmx_exit_guest[];
#endif
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
index e517778..91b2c01 100644
--- a/sys/amd64/vmm/intel/vmx_msr.c
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -31,7 +31,6 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/cpuset.h>
#include <machine/clock.h>
#include <machine/cpufunc.h>
@@ -396,6 +395,17 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
error = 0;
switch (num) {
+ case MSR_MCG_CAP:
+ case MSR_MCG_STATUS:
+ *val = 0;
+ break;
+ case MSR_MTRRcap:
+ case MSR_MTRRdefType:
+ case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
+ case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+ case MSR_MTRR64kBase:
+ *val = 0;
+ break;
case MSR_IA32_MISC_ENABLE:
*val = misc_enable;
break;
@@ -427,6 +437,17 @@ vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
error = 0;
switch (num) {
+ case MSR_MCG_CAP:
+ case MSR_MCG_STATUS:
+ break; /* ignore writes */
+ case MSR_MTRRcap:
+ vm_inject_gp(vmx->vm, vcpuid);
+ break;
+ case MSR_MTRRdefType:
+ case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
+ case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+ case MSR_MTRR64kBase:
+ break; /* Ignore writes */
case MSR_IA32_MISC_ENABLE:
changed = val ^ misc_enable;
/*
@@ -453,6 +474,9 @@ vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
else
vm_inject_gp(vmx->vm, vcpuid);
break;
+ case MSR_TSC:
+ error = vmx_set_tsc_offset(vmx, vcpuid, val - rdtsc());
+ break;
default:
error = EINVAL;
break;
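The arithmetic behind the new MSR_TSC case above (a sketch of the invariant, not text from the commit):

	/*
	 * At host time H = rdtsc() the guest writes 'val':
	 *     offset = val - H
	 * With "use TSC offsetting" enabled, a guest RDTSC at a later
	 * host time H' observes H' + offset == val + (H' - H); that is,
	 * the guest TSC resumes from the written value and continues to
	 * advance at the host's tick rate.
	 */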
diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c
index 0df6e7c..6e94f5b 100644
--- a/sys/amd64/vmm/io/vatpic.c
+++ b/sys/amd64/vmm/io/vatpic.c
@@ -30,7 +30,6 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/queue.h>
-#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
diff --git a/sys/amd64/vmm/io/vatpit.c b/sys/amd64/vmm/io/vatpit.c
index 842253d..173ef1f 100644
--- a/sys/amd64/vmm/io/vatpit.c
+++ b/sys/amd64/vmm/io/vatpit.c
@@ -31,7 +31,6 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/queue.h>
-#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c
index a4c96cd..1db1c51 100644
--- a/sys/amd64/vmm/io/vhpet.c
+++ b/sys/amd64/vmm/io/vhpet.c
@@ -36,7 +36,6 @@ __FBSDID("$FreeBSD$");
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/systm.h>
-#include <sys/cpuset.h>
#include <dev/acpica/acpi_hpet.h>
diff --git a/sys/amd64/vmm/io/vioapic.c b/sys/amd64/vmm/io/vioapic.c
index 411887d..e6b8b5a 100644
--- a/sys/amd64/vmm/io/vioapic.c
+++ b/sys/amd64/vmm/io/vioapic.c
@@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/queue.h>
-#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/systm.h>
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index b192735..039491f 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -548,6 +548,8 @@ vlapic_update_ppr(struct vlapic *vlapic)
VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
}
+static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt");
+
static void
vlapic_process_eoi(struct vlapic *vlapic)
{
@@ -558,11 +560,7 @@ vlapic_process_eoi(struct vlapic *vlapic)
isrptr = &lapic->isr0;
tmrptr = &lapic->tmr0;
- /*
- * The x86 architecture reserves the the first 32 vectors for use
- * by the processor.
- */
- for (i = 7; i > 0; i--) {
+ for (i = 7; i >= 0; i--) {
idx = i * 4;
bitpos = fls(isrptr[idx]);
if (bitpos-- != 0) {
@@ -571,17 +569,21 @@ vlapic_process_eoi(struct vlapic *vlapic)
vlapic->isrvec_stk_top);
}
isrptr[idx] &= ~(1 << bitpos);
+ vector = i * 32 + bitpos;
+ VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d",
+ vector);
VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
vlapic->isrvec_stk_top--;
vlapic_update_ppr(vlapic);
if ((tmrptr[idx] & (1 << bitpos)) != 0) {
- vector = i * 32 + bitpos;
vioapic_process_eoi(vlapic->vm, vlapic->vcpuid,
vector);
}
return;
}
}
+ VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI");
+ vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1);
}
static __inline int
@@ -1093,11 +1095,7 @@ vlapic_pending_intr(struct vlapic *vlapic, int *vecptr)
irrptr = &lapic->irr0;
- /*
- * The x86 architecture reserves the the first 32 vectors for use
- * by the processor.
- */
- for (i = 7; i > 0; i--) {
+ for (i = 7; i >= 0; i--) {
idx = i * 4;
val = atomic_load_acq_int(&irrptr[idx]);
bitpos = fls(val);
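The loop-bound change matters because of how vectors map onto the eight 32-bit IRR/ISR words (illustrative arithmetic mirroring the idx/bitpos computation above, not part of the commit):

	/*
	 * Vector v lives in IRR/ISR word v / 32 at bit v % 32 (idx == i * 4
	 * because irr0..irr7 and isr0..isr7 are spaced 16 bytes apart in
	 * the LAPIC page).  For example, v == 20 maps to word 0, bit 20:
	 * with the old bound (i > 0) word 0 was never scanned, so a vector
	 * in the 16..31 range could never be accepted from the IRR or
	 * retired from the ISR.  The new VLAPIC_GRATUITOUS_EOI stat counts
	 * EOIs that find no in-service vector at all.
	 */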
diff --git a/sys/amd64/vmm/io/vpmtmr.c b/sys/amd64/vmm/io/vpmtmr.c
index 09f763f..1e7bb93 100644
--- a/sys/amd64/vmm/io/vpmtmr.c
+++ b/sys/amd64/vmm/io/vpmtmr.c
@@ -29,7 +29,6 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/queue.h>
-#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/systm.h>
diff --git a/sys/amd64/vmm/io/vrtc.c b/sys/amd64/vmm/io/vrtc.c
index 9d406c1..18ebc4b 100644
--- a/sys/amd64/vmm/io/vrtc.c
+++ b/sys/amd64/vmm/io/vrtc.c
@@ -30,7 +30,6 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/queue.h>
-#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/lock.h>
@@ -142,20 +141,23 @@ update_enabled(struct vrtc *vrtc)
}
static time_t
-vrtc_curtime(struct vrtc *vrtc)
+vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime)
{
sbintime_t now, delta;
- time_t t;
+ time_t t, secs;
KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
t = vrtc->base_rtctime;
+ *basetime = vrtc->base_uptime;
if (update_enabled(vrtc)) {
now = sbinuptime();
delta = now - vrtc->base_uptime;
KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: "
"%#lx to %#lx", vrtc->base_uptime, now));
- t += delta / SBT_1S;
+ secs = delta / SBT_1S;
+ t += secs;
+ *basetime += secs * SBT_1S;
}
return (t);
}
@@ -390,9 +392,10 @@ fail:
}
static int
-vrtc_time_update(struct vrtc *vrtc, time_t newtime)
+vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase)
{
struct rtcdev *rtc;
+ sbintime_t oldbase;
time_t oldtime;
uint8_t alarm_sec, alarm_min, alarm_hour;
@@ -404,16 +407,21 @@ vrtc_time_update(struct vrtc *vrtc, time_t newtime)
alarm_hour = rtc->alarm_hour;
oldtime = vrtc->base_rtctime;
- VM_CTR2(vrtc->vm, "Updating RTC time from %#lx to %#lx",
+ VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx",
oldtime, newtime);
+ oldbase = vrtc->base_uptime;
+ VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx",
+ oldbase, newbase);
+ vrtc->base_uptime = newbase;
+
if (newtime == oldtime)
return (0);
/*
* If 'newtime' indicates that RTC updates are disabled then just
* record that and return. There is no need to do alarm interrupt
- * processing or update 'base_uptime' in this case.
+ * processing in this case.
*/
if (newtime == VRTC_BROKEN_TIME) {
vrtc->base_rtctime = VRTC_BROKEN_TIME;
@@ -459,8 +467,6 @@ vrtc_time_update(struct vrtc *vrtc, time_t newtime)
if (uintr_enabled(vrtc))
vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE);
- vrtc->base_uptime = sbinuptime();
-
return (0);
}
@@ -531,7 +537,7 @@ static void
vrtc_callout_handler(void *arg)
{
struct vrtc *vrtc = arg;
- sbintime_t freqsbt;
+ sbintime_t freqsbt, basetime;
time_t rtctime;
int error;
@@ -553,8 +559,8 @@ vrtc_callout_handler(void *arg)
vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD);
if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) {
- rtctime = vrtc_curtime(vrtc);
- error = vrtc_time_update(vrtc, rtctime);
+ rtctime = vrtc_curtime(vrtc, &basetime);
+ error = vrtc_time_update(vrtc, rtctime, basetime);
KASSERT(error == 0, ("%s: vrtc_time_update error %d",
__func__, error));
}
@@ -619,7 +625,7 @@ static int
vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval)
{
struct rtcdev *rtc;
- sbintime_t oldfreq, newfreq;
+ sbintime_t oldfreq, newfreq, basetime;
time_t curtime, rtctime;
int error;
uint8_t oldval, changed;
@@ -640,12 +646,13 @@ vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval)
if (changed & RTCSB_HALT) {
if ((newval & RTCSB_HALT) == 0) {
rtctime = rtc_to_secs(vrtc);
+ basetime = sbinuptime();
if (rtctime == VRTC_BROKEN_TIME) {
if (rtc_flag_broken_time)
return (-1);
}
} else {
- curtime = vrtc_curtime(vrtc);
+ curtime = vrtc_curtime(vrtc, &basetime);
KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch "
"between vrtc basetime (%#lx) and curtime (%#lx)",
__func__, vrtc->base_rtctime, curtime));
@@ -664,7 +671,7 @@ vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval)
rtctime = VRTC_BROKEN_TIME;
rtc->reg_b &= ~RTCSB_UINTR;
}
- error = vrtc_time_update(vrtc, rtctime);
+ error = vrtc_time_update(vrtc, rtctime, basetime);
KASSERT(error == 0, ("vrtc_time_update error %d", error));
}
@@ -744,7 +751,7 @@ vrtc_set_time(struct vm *vm, time_t secs)
vrtc = vm_rtc(vm);
VRTC_LOCK(vrtc);
- error = vrtc_time_update(vrtc, secs);
+ error = vrtc_time_update(vrtc, secs, sbinuptime());
VRTC_UNLOCK(vrtc);
if (error) {
@@ -761,11 +768,12 @@ time_t
vrtc_get_time(struct vm *vm)
{
struct vrtc *vrtc;
+ sbintime_t basetime;
time_t t;
vrtc = vm_rtc(vm);
VRTC_LOCK(vrtc);
- t = vrtc_curtime(vrtc);
+ t = vrtc_curtime(vrtc, &basetime);
VRTC_UNLOCK(vrtc);
return (t);
@@ -802,6 +810,7 @@ int
vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval)
{
struct vrtc *vrtc;
+ sbintime_t basetime;
time_t curtime;
uint8_t *ptr;
@@ -818,7 +827,7 @@ vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval)
* Update RTC date/time fields if necessary.
*/
if (offset < 10 || offset == RTC_CENTURY) {
- curtime = vrtc_curtime(vrtc);
+ curtime = vrtc_curtime(vrtc, &basetime);
secs_to_rtc(curtime, vrtc, 0);
}
@@ -858,6 +867,7 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
{
struct vrtc *vrtc;
struct rtcdev *rtc;
+ sbintime_t basetime;
time_t curtime;
int error, offset;
@@ -875,8 +885,8 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
}
error = 0;
- curtime = vrtc_curtime(vrtc);
- vrtc_time_update(vrtc, curtime);
+ curtime = vrtc_curtime(vrtc, &basetime);
+ vrtc_time_update(vrtc, curtime, basetime);
/*
* Update RTC date/time fields if necessary.
@@ -939,7 +949,7 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
*/
if (offset == RTC_CENTURY && !rtc_halted(vrtc)) {
curtime = rtc_to_secs(vrtc);
- error = vrtc_time_update(vrtc, curtime);
+ error = vrtc_time_update(vrtc, curtime, sbinuptime());
KASSERT(!error, ("vrtc_time_update error %d", error));
if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time)
error = -1;
@@ -993,7 +1003,7 @@ vrtc_init(struct vm *vm)
VRTC_LOCK(vrtc);
vrtc->base_rtctime = VRTC_BROKEN_TIME;
- vrtc_time_update(vrtc, curtime);
+ vrtc_time_update(vrtc, curtime, sbinuptime());
secs_to_rtc(curtime, vrtc, 0);
VRTC_UNLOCK(vrtc);
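Why carrying 'basetime' forward fixes the lost-time bug (a worked example with illustrative numbers, not from the commit):

	/*
	 * Suppose base_uptime == 10.75 s and the RTC is read at
	 * now == 13.10 s: delta == 2.35 s, so secs == 2.  base_rtctime
	 * advances by 2 seconds and the caller's *basetime becomes
	 * 12.75 s, so the 0.35 s remainder survives into the next read.
	 * Previously vrtc_time_update() reset base_uptime to sbinuptime()
	 * (13.10 s), silently discarding the fraction on every register
	 * access and letting the guest clock fall behind.
	 */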
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 0e78272..c3c7bb1 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -96,6 +96,7 @@ struct vcpu {
struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */
enum vcpu_state state; /* (o) vcpu state */
int hostcpu; /* (o) vcpu's host cpu */
+ int reqidle; /* (i) request vcpu to idle */
struct vlapic *vlapic; /* (i) APIC device model */
enum x2apic_state x2apic_state; /* (i) APIC mode */
uint64_t exitintinfo; /* (i) events pending at VM exit */
@@ -165,8 +166,8 @@ static struct vmm_ops *ops;
#define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0)
#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
-#define VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
- (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
+#define VMRUN(vmi, vcpu, rip, pmap, evinfo) \
+ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO)
#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define VMSPACE_ALLOC(min, max) \
(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
@@ -223,6 +224,28 @@ TUNABLE_INT("hw.vmm.force_iommu", &vmm_force_iommu);
SYSCTL_INT(_hw_vmm, OID_AUTO, force_iommu, CTLFLAG_RDTUN, &vmm_force_iommu, 0,
"Force use of I/O MMU even if no passthrough devices were found.");
+static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
+
+#ifdef KTR
+static const char *
+vcpu_state2str(enum vcpu_state state)
+{
+
+ switch (state) {
+ case VCPU_IDLE:
+ return ("idle");
+ case VCPU_FROZEN:
+ return ("frozen");
+ case VCPU_RUNNING:
+ return ("running");
+ case VCPU_SLEEPING:
+ return ("sleeping");
+ default:
+ return ("unknown");
+ }
+}
+#endif
+
static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
@@ -257,6 +280,7 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create)
vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
+ vcpu->reqidle = 0;
vcpu->exitintinfo = 0;
vcpu->nmi_pending = 0;
vcpu->extint_pending = 0;
@@ -982,11 +1006,13 @@ save_guest_fpustate(struct vcpu *vcpu)
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
static int
-vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
+vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
bool from_idle)
{
+ struct vcpu *vcpu;
int error;
+ vcpu = &vm->vcpu[vcpuid];
vcpu_assert_locked(vcpu);
/*
@@ -995,8 +1021,13 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
* ioctl() operating on a vcpu at any point.
*/
if (from_idle) {
- while (vcpu->state != VCPU_IDLE)
+ while (vcpu->state != VCPU_IDLE) {
+ vcpu->reqidle = 1;
+ vcpu_notify_event_locked(vcpu, false);
+ VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
+ "idle requested", vcpu_state2str(vcpu->state));
msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
+ }
} else {
KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
"vcpu idle state"));
@@ -1033,6 +1064,9 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
if (error)
return (EBUSY);
+ VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
+ vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
+
vcpu->state = newstate;
if (newstate == VCPU_RUNNING)
vcpu->hostcpu = curcpu;
@@ -1055,11 +1089,11 @@ vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
}
static void
-vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
+vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
- if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
+ if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
panic("Error %d setting state to %d", error, newstate);
}
@@ -1147,7 +1181,7 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
* vcpu returned from VMRUN() and before it acquired the
* vcpu lock above.
*/
- if (vm->rendezvous_func != NULL || vm->suspend)
+ if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
break;
if (vm_nmi_pending(vm, vcpuid))
break;
@@ -1184,13 +1218,13 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
}
t = ticks;
- vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
+ vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
/*
* XXX msleep_spin() cannot be interrupted by signals so
* wake up periodically to check pending signals.
*/
msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
- vcpu_require_state_locked(vcpu, VCPU_FROZEN);
+ vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
@@ -1258,11 +1292,14 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
mem_region_read_t mread;
mem_region_write_t mwrite;
enum vm_cpu_mode cpu_mode;
- int cs_d, error, length;
+ int cs_d, error, fault;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
+ KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
+ __func__, vme->inst_length));
+
gla = vme->u.inst_emul.gla;
gpa = vme->u.inst_emul.gpa;
cs_base = vme->u.inst_emul.cs_base;
@@ -1275,37 +1312,31 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
/* Fetch, decode and emulate the faulting instruction */
if (vie->num_valid == 0) {
- /*
- * If the instruction length is not known then assume a
- * maximum size instruction.
- */
- length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE;
error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
- cs_base, length, vie);
+ cs_base, VIE_INST_SIZE, vie, &fault);
} else {
/*
* The instruction bytes have already been copied into 'vie'
*/
- error = 0;
+ error = fault = 0;
}
- if (error == 1)
- return (0); /* Resume guest to handle page fault */
- else if (error == -1)
- return (EFAULT);
- else if (error != 0)
- panic("%s: vmm_fetch_instruction error %d", __func__, error);
+ if (error || fault)
+ return (error);
- if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
- return (EFAULT);
+ if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) {
+ VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx",
+ vme->rip + cs_base);
+ *retu = true; /* dump instruction bytes in userspace */
+ return (0);
+ }
/*
- * If the instruction length was not specified then update it now
- * along with 'nextrip'.
+ * Update 'nextrip' based on the length of the emulated instruction.
*/
- if (vme->inst_length == 0) {
- vme->inst_length = vie->num_processed;
- vcpu->nextrip += vie->num_processed;
- }
+ vme->inst_length = vie->num_processed;
+ vcpu->nextrip += vie->num_processed;
+ VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction "
+ "decoding", vcpu->nextrip);
/* return to userland unless this is an in-kernel emulated device */
if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
@@ -1355,9 +1386,9 @@ vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
if (vm->rendezvous_func == NULL) {
VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
- vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
+ vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
- vcpu_require_state_locked(vcpu, VCPU_FROZEN);
+ vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
} else {
VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
vcpu_unlock(vcpu);
@@ -1380,6 +1411,19 @@ vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
return (0);
}
+static int
+vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu)
+{
+ struct vcpu *vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
+ vcpu->reqidle = 0;
+ vcpu_unlock(vcpu);
+ *retu = true;
+ return (0);
+}
+
int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
@@ -1437,6 +1481,18 @@ vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
}
void
+vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
+{
+ struct vm_exit *vmexit;
+
+ vmexit = vm_exitinfo(vm, vcpuid);
+ vmexit->rip = rip;
+ vmexit->inst_length = 0;
+ vmexit->exitcode = VM_EXITCODE_REQIDLE;
+ vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
+}
+
+void
vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
{
struct vm_exit *vmexit;
@@ -1451,6 +1507,7 @@ vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
+ struct vm_eventinfo evinfo;
int error, vcpuid;
struct vcpu *vcpu;
struct pcb *pcb;
@@ -1458,7 +1515,6 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
struct vm_exit *vme;
bool retu, intr_disabled;
pmap_t pmap;
- void *rptr, *sptr;
vcpuid = vmrun->cpuid;
@@ -1471,11 +1527,12 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
return (EINVAL);
- rptr = &vm->rendezvous_func;
- sptr = &vm->suspend;
pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
+ evinfo.rptr = &vm->rendezvous_func;
+ evinfo.sptr = &vm->suspend;
+ evinfo.iptr = &vcpu->reqidle;
restart:
critical_enter();
@@ -1490,7 +1547,7 @@ restart:
restore_guest_fpustate(vcpu);
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
- error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, rptr, sptr);
+ error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
@@ -1503,6 +1560,9 @@ restart:
retu = false;
vcpu->nextrip = vme->rip + vme->inst_length;
switch (vme->exitcode) {
+ case VM_EXITCODE_REQIDLE:
+ error = vm_handle_reqidle(vm, vcpuid, &retu);
+ break;
case VM_EXITCODE_SUSPENDED:
error = vm_handle_suspend(vm, vcpuid, &retu);
break;
@@ -1541,6 +1601,8 @@ restart:
if (error == 0 && retu == false)
goto restart;
+ VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
+
/* copy the exit information */
bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
return (error);
@@ -1790,6 +1852,7 @@ vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
uint32_t errcode, int restart_instruction)
{
struct vcpu *vcpu;
+ uint64_t regval;
int error;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
@@ -1814,6 +1877,16 @@ vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
return (EBUSY);
}
+ if (errcode_valid) {
+ /*
+ * Exceptions don't deliver an error code in real mode.
+ */
+ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
+ KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
+ if (!(regval & CR0_PE))
+ errcode_valid = 0;
+ }
+
/*
* From section 26.6.1 "Interruptibility State" in Intel SDM:
*
@@ -2066,7 +2139,7 @@ vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
- error = vcpu_set_state_locked(vcpu, newstate, from_idle);
+ error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
vcpu_unlock(vcpu);
return (error);
@@ -2162,15 +2235,11 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
* - If the vcpu is running on a different host_cpu then an IPI will be directed
* to the host_cpu to cause the vcpu to trap into the hypervisor.
*/
-void
-vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
+static void
+vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
{
int hostcpu;
- struct vcpu *vcpu;
- vcpu = &vm->vcpu[vcpuid];
-
- vcpu_lock(vcpu);
hostcpu = vcpu->hostcpu;
if (vcpu->state == VCPU_RUNNING) {
KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
@@ -2195,6 +2264,15 @@ vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
if (vcpu->state == VCPU_SLEEPING)
wakeup_one(vcpu);
}
+}
+
+void
+vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
+{
+ struct vcpu *vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ vcpu_notify_event_locked(vcpu, lapic_intr);
vcpu_unlock(vcpu);
}
@@ -2321,7 +2399,7 @@ vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
int
vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
- int num_copyinfo)
+ int num_copyinfo, int *fault)
{
int error, idx, nused;
size_t n, off, remaining;
@@ -2334,8 +2412,8 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
remaining = len;
while (remaining > 0) {
KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
- error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
- if (error)
+ error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
+ if (error || *fault)
return (error);
off = gpa & PAGE_MASK;
n = min(remaining, PAGE_SIZE - off);
@@ -2357,8 +2435,9 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
if (idx != nused) {
vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
- return (-1);
+ return (EFAULT);
} else {
+ *fault = 0;
return (0);
}
}
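Pieced together, the reqidle changes in vmm.c implement this sequence (a prose sketch of the flow, not code from the commit):

1. A thread entering vcpu_set_state_locked() with from_idle == true finds the vcpu not idle, sets vcpu->reqidle, notifies the vcpu via vcpu_notify_event_locked() (an IPI if running, a wakeup if sleeping) and sleeps on &vcpu->state.
2. The vcpu's run loop polls vcpu_reqidle(evinfo) through evinfo.iptr and breaks out of guest mode with VM_EXITCODE_REQIDLE; a halted vcpu likewise breaks out of vm_handle_hlt() when reqidle is set.
3. vm_run() dispatches to vm_handle_reqidle(), which clears reqidle and sets retu, so the ioctl returns to userspace and the vcpu drops back to idle, waking the waiter promptly instead of leaving it to recheck once per second via the msleep_spin() timeout.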
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index 5be99cb..e3e140a 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -441,19 +441,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
gg = (struct vm_gla2gpa *)data;
error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
- gg->prot, &gg->gpa);
- KASSERT(error == 0 || error == 1 || error == -1,
+ gg->prot, &gg->gpa, &gg->fault);
+ KASSERT(error == 0 || error == EFAULT,
("%s: vm_gla2gpa unknown error %d", __func__, error));
- if (error >= 0) {
- /*
- * error = 0: the translation was successful
- * error = 1: a fault was injected into the guest
- */
- gg->fault = error;
- error = 0;
- } else {
- error = EFAULT;
- }
break;
}
case VM_ACTIVATE_CPU:
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index 6f75515..758b7e8 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -73,6 +73,7 @@ enum {
VIE_OP_TYPE_MOVS,
VIE_OP_TYPE_GROUP1,
VIE_OP_TYPE_STOS,
+ VIE_OP_TYPE_BITTEST,
VIE_OP_TYPE_LAST
};
@@ -92,6 +93,11 @@ static const struct vie_op two_byte_opcodes[256] = {
.op_byte = 0xB7,
.op_type = VIE_OP_TYPE_MOVZX,
},
+ [0xBA] = {
+ .op_byte = 0xBA,
+ .op_type = VIE_OP_TYPE_BITTEST,
+ .op_flags = VIE_OP_F_IMM8,
+ },
[0xBE] = {
.op_byte = 0xBE,
.op_type = VIE_OP_TYPE_MOVSX,
@@ -107,6 +113,10 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_byte = 0x2B,
.op_type = VIE_OP_TYPE_SUB,
},
+ [0x39] = {
+ .op_byte = 0x39,
+ .op_type = VIE_OP_TYPE_CMP,
+ },
[0x3B] = {
.op_byte = 0x3B,
.op_type = VIE_OP_TYPE_CMP,
@@ -172,14 +182,20 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_byte = 0x23,
.op_type = VIE_OP_TYPE_AND,
},
+ [0x80] = {
+ /* Group 1 extended opcode */
+ .op_byte = 0x80,
+ .op_type = VIE_OP_TYPE_GROUP1,
+ .op_flags = VIE_OP_F_IMM8,
+ },
[0x81] = {
- /* XXX Group 1 extended opcode */
+ /* Group 1 extended opcode */
.op_byte = 0x81,
.op_type = VIE_OP_TYPE_GROUP1,
.op_flags = VIE_OP_F_IMM,
},
[0x83] = {
- /* XXX Group 1 extended opcode */
+ /* Group 1 extended opcode */
.op_byte = 0x83,
.op_type = VIE_OP_TYPE_GROUP1,
.op_flags = VIE_OP_F_IMM8,
@@ -585,13 +601,11 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
/*
* Helper function to calculate and validate a linear address.
- *
- * Returns 0 on success and 1 if an exception was injected into the guest.
*/
static int
get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
int opsize, int addrsize, int prot, enum vm_reg_name seg,
- enum vm_reg_name gpr, uint64_t *gla)
+ enum vm_reg_name gpr, uint64_t *gla, int *fault)
{
struct seg_desc desc;
uint64_t cr0, val, rflags;
@@ -617,7 +631,7 @@ get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
vm_inject_ss(vm, vcpuid, 0);
else
vm_inject_gp(vm, vcpuid);
- return (1);
+ goto guest_fault;
}
if (vie_canonical_check(paging->cpu_mode, *gla)) {
@@ -625,14 +639,19 @@ get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
vm_inject_ss(vm, vcpuid, 0);
else
vm_inject_gp(vm, vcpuid);
- return (1);
+ goto guest_fault;
}
if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
vm_inject_ac(vm, vcpuid, 0);
- return (1);
+ goto guest_fault;
}
+ *fault = 0;
+ return (0);
+
+guest_fault:
+ *fault = 1;
return (0);
}
@@ -648,7 +667,7 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
#endif
uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
uint64_t rcx, rdi, rsi, rflags;
- int error, opsize, seg, repeat;
+ int error, fault, opsize, seg, repeat;
opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
val = 0;
@@ -671,8 +690,10 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* The count register is %rcx, %ecx or %cx depending on the
* address size of the instruction.
*/
- if ((rcx & vie_size2mask(vie->addrsize)) == 0)
- return (0);
+ if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
+ error = 0;
+ goto done;
+ }
}
/*
@@ -693,13 +714,16 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
- PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr);
- if (error)
+ PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
+ if (error || fault)
goto done;
error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
- copyinfo, nitems(copyinfo));
+ copyinfo, nitems(copyinfo), &fault);
if (error == 0) {
+ if (fault)
+ goto done; /* Resume guest to handle fault */
+
/*
* case (2): read from system memory and write to mmio.
*/
@@ -708,11 +732,6 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
if (error)
goto done;
- } else if (error > 0) {
- /*
- * Resume guest execution to handle fault.
- */
- goto done;
} else {
/*
* 'vm_copy_setup()' is expected to fail for cases (3) and (4)
@@ -720,13 +739,17 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
*/
error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
- PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr);
- if (error)
+ PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
+ &fault);
+ if (error || fault)
goto done;
error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
- PROT_WRITE, copyinfo, nitems(copyinfo));
+ PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
if (error == 0) {
+ if (fault)
+ goto done; /* Resume guest to handle fault */
+
/*
* case (3): read from MMIO and write to system memory.
*
@@ -742,27 +765,29 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
- } else if (error > 0) {
- /*
- * Resume guest execution to handle fault.
- */
- goto done;
} else {
/*
* Case (4): read from and write to mmio.
+ *
+ * Commit to the MMIO read/write (with potential
+ * side-effects) only after we are sure that the
+ * instruction is not going to be restarted due
+ * to address translation faults.
*/
error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
- PROT_READ, &srcgpa);
- if (error)
- goto done;
- error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
- if (error)
+ PROT_READ, &srcgpa, &fault);
+ if (error || fault)
goto done;
error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
- PROT_WRITE, &dstgpa);
+ PROT_WRITE, &dstgpa, &fault);
+ if (error || fault)
+ goto done;
+
+ error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
if (error)
goto done;
+
error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
if (error)
goto done;
@@ -807,10 +832,9 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
vm_restart_instruction(vm, vcpuid);
}
done:
- if (error < 0)
- return (EFAULT);
- else
- return (0);
+ KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
+ __func__, error));
+ return (error);
}
static int
@@ -1030,39 +1054,55 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
int error, size;
- uint64_t op1, op2, rflags, rflags2;
+ uint64_t regop, memop, op1, op2, rflags, rflags2;
enum vm_reg_name reg;
size = vie->opsize;
switch (vie->op.op_byte) {
+ case 0x39:
case 0x3B:
/*
+ * 39/r CMP r/m16, r16
+ * 39/r CMP r/m32, r32
+ * REX.W 39/r CMP r/m64, r64
+ *
* 3B/r CMP r16, r/m16
* 3B/r CMP r32, r/m32
* REX.W + 3B/r CMP r64, r/m64
*
- * Compare first operand (reg) with second operand (r/m) and
+ * Compare the first operand with the second operand and
* set status flags in EFLAGS register. The comparison is
* performed by subtracting the second operand from the first
* operand and then setting the status flags.
*/
- /* Get the first operand */
+ /* Get the register operand */
reg = gpr_map[vie->reg];
- error = vie_read_register(vm, vcpuid, reg, &op1);
+ error = vie_read_register(vm, vcpuid, reg, &regop);
if (error)
return (error);
- /* Get the second operand */
- error = memread(vm, vcpuid, gpa, &op2, size, arg);
+ /* Get the memory operand */
+ error = memread(vm, vcpuid, gpa, &memop, size, arg);
if (error)
return (error);
+ if (vie->op.op_byte == 0x3B) {
+ op1 = regop;
+ op2 = memop;
+ } else {
+ op1 = memop;
+ op2 = regop;
+ }
rflags2 = getcc(size, op1, op2);
break;
+ case 0x80:
case 0x81:
case 0x83:
/*
+ * 80 /7 cmp r/m8, imm8
+ * REX + 80 /7 cmp r/m8, imm8
+ *
* 81 /7 cmp r/m16, imm16
* 81 /7 cmp r/m32, imm32
* REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64
@@ -1078,6 +1118,8 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* the status flags.
*
*/
+ if (vie->op.op_byte == 0x80)
+ size = 1;
/* get the first operand */
error = memread(vm, vcpuid, gpa, &op1, size, arg);
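For clarity, the operand ordering selected above can be traced with made-up values; getcc() is assumed to compute the status flags of 'op1 - op2':

	/*
	 * CMP r/m, reg (0x39): flags from memop - regop
	 * CMP reg, r/m (0x3B): flags from regop - memop
	 *
	 * e.g. with regop = 1 and memop = 2, opcode 0x39 computes
	 * 2 - 1 (CF and ZF clear) while opcode 0x3B computes 1 - 2,
	 * which borrows and sets CF.
	 */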
@@ -1167,7 +1209,7 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
#endif
struct seg_desc ss_desc;
uint64_t cr0, rflags, rsp, stack_gla, val;
- int error, size, stackaddrsize, pushop;
+ int error, fault, size, stackaddrsize, pushop;
val = 0;
size = vie->opsize;
@@ -1233,18 +1275,10 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
}
error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
- pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo));
- if (error == -1) {
- /*
- * XXX cannot return a negative error value here because it
- * ends up being the return value of the VM_RUN() ioctl and
- * is interpreted as a pseudo-error (for e.g. ERESTART).
- */
- return (EFAULT);
- } else if (error == 1) {
- /* Resume guest execution to handle page fault */
- return (0);
- }
+ pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
+ &fault);
+ if (error || fault)
+ return (error);
if (pushop) {
error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
@@ -1335,6 +1369,48 @@ emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (error);
}
+static int
+emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
+{
+ uint64_t val, rflags;
+ int error, bitmask, bitoff;
+
+ /*
+ * 0F BA is a Group 8 extended opcode.
+ *
+ * Currently we only emulate the 'Bit Test' instruction which is
+ * identified by a ModR/M:reg encoding of 100b.
+ */
+ if ((vie->reg & 7) != 4)
+ return (EINVAL);
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+ error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
+ if (error)
+ return (error);
+
+ /*
+ * Intel SDM, Vol 2, Table 3-2:
+ * "Range of Bit Positions Specified by Bit Offset Operands"
+ */
+ bitmask = vie->opsize * 8 - 1;
+ bitoff = vie->immediate & bitmask;
+
+ /* Copy the bit into the Carry flag in %rflags */
+ if (val & (1UL << bitoff))
+ rflags |= PSL_C;
+ else
+ rflags &= ~PSL_C;
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
+ KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
+
+ return (0);
+}
+
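A stand-alone check of the bit-offset masking used above, with illustrative values (compilable in user space, not part of this commit):

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		int opsize = 2;			/* 16-bit operand */
		int bitmask = opsize * 8 - 1;	/* 15 */
		int bitoff = 0x11 & bitmask;	/* immediate 17 wraps to 1 */
		uint64_t val = 0x0002;		/* bit 1 is set */

		assert(bitoff == 1);
		assert((val & (1UL << bitoff)) != 0);	/* CF would be set */
		return (0);
	}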
int
vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
struct vm_guest_paging *paging, mem_region_read_t memread,
@@ -1391,6 +1467,10 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
error = emulate_sub(vm, vcpuid, gpa, vie,
memread, memwrite, memarg);
break;
+ case VIE_OP_TYPE_BITTEST:
+ error = emulate_bittest(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
default:
error = EINVAL;
break;
@@ -1608,7 +1688,7 @@ ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
int
vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
- uint64_t gla, int prot, uint64_t *gpa)
+ uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
u_int retries;
@@ -1616,6 +1696,8 @@ vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint32_t *ptpbase32, pte32;
void *cookie;
+ *guest_fault = 0;
+
usermode = (paging->cpl == 3 ? 1 : 0);
writable = prot & VM_PROT_WRITE;
cookie = NULL;
@@ -1778,18 +1860,20 @@ restart:
*gpa = pte | (gla & (pgsize - 1));
done:
ptp_release(&cookie);
+ KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
+ __func__, retval));
return (retval);
error:
- retval = -1;
+ retval = EFAULT;
goto done;
fault:
- retval = 1;
+ *guest_fault = 1;
goto done;
}
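With the deprecated 3-way return gone, every caller follows one pattern; a minimal sketch (not a hunk from this commit):

	int error, fault;
	uint64_t gpa;

	error = vm_gla2gpa(vm, vcpuid, paging, gla, PROT_READ, &gpa, &fault);
	if (error)
		return (error);		/* unrecoverable, always EFAULT */
	if (fault)
		return (0);		/* exception injected; resume guest */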
int
vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
- uint64_t rip, int inst_length, struct vie *vie)
+ uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
{
struct vm_copyinfo copyinfo[2];
int error, prot;
@@ -1799,13 +1883,14 @@ vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
prot = PROT_READ | PROT_EXEC;
error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
- copyinfo, nitems(copyinfo));
- if (error == 0) {
- vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
- vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
- vie->num_valid = inst_length;
- }
- return (error);
+ copyinfo, nitems(copyinfo), faultptr);
+ if (error || *faultptr)
+ return (error);
+
+ vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+ vie->num_valid = inst_length;
+ return (0);
}
static int
@@ -2230,19 +2315,6 @@ decode_moffset(struct vie *vie)
}
/*
- * Verify that all the bytes in the instruction buffer were consumed.
- */
-static int
-verify_inst_length(struct vie *vie)
-{
-
- if (vie->num_processed)
- return (0);
- else
- return (-1);
-}
-
-/*
* Verify that the 'guest linear address' provided as collateral of the nested
* page table fault matches with our instruction decoding.
*/
@@ -2270,7 +2342,7 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
* instruction
*/
if (vie->base_register == VM_REG_GUEST_RIP)
- base += vie->num_valid;
+ base += vie->num_processed;
}
idx = 0;
@@ -2323,9 +2395,6 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
if (decode_moffset(vie))
return (-1);
- if (verify_inst_length(vie))
- return (-1);
-
if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
if (verify_gla(vm, cpuid, gla, vie))
return (-1);
diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c
index fc68a61..63044e8 100644
--- a/sys/amd64/vmm/vmm_ioport.c
+++ b/sys/amd64/vmm/vmm_ioport.c
@@ -28,16 +28,10 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/queue.h>
-#include <sys/cpuset.h>
#include <sys/systm.h>
-#include <vm/vm.h>
-
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
-#include <x86/psl.h>
#include "vatpic.h"
#include "vatpit.h"
diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c
index 9ecf9af..7e2f64d 100644
--- a/sys/amd64/vmm/vmm_stat.c
+++ b/sys/amd64/vmm/vmm_stat.c
@@ -33,7 +33,6 @@ __FBSDID("$FreeBSD$");
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
-#include <sys/smp.h>
#include <machine/vmm.h>
#include "vmm_util.h"
@@ -165,6 +164,7 @@ VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault");
VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation");
VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason");
VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit");
+VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit");
VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace");
VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit");
VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions");
diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h
index 1640ba3..c695840 100644
--- a/sys/amd64/vmm/vmm_stat.h
+++ b/sys/amd64/vmm/vmm_stat.h
@@ -157,4 +157,5 @@ VMM_STAT_DECLARE(VMEXIT_ASTPENDING);
VMM_STAT_DECLARE(VMEXIT_USERSPACE);
VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS);
VMM_STAT_DECLARE(VMEXIT_EXCEPTION);
+VMM_STAT_DECLARE(VMEXIT_REQIDLE);
#endif
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
index c37d21c..525e1d9 100644
--- a/sys/amd64/vmm/x86.c
+++ b/sys/amd64/vmm/x86.c
@@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
-#include <sys/cpuset.h>
#include <sys/sysctl.h>
#include <machine/clock.h>
@@ -231,10 +230,11 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
/*
- * Don't expose VMX, SpeedStep or TME capability.
+ * Don't expose VMX, SpeedStep, TME or SMX capability.
* Advertise x2APIC capability and Hypervisor guest.
*/
regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
+ regs[2] &= ~(CPUID2_SMX);
regs[2] |= CPUID2_HV;
@@ -286,18 +286,20 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
* Hide thermal monitoring
*/
regs[3] &= ~(CPUID_ACPI | CPUID_TM);
-
+
/*
- * Machine check handling is done in the host.
- * Hide MTRR capability.
+ * Hide the debug store capability.
*/
- regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
-
- /*
- * Hide the debug store capability.
- */
regs[3] &= ~CPUID_DS;
+ /*
+ * Advertise the Machine Check and MTRR capability.
+ *
+ * Some guest OSes (e.g. Windows) will not boot if
+ * these features are absent.
+ */
+ regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);
+
logical_cpus = threads_per_core * cores_per_package;
regs[1] &= ~CPUID_HTT_CORES;
regs[1] |= (logical_cpus & 0xff) << 16;
@@ -486,3 +488,34 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
return (1);
}
+
+bool
+vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap)
+{
+ bool rv;
+
+ KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d",
+ __func__, cap));
+
+ /*
+	 * Simply pass through the capabilities of the host cpu for now.
+ */
+ rv = false;
+ switch (cap) {
+ case VCC_NO_EXECUTE:
+ if (amd_feature & AMDID_NX)
+ rv = true;
+ break;
+ case VCC_FFXSR:
+ if (amd_feature & AMDID_FFXSR)
+ rv = true;
+ break;
+ case VCC_TCE:
+ if (amd_feature2 & AMDID2_TCE)
+ rv = true;
+ break;
+ default:
+ panic("%s: unknown vm_cpu_capability %d", __func__, cap);
+ }
+ return (rv);
+}
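A hypothetical caller in the EFER write emulation that this supports; the vm_inject_gp() usage below is an assumption for illustration, not part of this hunk:

	/* Refuse a guest attempt to enable TCE on a host without it. */
	if ((newval & EFER_TCE) != 0 &&
	    !vm_cpuid_capability(vm, vcpuid, VCC_TCE)) {
		vm_inject_gp(vm, vcpuid);	/* inject #GP, resume guest */
		return (0);
	}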
diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h
index 8401c15..6f99d52 100644
--- a/sys/amd64/vmm/x86.h
+++ b/sys/amd64/vmm/x86.h
@@ -62,4 +62,17 @@
int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx,
uint32_t *ecx, uint32_t *edx);
+enum vm_cpuid_capability {
+ VCC_NONE,
+ VCC_NO_EXECUTE,
+ VCC_FFXSR,
+ VCC_TCE,
+ VCC_LAST
+};
+
+/*
+ * Return 'true' if the capability 'cap' is enabled in this virtual cpu
+ * and 'false' otherwise.
+ */
+bool vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability);
#endif
diff --git a/sys/x86/include/specialreg.h b/sys/x86/include/specialreg.h
index e6079e7..1ba74f9 100644
--- a/sys/x86/include/specialreg.h
+++ b/sys/x86/include/specialreg.h
@@ -82,6 +82,9 @@
#define EFER_LMA 0x000000400 /* Long mode active (R) */
#define EFER_NXE 0x000000800 /* PTE No-Execute bit enable (R/W) */
#define EFER_SVM 0x000001000 /* SVM enable bit for AMD, reserved for Intel */
+#define EFER_LMSLE 0x000002000 /* Long Mode Segment Limit Enable */
+#define	EFER_FFXSR	0x000004000	/* Fast FXSAVE/FXRSTOR */
+#define EFER_TCE 0x000008000 /* Translation Cache Extension */
/*
* Intel Extended Features registers
@@ -190,7 +193,7 @@
#define AMDID_MP 0x00080000
#define AMDID_NX 0x00100000
#define AMDID_EXT_MMX 0x00400000
-#define AMDID_FFXSR 0x01000000
+#define AMDID_FFXSR 0x02000000
#define AMDID_PAGE1GB 0x04000000
#define AMDID_RDTSCP 0x08000000
#define AMDID_LM 0x20000000
diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8
index e15f9ac..6294e67 100644
--- a/usr.sbin/bhyve/bhyve.8
+++ b/usr.sbin/bhyve/bhyve.8
@@ -193,8 +193,13 @@ format.
.Pp
Block storage devices:
.Bl -tag -width 10n
-.It Pa /filename Ns Oo , Ns Li nocache Oc Ns Oo , Ns Li direct Oc Ns Oo , Ns Li ro Oc
-.It Pa /dev/xxx Ns Oo , Ns Ar nocache Oc Ns Oo , Ns Ar direct Oc Ns Oo , Ns Ar ro Oc
+.It Pa /filename Ns Oo , Ns Ar block-device-options Oc
+.It Pa /dev/xxx Ns Oo , Ns Ar block-device-options Oc
+.El
+.Pp
+The
+.Ar block-device-options
+are:
.Bl -tag -width 8n
.It Li nocache
Open the file with
@@ -204,14 +209,10 @@ Open the file using
.Dv O_SYNC .
.It Li ro
Force the file to be opened read-only.
-.El
-.Pp
-The
-.Li nocache ,
-.Li direct ,
-and
-.Li ro
-options are not available for virtio block devices.
+.It Li sectorsize= Ns Ar logical Ns Oo / Ns Ar physical Oc
+Specify the logical and physical sector sizes of the emulated disk.
+The physical sector size is optional and is equal to the logical sector size
+if not explicitly specified.
.El
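As an illustration (an assumed command line, not part of this diff), a virtio block device backed by a file with 512-byte logical and 4096-byte physical sectors would be specified as:

	-s 4,virtio-blk,/tank/guest.img,sectorsize=512/4096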
.Pp
TTY devices:
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index 47a7699..ee0f106 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -100,7 +100,7 @@ static struct vm_exit vmexit[VM_MAXCPU];
struct bhyvestats {
uint64_t vmexit_bogus;
- uint64_t vmexit_bogus_switch;
+ uint64_t vmexit_reqidle;
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
@@ -461,6 +461,17 @@ vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
}
static int
+vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+ assert(vmexit->inst_length == 0);
+
+ stats.vmexit_reqidle++;
+
+ return (VMEXIT_CONTINUE);
+}
+
+static int
vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
@@ -571,6 +582,7 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_VMX] = vmexit_vmx,
[VM_EXITCODE_SVM] = vmexit_svm,
[VM_EXITCODE_BOGUS] = vmexit_bogus,
+ [VM_EXITCODE_REQIDLE] = vmexit_reqidle,
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c
index bcb1617..ef8e11e 100644
--- a/usr.sbin/bhyve/block_if.c
+++ b/usr.sbin/bhyve/block_if.c
@@ -392,16 +392,18 @@ blockif_open(const char *optstr, const char *ident)
{
char tname[MAXCOMLEN + 1];
char name[MAXPATHLEN];
- char *nopt, *xopts;
+ char *nopt, *xopts, *cp;
struct blockif_ctxt *bc;
struct stat sbuf;
struct diocgattr_arg arg;
off_t size, psectsz, psectoff;
int extra, fd, i, sectsz;
- int nocache, sync, ro, candelete, geom;
+ int nocache, sync, ro, candelete, geom, ssopt, pssopt;
pthread_once(&blockif_once, blockif_init);
+ fd = -1;
+ ssopt = 0;
nocache = 0;
sync = 0;
ro = 0;
@@ -410,16 +412,25 @@ blockif_open(const char *optstr, const char *ident)
* The first element in the optstring is always a pathname.
* Optional elements follow
*/
- nopt = strdup(optstr);
- for (xopts = strtok(nopt, ",");
- xopts != NULL;
- xopts = strtok(NULL, ",")) {
- if (!strcmp(xopts, "nocache"))
+ nopt = xopts = strdup(optstr);
+ while (xopts != NULL) {
+ cp = strsep(&xopts, ",");
+ if (cp == nopt) /* file or device pathname */
+ continue;
+ else if (!strcmp(cp, "nocache"))
nocache = 1;
- else if (!strcmp(xopts, "sync"))
+ else if (!strcmp(cp, "sync") || !strcmp(cp, "direct"))
sync = 1;
- else if (!strcmp(xopts, "ro"))
+ else if (!strcmp(cp, "ro"))
ro = 1;
+ else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2)
+ ;
+ else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1)
+ pssopt = ssopt;
+ else {
+ fprintf(stderr, "Invalid device option \"%s\"\n", cp);
+ goto err;
+ }
}
extra = 0;
@@ -437,13 +448,12 @@ blockif_open(const char *optstr, const char *ident)
if (fd < 0) {
perror("Could not open backing file");
- return (NULL);
+ goto err;
}
if (fstat(fd, &sbuf) < 0) {
perror("Could not stat backing file");
- close(fd);
- return (NULL);
+ goto err;
}
/*
@@ -457,8 +467,7 @@ blockif_open(const char *optstr, const char *ident)
if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
perror("Could not fetch dev blk/sector size");
- close(fd);
- return (NULL);
+ goto err;
}
assert(size != 0);
assert(sectsz != 0);
@@ -473,10 +482,39 @@ blockif_open(const char *optstr, const char *ident)
} else
psectsz = sbuf.st_blksize;
+ if (ssopt != 0) {
+ if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
+ ssopt > pssopt) {
+ fprintf(stderr, "Invalid sector size %d/%d\n",
+ ssopt, pssopt);
+ goto err;
+ }
+
+ /*
+ * Some backend drivers (e.g. cd0, ada0) require that the I/O
+ * size be a multiple of the device's sector size.
+ *
+ * Validate that the emulated sector size complies with this
+ * requirement.
+ */
+ if (S_ISCHR(sbuf.st_mode)) {
+ if (ssopt < sectsz || (ssopt % sectsz) != 0) {
+ fprintf(stderr, "Sector size %d incompatible "
+ "with underlying device sector size %d\n",
+ ssopt, sectsz);
+ goto err;
+ }
+ }
+
+ sectsz = ssopt;
+ psectsz = pssopt;
+ psectoff = 0;
+ }
+
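Stated on its own, the acceptance rule above amounts to the following (a restatement for illustration, assuming FreeBSD's powerof2() macro from <sys/param.h>):

	#include <sys/param.h>		/* powerof2() */
	#include <stdbool.h>

	static bool
	sectorsize_ok(int ssopt, int pssopt, int devsectsz, bool ischr)
	{
		if (!powerof2(ssopt) || !powerof2(pssopt) ||
		    ssopt < 512 || ssopt > pssopt)
			return (false);
		/* char devs: emulated size is a multiple of the real one */
		if (ischr && (ssopt < devsectsz || (ssopt % devsectsz) != 0))
			return (false);
		return (true);
	}

So "sectorsize=512/4096" is accepted, while "sectorsize=520" (not a power of 2) and "sectorsize=4096/512" (logical larger than physical) are rejected.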
bc = calloc(1, sizeof(struct blockif_ctxt));
if (bc == NULL) {
- close(fd);
- return (NULL);
+ perror("calloc");
+ goto err;
}
bc->bc_magic = BLOCKIF_SIG;
@@ -506,6 +544,10 @@ blockif_open(const char *optstr, const char *ident)
}
return (bc);
+err:
+ if (fd >= 0)
+ close(fd);
+ return (NULL);
}
static int
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
index 402b953..929bb3c 100644
--- a/usr.sbin/bhyve/inout.c
+++ b/usr.sbin/bhyve/inout.c
@@ -107,7 +107,7 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
uint32_t eax, val;
inout_func_t handler;
void *arg;
- int error, retval;
+ int error, fault, retval;
enum vm_reg_name idxreg;
uint64_t gla, index, iterations, count;
struct vm_inout_str *vis;
@@ -163,11 +163,11 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
}
error = vm_copy_setup(ctx, vcpu, &vis->paging, gla,
- bytes, prot, iov, nitems(iov));
- if (error == -1) {
+ bytes, prot, iov, nitems(iov), &fault);
+ if (error) {
retval = -1; /* Unrecoverable error */
break;
- } else if (error == 1) {
+ } else if (fault) {
retval = 0; /* Resume guest to handle fault */
break;
}
diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c
index 31e02f8..35a0859 100644
--- a/usr.sbin/bhyve/pci_ahci.c
+++ b/usr.sbin/bhyve/pci_ahci.c
@@ -2093,7 +2093,7 @@ pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
struct pci_ahci_softc *sc = pi->pi_arg;
assert(baridx == 5);
- assert(size == 4);
+ assert((offset % 4) == 0 && size == 4);
pthread_mutex_lock(&sc->mtx);
@@ -2182,24 +2182,29 @@ pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset)
static uint64_t
pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
- uint64_t offset, int size)
+ uint64_t regoff, int size)
{
struct pci_ahci_softc *sc = pi->pi_arg;
+ uint64_t offset;
uint32_t value;
assert(baridx == 5);
- assert(size == 4);
+ assert(size == 1 || size == 2 || size == 4);
+ assert((regoff & (size - 1)) == 0);
pthread_mutex_lock(&sc->mtx);
+ offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */
if (offset < AHCI_OFFSET)
value = pci_ahci_host_read(sc, offset);
else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP)
value = pci_ahci_port_read(sc, offset);
else {
value = 0;
- WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", offset);
+ WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n",
+ regoff);
}
+ value >>= 8 * (regoff & 0x3);
pthread_mutex_unlock(&sc->mtx);
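A stand-alone illustration of the round-down-and-shift above, with made-up register contents:

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t regoff = 0x102;	/* 1-byte read, byte 2 */
		uint32_t value = 0xAABBCCDD;	/* aligned register value */

		assert((regoff & ~0x3) == 0x100);	/* 4-byte register */
		value >>= 8 * (regoff & 0x3);		/* shift byte down */
		assert((uint8_t)value == 0xBB);		/* little-endian */
		return (0);
	}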
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index a08ac4c..03ff0c0 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -1938,7 +1938,7 @@ INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
#define DMEMSZ 4096
struct pci_emul_dsoftc {
uint8_t ioregs[DIOSZ];
- uint8_t memregs[DMEMSZ];
+ uint8_t memregs[2][DMEMSZ];
};
#define PCI_EMUL_MSI_MSGS 4
@@ -1967,6 +1967,9 @@ pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ);
assert(error == 0);
+ error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ);
+ assert(error == 0);
+
return (0);
}
@@ -2006,21 +2009,23 @@ pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
}
}
- if (baridx == 1) {
+ if (baridx == 1 || baridx == 2) {
if (offset + size > DMEMSZ) {
printf("diow: memw too large, offset %ld size %d\n",
offset, size);
return;
}
+ i = baridx - 1; /* 'memregs' index */
+
if (size == 1) {
- sc->memregs[offset] = value;
+ sc->memregs[i][offset] = value;
} else if (size == 2) {
- *(uint16_t *)&sc->memregs[offset] = value;
+ *(uint16_t *)&sc->memregs[i][offset] = value;
} else if (size == 4) {
- *(uint32_t *)&sc->memregs[offset] = value;
+ *(uint32_t *)&sc->memregs[i][offset] = value;
} else if (size == 8) {
- *(uint64_t *)&sc->memregs[offset] = value;
+ *(uint64_t *)&sc->memregs[i][offset] = value;
} else {
printf("diow: memw unknown size %d\n", size);
}
@@ -2030,7 +2035,7 @@ pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
*/
}
- if (baridx > 1) {
+ if (baridx > 2) {
printf("diow: unknown bar idx %d\n", baridx);
}
}
@@ -2041,6 +2046,7 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
{
struct pci_emul_dsoftc *sc = pi->pi_arg;
uint32_t value;
+ int i;
if (baridx == 0) {
if (offset + size > DIOSZ) {
@@ -2059,29 +2065,31 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
printf("dior: ior unknown size %d\n", size);
}
}
-
- if (baridx == 1) {
+
+ if (baridx == 1 || baridx == 2) {
if (offset + size > DMEMSZ) {
printf("dior: memr too large, offset %ld size %d\n",
offset, size);
return (0);
}
-
+
+ i = baridx - 1; /* 'memregs' index */
+
if (size == 1) {
- value = sc->memregs[offset];
+ value = sc->memregs[i][offset];
} else if (size == 2) {
- value = *(uint16_t *) &sc->memregs[offset];
+ value = *(uint16_t *) &sc->memregs[i][offset];
} else if (size == 4) {
- value = *(uint32_t *) &sc->memregs[offset];
+ value = *(uint32_t *) &sc->memregs[i][offset];
} else if (size == 8) {
- value = *(uint64_t *) &sc->memregs[offset];
+ value = *(uint64_t *) &sc->memregs[i][offset];
} else {
printf("dior: ior unknown size %d\n", size);
}
}
- if (baridx > 1) {
+ if (baridx > 2) {
printf("dior: unknown bar idx %d\n", baridx);
return (0);
}
diff --git a/usr.sbin/bhyve/pci_hostbridge.c b/usr.sbin/bhyve/pci_hostbridge.c
index 54a25ae..5c9ea28 100644
--- a/usr.sbin/bhyve/pci_hostbridge.c
+++ b/usr.sbin/bhyve/pci_hostbridge.c
@@ -38,7 +38,7 @@ pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
/* config space */
pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */
pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */
- pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_BRIDGE);
+ pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c
index 35daa1f..8500be6 100644
--- a/usr.sbin/bhyve/pci_virtio_block.c
+++ b/usr.sbin/bhyve/pci_virtio_block.c
@@ -370,6 +370,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
+ pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) {
blockif_close(sc->bc);
diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c
index 1a029d5..3781ea9 100644
--- a/usr.sbin/bhyve/pci_virtio_net.c
+++ b/usr.sbin/bhyve/pci_virtio_net.c
@@ -640,6 +640,7 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+ pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
/* Link is up if we managed to open tap device. */
sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0);
diff --git a/usr.sbin/bhyve/pci_virtio_rnd.c b/usr.sbin/bhyve/pci_virtio_rnd.c
index 92d1d6f..78448f5 100644
--- a/usr.sbin/bhyve/pci_virtio_rnd.c
+++ b/usr.sbin/bhyve/pci_virtio_rnd.c
@@ -170,6 +170,7 @@ pci_vtrnd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_ENTROPY);
+ pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix()))
return (1);
diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c
index ba6a9d2..69dfaae 100644
--- a/usr.sbin/bhyve/task_switch.c
+++ b/usr.sbin/bhyve/task_switch.c
@@ -202,7 +202,8 @@ desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
*/
static int
desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
- uint16_t sel, struct user_segment_descriptor *desc, bool doread)
+ uint16_t sel, struct user_segment_descriptor *desc, bool doread,
+ int *faultptr)
{
struct iovec iov[2];
uint64_t base;
@@ -215,28 +216,30 @@ desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
assert(limit >= SEL_LIMIT(sel));
error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
- sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov));
- if (error == 0) {
- if (doread)
- vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
- else
- vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
- }
- return (error);
+ sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
+ faultptr);
+ if (error || *faultptr)
+ return (error);
+
+ if (doread)
+ vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
+ else
+ vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
+ return (0);
}
static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
- uint16_t sel, struct user_segment_descriptor *desc)
+ uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
- return (desc_table_rw(ctx, vcpu, paging, sel, desc, true));
+ return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr));
}
static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
- uint16_t sel, struct user_segment_descriptor *desc)
+ uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
- return (desc_table_rw(ctx, vcpu, paging, sel, desc, false));
+ return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr));
}
/*
@@ -248,7 +251,7 @@ desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
*/
static int
read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
- uint16_t sel, struct user_segment_descriptor *desc)
+ uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
struct vm_guest_paging sup_paging;
int error;
@@ -267,7 +270,7 @@ read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
sup_paging = ts->paging;
sup_paging.cpl = 0; /* implicit supervisor mode */
- error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc);
+ error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr);
return (error);
}
@@ -301,14 +304,10 @@ ldt_desc(int sd_type)
/*
* Validate the descriptor 'seg_desc' associated with 'segment'.
- *
- * Returns 0 on success.
- * Returns 1 if an exception was injected into the guest.
- * Returns -1 otherwise.
*/
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
- int segment, struct seg_desc *seg_desc)
+ int segment, struct seg_desc *seg_desc, int *faultptr)
{
struct vm_guest_paging sup_paging;
struct user_segment_descriptor usd;
@@ -369,8 +368,8 @@ validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
/* Read the descriptor from the GDT/LDT */
sup_paging = ts->paging;
sup_paging.cpl = 0; /* implicit supervisor mode */
- error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd);
- if (error)
+ error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr);
+ if (error || *faultptr)
return (error);
/* Verify that the descriptor type is compatible with the segment */
@@ -476,14 +475,10 @@ update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
/*
* Update the vcpu registers to reflect the state of the new task.
- *
- * Returns 0 on success.
- * Returns 1 if an exception was injected into the guest.
- * Returns -1 otherwise.
*/
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
- uint16_t ot_sel, struct tss32 *tss, struct iovec *iov)
+ uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
struct seg_desc seg_desc, seg_desc2;
uint64_t *pdpte, maxphyaddr, reserved;
@@ -565,8 +560,9 @@ tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));
/* Validate segment descriptors */
- error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc);
- if (error)
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
+ faultptr);
+ if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);
@@ -579,33 +575,40 @@ tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
* VM-entry checks so the guest can handle any exception injected
* during task switch emulation.
*/
- error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc);
- if (error)
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
+ faultptr);
+ if (error || *faultptr)
return (error);
- error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2);
- if (error)
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
+ faultptr);
+ if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;
- error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc);
- if (error)
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
+ faultptr);
+ if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);
- error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc);
- if (error)
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
+ faultptr);
+ if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);
- error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc);
- if (error)
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
+ faultptr);
+ if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);
- error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc);
- if (error)
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
+ faultptr);
+ if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);
@@ -616,14 +619,10 @@ tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
* Push an error code on the stack of the new task. This is needed if the
* task switch was triggered by a hardware exception that causes an error
* code to be saved (e.g. #PF).
- *
- * Returns 0 on success.
- * Returns 1 if an exception was injected into the guest.
- * Returns -1 otherwise.
*/
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
- int task_type, uint32_t errcode)
+ int task_type, uint32_t errcode, int *faultptr)
{
struct iovec iov[2];
struct seg_desc seg_desc;
@@ -632,6 +631,8 @@ push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
uint32_t esp;
uint16_t stacksel;
+ *faultptr = 0;
+
cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
@@ -666,17 +667,19 @@ push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
&seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
- return (1);
+ *faultptr = 1;
+ return (0);
}
if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
vm_inject_ac(ctx, vcpu, 1);
- return (1);
+ *faultptr = 1;
+ return (0);
}
error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
- iov, nitems(iov));
- if (error)
+ iov, nitems(iov), faultptr);
+ if (error || *faultptr)
return (error);
vm_copyout(ctx, vcpu, &errcode, iov, bytes);
@@ -687,16 +690,13 @@ push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
/*
* Evaluate return value from helper functions and potentially return to
* the VM run loop.
- * 0: success
- * +1: an exception was injected into the guest vcpu
- * -1: unrecoverable/programming error
*/
-#define CHKERR(x) \
+#define CHKERR(error,fault) \
do { \
- assert(((x) == 0) || ((x) == 1) || ((x) == -1)); \
- if ((x) == -1) \
+ assert((error == 0) || (error == EFAULT)); \
+ if (error) \
return (VMEXIT_ABORT); \
- else if ((x) == 1) \
+ else if (fault) \
return (VMEXIT_CONTINUE); \
} while (0)
@@ -711,7 +711,7 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
struct iovec nt_iov[2], ot_iov[2];
uint64_t cr0, ot_base;
uint32_t eip, ot_lim, access;
- int error, ext, minlimit, nt_type, ot_type, vcpu;
+ int error, ext, fault, minlimit, nt_type, ot_type, vcpu;
enum task_switch_reason reason;
uint16_t nt_sel, ot_sel;
@@ -739,8 +739,9 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
sup_paging.cpl = 0; /* implicit supervisor mode */
/* Fetch the new TSS descriptor */
- error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc);
- CHKERR(error);
+ error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc,
+ &fault);
+ CHKERR(error, fault);
nt = usd_to_seg_desc(&nt_desc);
@@ -792,8 +793,8 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
/* Fetch the new TSS */
error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
- PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov));
- CHKERR(error);
+ PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
+ CHKERR(error, fault);
vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);
/* Get the old TSS selector from the guest's task register */
@@ -818,13 +819,14 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);
/* Fetch the old TSS descriptor */
- error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc);
- CHKERR(error);
+ error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc,
+ &fault);
+ CHKERR(error, fault);
/* Get the old TSS */
error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
- PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov));
- CHKERR(error);
+ PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
+ CHKERR(error, fault);
vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);
/*
@@ -834,8 +836,8 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
if (reason == TSR_IRET || reason == TSR_JMP) {
ot_desc.sd_type &= ~0x2;
error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
- &ot_desc);
- CHKERR(error);
+ &ot_desc, &fault);
+ CHKERR(error, fault);
}
if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
@@ -853,8 +855,8 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
if (reason != TSR_IRET) {
nt_desc.sd_type |= 0x2;
error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
- &nt_desc);
- CHKERR(error);
+ &nt_desc, &fault);
+ CHKERR(error, fault);
}
/* Update task register to point at the new TSS */
@@ -877,8 +879,9 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
assert(error == 0);
/* Load processor state from new TSS */
- error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov);
- CHKERR(error);
+ error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
+ &fault);
+ CHKERR(error, fault);
/*
* Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
@@ -889,8 +892,8 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
assert(task_switch->ext);
assert(task_switch->reason == TSR_IDT_GATE);
error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
- task_switch->errcode);
- CHKERR(error);
+ task_switch->errcode, &fault);
+ CHKERR(error, fault);
}
/*
diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c
index 41a9e42..11b1e62 100644
--- a/usr.sbin/bhyve/virtio.c
+++ b/usr.sbin/bhyve/virtio.c
@@ -316,7 +316,7 @@ vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) {
_vq_record(i, vdir, ctx, iov, n_iov, flags);
i++;
- } else if ((vs->vs_negotiated_caps &
+ } else if ((vs->vs_vc->vc_hv_caps &
VIRTIO_RING_F_INDIRECT_DESC) == 0) {
fprintf(stderr,
"%s: descriptor has forbidden INDIRECT flag, "
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
index 223ee25..7d3017f 100644
--- a/usr.sbin/bhyvectl/bhyvectl.c
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -293,6 +293,7 @@ static int get_guest_pat, get_host_pat;
static int get_guest_sysenter, get_vmcs_link;
static int get_exit_reason, get_vmcs_exit_qualification;
static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error;
+static int get_vmcs_exit_inst_length;
static uint64_t desc_base;
static uint32_t desc_limit, desc_access;
@@ -1145,7 +1146,15 @@ get_misc_vmcs(struct vmctx *ctx, int vcpu)
vcpu, u64);
}
}
-
+
+ if (!error && (get_vmcs_exit_inst_length || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_EXIT_INSTRUCTION_LENGTH, &u64);
+ if (error == 0)
+ printf("vmcs_exit_inst_length[%d]\t0x%08x\n", vcpu,
+ (uint32_t)u64);
+ }
+
if (!error && (get_vmcs_exit_qualification || get_all)) {
error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION,
&u64);
@@ -1405,6 +1414,8 @@ setup_options(bool cpu_intel)
REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO },
{ "get-vmcs-exit-qualification",
NO_ARG, &get_vmcs_exit_qualification, 1 },
+ { "get-vmcs-exit-inst-length",
+ NO_ARG, &get_vmcs_exit_inst_length, 1 },
{ "get-vmcs-interruptibility",
NO_ARG, &get_vmcs_interruptibility, 1 },
{ "get-vmcs-exit-interruption-error",
diff --git a/usr.sbin/bhyveload/bhyveload.c b/usr.sbin/bhyveload/bhyveload.c
index eaf71a8..8ebf116 100644
--- a/usr.sbin/bhyveload/bhyveload.c
+++ b/usr.sbin/bhyveload/bhyveload.c
@@ -609,7 +609,7 @@ disk_open(char *path)
{
int err, fd;
- if (ndisks > NDISKS)
+ if (ndisks >= NDISKS)
return (ERANGE);
err = 0;