From 79b96fdbcb5c0722d7886ea1fe175b7b7e4d751f Mon Sep 17 00:00:00 2001
From: neel <neel@FreeBSD.org>
Date: Sun, 28 Jun 2015 03:22:26 +0000
Subject: MFC r282209: Emulate the 'bit test' instruction.

MFC r282259:
Re-implement RTC current time calculation to eliminate the possibility of
losing time.

MFC r282281:
Advertise the MTRR feature via CPUID and emulate the minimal set of MTRR MSRs.

MFC r282284:
When an instruction cannot be decoded just return to userspace so bhyve(8)
can dump the instruction bytes.

MFC r282287:
Don't require <sys/cpuset.h> to be always included before <machine/vmm.h>.

MFC r282296:
Emulate MSR_SYSCFG which is accessed by Linux on AMD cpus when MTRRs are
enabled.

MFC r282301:
Relax limits when transitioning a vector from the IRR to the ISR and also
when extinguishing it from the ISR in response to an EOI.

MFC r282335:
Advertise an additional memory BAR in the "dummy" device emulation.

MFC r282336:
Emulate machine check related MSRs to allow guest OSes like Windows to boot.

MFC r282351:
Don't advertise the Intel SMX capability to the guest.

MFC r282407:
Emulate the 'CMP r/m8, imm8' instruction.

MFC r282519:
Add macros for AMD-specific bits in MSR_EFER: LMSLE, FFXSR and TCE.

MFC r282520:
Emulate guest writes to EFER_MSR properly.

MFC r282558:
Deprecate the 3-way return values from vm_gla2gpa() and vm_copy_setup().

MFC r282571:
Check 'td_owepreempt' and yield the vcpu thread if it is set.

MFC r282595:
Allow byte reads of AHCI registers.

MFC r282784:
Handling indirect descriptors is a capability of the host and not one that
needs to be negotiated. Use the host capabilities field and not the negotiated
field when verifying that indirect descriptors are supported.

MFC r282788:
Allow configuration of the sector size advertised to the guest.

MFC r282865:
Set the subvendor field in config space to the vendor ID. This is required
by the Windows virtio drivers to correctly match a device.

MFC r282922:
Bump the size of the blockif scatter-gather list to 67.

MFC r283075:
Fix off-by-one in array index bounds check. bhyveload would allow you to
create 33 entries on an array that only has 32 slots

MFC r283168:
Temporarily revert r282922 which bumped the max descriptors.

MFC r283255:
Emulate the "CMP r/m, reg" instruction (opcode 39H).

MFC r283256:
Add an option "--get-vmcs-exit-inst-length" to display the instruction length
of the instruction that caused the VM-exit.

MFC r283264:
Change the header type of the emulated host-bridge from type 1 to type 0.

MFC r283293:
Don't rely on the 'VM-exit instruction length' field in the VMCS to always
have an accurate length on an EPT violation.

MFC r283299:
Remove bogus verification of instruction length after instruction decode.

MFC r283308:
Exceptions don't deliver an error code in real mode.

MFC r283657:
Fix non-deterministic delays when accessing a vcpu that was in "running" or
"sleeping" state.

MFC r283973:
Use tunable 'hw.vmm.svm.features' to disable specific SVM features even
though they might be available in hardware. Use tunable 'hw.vmm.svm.num_asids'
to limit the number of ASIDs used by the hypervisor.

MFC r284046:
Fix regression in 'verify_gla()' with the RIP-relative addressing mode.

MFC r284174:
Support guest writes to the TSC by enabling the "use TSC offsetting"
execution control.
---
 sys/amd64/include/vmm.h                  |  49 +++++--
 sys/amd64/include/vmm_instruction_emul.h |  12 +-
 sys/amd64/vmm/amd/amdv.c                 |   1 -
 sys/amd64/vmm/amd/svm.c                  | 125 +++++++++++++---
 sys/amd64/vmm/amd/svm_msr.c              |  31 +++-
 sys/amd64/vmm/amd/vmcb.c                 |   1 -
 sys/amd64/vmm/intel/vmx.c                |  38 ++++-
 sys/amd64/vmm/intel/vmx.h                |   2 +
 sys/amd64/vmm/intel/vmx_msr.c            |  26 +++-
 sys/amd64/vmm/io/vatpic.c                |   1 -
 sys/amd64/vmm/io/vatpit.c                |   1 -
 sys/amd64/vmm/io/vhpet.c                 |   1 -
 sys/amd64/vmm/io/vioapic.c               |   1 -
 sys/amd64/vmm/io/vlapic.c                |  20 ++-
 sys/amd64/vmm/io/vpmtmr.c                |   1 -
 sys/amd64/vmm/io/vrtc.c                  |  54 ++++---
 sys/amd64/vmm/vmm.c                      | 175 ++++++++++++++++-------
 sys/amd64/vmm/vmm_dev.c                  |  14 +-
 sys/amd64/vmm/vmm_instruction_emul.c     | 237 ++++++++++++++++++++-----------
 sys/amd64/vmm/vmm_ioport.c               |   6 -
 sys/amd64/vmm/vmm_stat.c                 |   2 +-
 sys/amd64/vmm/vmm_stat.h                 |   1 +
 sys/amd64/vmm/x86.c                      |  53 +++++--
 sys/amd64/vmm/x86.h                      |  13 ++
 24 files changed, 615 insertions(+), 250 deletions(-)

(limited to 'sys/amd64')

diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 52294bd..1a4e5ab 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -120,13 +120,18 @@ struct vm_object;
 struct vm_guest_paging;
 struct pmap;
 
+struct vm_eventinfo {
+	void	*rptr;		/* rendezvous cookie */
+	int	*sptr;		/* suspend cookie */
+	int	*iptr;		/* reqidle cookie */
+};
+
 typedef int	(*vmm_init_func_t)(int ipinum);
 typedef int	(*vmm_cleanup_func_t)(void);
 typedef void	(*vmm_resume_func_t)(void);
 typedef void *	(*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
 typedef int	(*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
-				  struct pmap *pmap, void *rendezvous_cookie,
-				  void *suspend_cookie);
+		    struct pmap *pmap, struct vm_eventinfo *info);
 typedef void	(*vmi_cleanup_func_t)(void *vmi);
 typedef int	(*vmi_get_register_t)(void *vmi, int vcpu, int num,
 				      uint64_t *retval);
@@ -204,13 +209,13 @@ int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
 int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
 int vm_apicid2vcpuid(struct vm *vm, int apicid);
 int vm_activate_cpu(struct vm *vm, int vcpu);
-cpuset_t vm_active_cpus(struct vm *vm);
-cpuset_t vm_suspended_cpus(struct vm *vm);
 struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
 void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip);
 void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip);
 void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
+void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip);
 
+#ifdef _SYS__CPUSET_H_
 /*
  * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'.
  * The rendezvous 'func(arg)' is not allowed to do anything that will
@@ -228,19 +233,29 @@ void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
 typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg);
 void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
     vm_rendezvous_func_t func, void *arg);
+cpuset_t vm_active_cpus(struct vm *vm);
+cpuset_t vm_suspended_cpus(struct vm *vm);
+#endif	/* _SYS__CPUSET_H_ */
 
 static __inline int
-vcpu_rendezvous_pending(void *rendezvous_cookie)
+vcpu_rendezvous_pending(struct vm_eventinfo *info)
 {
 
-	return (*(uintptr_t *)rendezvous_cookie != 0);
+	return (*((uintptr_t *)(info->rptr)) != 0);
 }
 
 static __inline int
-vcpu_suspended(void *suspend_cookie)
+vcpu_suspended(struct vm_eventinfo *info)
 {
 
-	return (*(int *)suspend_cookie);
+	return (*info->sptr);
+}
+
+static __inline int
+vcpu_reqidle(struct vm_eventinfo *info)
+{
+
+	return (*info->iptr);
 }
 
 /*
@@ -274,7 +289,13 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
 static int __inline
 vcpu_should_yield(struct vm *vm, int vcpu)
 {
-	return (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED));
+
+	if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED))
+		return (1);
+	else if (curthread->td_owepreempt)
+		return (1);
+	else
+		return (0);
 }
 #endif
 
@@ -343,9 +364,10 @@ struct vm_copyinfo {
  * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
  * a copyin or PROT_WRITE for a copyout. 
  *
- * Returns 0 on success.
- * Returns 1 if an exception was injected into the guest.
- * Returns -1 otherwise.
+ * retval	is_fault	Intepretation
+ *   0		   0		Success
+ *   0		   1		An exception was injected into the guest
+ * EFAULT	  N/A		Unrecoverable error
  *
  * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
  * the return value is 0. The 'copyinfo[]' resources should be freed by calling
@@ -353,7 +375,7 @@ struct vm_copyinfo {
  */
 int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
-    int num_copyinfo);
+    int num_copyinfo, int *is_fault);
 void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
     int num_copyinfo);
 void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
@@ -497,6 +519,7 @@ enum vm_exitcode {
 	VM_EXITCODE_MONITOR,
 	VM_EXITCODE_MWAIT,
 	VM_EXITCODE_SVM,
+	VM_EXITCODE_REQIDLE,
 	VM_EXITCODE_MAX
 };
 
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
index 651b3b3..5e7127f 100644
--- a/sys/amd64/include/vmm_instruction_emul.h
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -81,17 +81,19 @@ int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
  */
 int vmm_fetch_instruction(struct vm *vm, int cpuid,
 			  struct vm_guest_paging *guest_paging,
-			  uint64_t rip, int inst_length, struct vie *vie);
+			  uint64_t rip, int inst_length, struct vie *vie,
+			  int *is_fault);
 
 /*
  * Translate the guest linear address 'gla' to a guest physical address.
  *
- * Returns 0 on success and '*gpa' contains the result of the translation.
- * Returns 1 if an exception was injected into the guest.
- * Returns -1 otherwise.
+ * retval	is_fault	Interpretation
+ *   0		   0		'gpa' contains result of the translation
+ *   0		   1		An exception was injected into the guest
+ * EFAULT	  N/A		An unrecoverable hypervisor error occurred
  */
 int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
-    uint64_t gla, int prot, uint64_t *gpa);
+    uint64_t gla, int prot, uint64_t *gpa, int *is_fault);
 
 void vie_init(struct vie *vie, const char *inst_bytes, int inst_length);
 
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
index acb3a3d..3157e21 100644
--- a/sys/amd64/vmm/amd/amdv.c
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
-#include <sys/smp.h>
 
 #include <machine/vmm.h>
 #include "io/iommu.h"
diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c
index 7cc13ca..b25d69d 100644
--- a/sys/amd64/vmm/amd/svm.c
+++ b/sys/amd64/vmm/amd/svm.c
@@ -102,8 +102,8 @@ static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");
 /* Per-CPU context area. */
 extern struct pcpu __pcpu[];
 
-static uint32_t svm_feature;	/* AMD SVM features. */
-SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RD, &svm_feature, 0,
+static uint32_t svm_feature = ~0U;	/* AMD SVM features. */
+SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0,
     "SVM features advertised by CPUID.8000000AH:EDX");
 
 static int disable_npf_assist;
@@ -112,7 +112,7 @@ SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN,
 
 /* Maximum ASIDs supported by the processor */
 static uint32_t nasid;
-SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RD, &nasid, 0,
+SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0,
     "Number of ASIDs supported by this processor");
 
 /* Current ASID generation for each host cpu */
@@ -174,9 +174,14 @@ check_svm_features(void)
 
 	/* CPUID Fn8000_000A is for SVM */
 	do_cpuid(0x8000000A, regs);
-	svm_feature = regs[3];
+	svm_feature &= regs[3];
 
-	nasid = regs[1];
+	/*
+	 * The number of ASIDs can be configured to be less than what is
+	 * supported by the hardware but not more.
+	 */
+	if (nasid == 0 || nasid > regs[1])
+		nasid = regs[1];
 	KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid));
 
 	/* bhyve requires the Nested Paging feature */
@@ -564,6 +569,19 @@ svm_vminit(struct vm *vm, pmap_t pmap)
 	return (svm_sc);
 }
 
+/*
+ * Collateral for a generic SVM VM-exit.
+ */
+static void
+vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
+{
+
+	vme->exitcode = VM_EXITCODE_SVM;
+	vme->u.svm.exitcode = code;
+	vme->u.svm.exitinfo1 = info1;
+	vme->u.svm.exitinfo2 = info2;
+}
+
 static int
 svm_cpl(struct vmcb_state *state)
 {
@@ -1080,6 +1098,76 @@ clear_nmi_blocking(struct svm_softc *sc, int vcpu)
 	KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error));
 }
 
+#define	EFER_MBZ_BITS	0xFFFFFFFFFFFF0200UL
+
+static int
+svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu)
+{
+	struct vm_exit *vme;
+	struct vmcb_state *state;
+	uint64_t changed, lma, oldval;
+	int error;
+
+	state = svm_get_vmcb_state(sc, vcpu);
+
+	oldval = state->efer;
+	VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval);
+
+	newval &= ~0xFE;		/* clear the Read-As-Zero (RAZ) bits */
+	changed = oldval ^ newval;
+
+	if (newval & EFER_MBZ_BITS)
+		goto gpf;
+
+	/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
+	if (changed & EFER_LME) {
+		if (state->cr0 & CR0_PG)
+			goto gpf;
+	}
+
+	/* EFER.LMA = EFER.LME & CR0.PG */
+	if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0)
+		lma = EFER_LMA;
+	else
+		lma = 0;
+
+	if ((newval & EFER_LMA) != lma)
+		goto gpf;
+
+	if (newval & EFER_NXE) {
+		if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE))
+			goto gpf;
+	}
+
+	/*
+	 * XXX bhyve does not enforce segment limits in 64-bit mode. Until
+	 * this is fixed flag guest attempt to set EFER_LMSLE as an error.
+	 */
+	if (newval & EFER_LMSLE) {
+		vme = vm_exitinfo(sc->vm, vcpu);
+		vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0);
+		*retu = true;
+		return (0);
+	}
+
+	if (newval & EFER_FFXSR) {
+		if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR))
+			goto gpf;
+	}
+
+	if (newval & EFER_TCE) {
+		if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE))
+			goto gpf;
+	}
+
+	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
+	KASSERT(error == 0, ("%s: error %d updating efer", __func__, error));
+	return (0);
+gpf:
+	vm_inject_gp(sc->vm, vcpu);
+	return (0);
+}
+
 static int
 emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val,
     bool *retu)
@@ -1089,7 +1177,7 @@ emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val,
 	if (lapic_msr(num))
 		error = lapic_wrmsr(sc->vm, vcpu, num, val, retu);
 	else if (num == MSR_EFER)
-		error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, val);
+		error = svm_write_efer(sc, vcpu, val, retu);
 	else
 		error = svm_wrmsr(sc, vcpu, num, val, retu);
 
@@ -1189,19 +1277,6 @@ nrip_valid(uint64_t exitcode)
 	}
 }
 
-/*
- * Collateral for a generic SVM VM-exit.
- */
-static void
-vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
-{
-
-	vme->exitcode = VM_EXITCODE_SVM;
-	vme->u.svm.exitcode = code;
-	vme->u.svm.exitinfo1 = info1;
-	vme->u.svm.exitinfo2 = info2;
-}
-
 static int
 svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
 {
@@ -1830,7 +1905,7 @@ enable_gintr(void)
  */
 static int
 svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, 
-	void *rend_cookie, void *suspended_cookie)
+	struct vm_eventinfo *evinfo)
 {
 	struct svm_regctx *gctx;
 	struct svm_softc *svm_sc;
@@ -1905,18 +1980,24 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
 		 */
 		disable_gintr();
 
-		if (vcpu_suspended(suspended_cookie)) {
+		if (vcpu_suspended(evinfo)) {
 			enable_gintr();
 			vm_exit_suspended(vm, vcpu, state->rip);
 			break;
 		}
 
-		if (vcpu_rendezvous_pending(rend_cookie)) {
+		if (vcpu_rendezvous_pending(evinfo)) {
 			enable_gintr();
 			vm_exit_rendezvous(vm, vcpu, state->rip);
 			break;
 		}
 
+		if (vcpu_reqidle(evinfo)) {
+			enable_gintr();
+			vm_exit_reqidle(vm, vcpu, state->rip);
+			break;
+		}
+
 		/* We are asked to give the cpu by scheduler. */
 		if (vcpu_should_yield(vm, vcpu)) {
 			enable_gintr();
diff --git a/sys/amd64/vmm/amd/svm_msr.c b/sys/amd64/vmm/amd/svm_msr.c
index 100af4b..088751a 100644
--- a/sys/amd64/vmm/amd/svm_msr.c
+++ b/sys/amd64/vmm/amd/svm_msr.c
@@ -27,12 +27,17 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include <sys/types.h>
+#include <sys/param.h>
 #include <sys/errno.h>
+#include <sys/systm.h>
 
 #include <machine/cpufunc.h>
 #include <machine/specialreg.h>
+#include <machine/vmm.h>
 
+#include "svm.h"
+#include "vmcb.h"
+#include "svm_softc.h"
 #include "svm_msr.h"
 
 #ifndef MSR_AMDK8_IPM
@@ -105,6 +110,18 @@ svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result,
 	int error = 0;
 
 	switch (num) {
+	case MSR_MCG_CAP:
+	case MSR_MCG_STATUS:
+		*result = 0;
+		break;
+	case MSR_MTRRcap:
+	case MSR_MTRRdefType:
+	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
+	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+	case MSR_MTRR64kBase:
+	case MSR_SYSCFG:
+		*result = 0;
+		break;
 	case MSR_AMDK8_IPM:
 		*result = 0;
 		break;
@@ -122,6 +139,18 @@ svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu)
 	int error = 0;
 
 	switch (num) {
+	case MSR_MCG_CAP:
+	case MSR_MCG_STATUS:
+		break;		/* ignore writes */
+	case MSR_MTRRcap:
+		vm_inject_gp(sc->vm, vcpu);
+		break;
+	case MSR_MTRRdefType:
+	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
+	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+	case MSR_MTRR64kBase:
+	case MSR_SYSCFG:
+		break;		/* Ignore writes */
 	case MSR_AMDK8_IPM:
 		/*
 		 * Ignore writes to the "Interrupt Pending Message" MSR.
diff --git a/sys/amd64/vmm/amd/vmcb.c b/sys/amd64/vmm/amd/vmcb.c
index fb4b2c8..d860169 100644
--- a/sys/amd64/vmm/amd/vmcb.c
+++ b/sys/amd64/vmm/amd/vmcb.c
@@ -29,7 +29,6 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/cpuset.h>
 
 #include <machine/segments.h>
 #include <machine/specialreg.h>
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 9aa55e2..5bbfe99 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -857,10 +857,11 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
 	 * VM exit and entry respectively. It is also restored from the
 	 * host VMCS area on a VM exit.
 	 *
-	 * The TSC MSR is exposed read-only. Writes are disallowed as that
-	 * will impact the host TSC.
-	 * XXX Writes would be implemented with a wrmsr trap, and
-	 * then modifying the TSC offset in the VMCS.
+	 * The TSC MSR is exposed read-only. Writes are disallowed as
+	 * that will impact the host TSC.  If the guest does a write
+	 * the "use TSC offsetting" execution control is enabled and the
+	 * difference between the host TSC and the guest TSC is written
+	 * into the TSC offset in the VMCS.
 	 */
 	if (guest_msr_rw(vmx, MSR_GSBASE) ||
 	    guest_msr_rw(vmx, MSR_FSBASE) ||
@@ -1131,6 +1132,22 @@ vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
 	VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
 }
 
+int
+vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
+{
+	int error;
+
+	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) {
+		vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET;
+		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+		VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting");
+	}
+
+	error = vmwrite(VMCS_TSC_OFFSET, offset);
+
+	return (error);
+}
+
 #define	NMI_BLOCKING	(VMCS_INTERRUPTIBILITY_NMI_BLOCKING |		\
 			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
 #define	HWINTR_BLOCKING	(VMCS_INTERRUPTIBILITY_STI_BLOCKING |		\
@@ -1781,6 +1798,7 @@ vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
 	paging = &vmexit->u.inst_emul.paging;
 
 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
+	vmexit->inst_length = 0;
 	vmexit->u.inst_emul.gpa = gpa;
 	vmexit->u.inst_emul.gla = gla;
 	vmx_paging_info(paging);
@@ -2554,7 +2572,7 @@ vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
 
 static int
 vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
-    void *rendezvous_cookie, void *suspend_cookie)
+    struct vm_eventinfo *evinfo)
 {
 	int rc, handled, launched;
 	struct vmx *vmx;
@@ -2623,18 +2641,24 @@ vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
 		 * vmx_inject_interrupts() can suspend the vcpu due to a
 		 * triple fault.
 		 */
-		if (vcpu_suspended(suspend_cookie)) {
+		if (vcpu_suspended(evinfo)) {
 			enable_intr();
 			vm_exit_suspended(vmx->vm, vcpu, rip);
 			break;
 		}
 
-		if (vcpu_rendezvous_pending(rendezvous_cookie)) {
+		if (vcpu_rendezvous_pending(evinfo)) {
 			enable_intr();
 			vm_exit_rendezvous(vmx->vm, vcpu, rip);
 			break;
 		}
 
+		if (vcpu_reqidle(evinfo)) {
+			enable_intr();
+			vm_exit_reqidle(vmx->vm, vcpu, rip);
+			break;
+		}
+
 		if (vcpu_should_yield(vm, vcpu)) {
 			enable_intr();
 			vm_exit_astpending(vmx->vm, vcpu, rip);
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
index bc48861..57f5b28 100644
--- a/sys/amd64/vmm/intel/vmx.h
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -135,6 +135,8 @@ void	vmx_call_isr(uintptr_t entry);
 u_long	vmx_fix_cr0(u_long cr0);
 u_long	vmx_fix_cr4(u_long cr4);
 
+int	vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset);
+
 extern char	vmx_exit_guest[];
 
 #endif
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
index e517778..91b2c01 100644
--- a/sys/amd64/vmm/intel/vmx_msr.c
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -31,7 +31,6 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/cpuset.h>
 
 #include <machine/clock.h>
 #include <machine/cpufunc.h>
@@ -396,6 +395,17 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
 	error = 0;
 
 	switch (num) {
+	case MSR_MCG_CAP:
+	case MSR_MCG_STATUS:
+		*val = 0;
+		break;
+	case MSR_MTRRcap:
+	case MSR_MTRRdefType:
+	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
+	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+	case MSR_MTRR64kBase:
+		*val = 0;
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		*val = misc_enable;
 		break;
@@ -427,6 +437,17 @@ vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
 	error = 0;
 
 	switch (num) {
+	case MSR_MCG_CAP:
+	case MSR_MCG_STATUS:
+		break;		/* ignore writes */
+	case MSR_MTRRcap:
+		vm_inject_gp(vmx->vm, vcpuid);
+		break;
+	case MSR_MTRRdefType:
+	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
+	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+	case MSR_MTRR64kBase:
+		break;		/* Ignore writes */
 	case MSR_IA32_MISC_ENABLE:
 		changed = val ^ misc_enable;
 		/*
@@ -453,6 +474,9 @@ vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
 		else
 			vm_inject_gp(vmx->vm, vcpuid);
 		break;
+	case MSR_TSC:
+		error = vmx_set_tsc_offset(vmx, vcpuid, val - rdtsc());
+		break;
 	default:
 		error = EINVAL;
 		break;
diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c
index 0df6e7c..6e94f5b 100644
--- a/sys/amd64/vmm/io/vatpic.c
+++ b/sys/amd64/vmm/io/vatpic.c
@@ -30,7 +30,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/queue.h>
-#include <sys/cpuset.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
diff --git a/sys/amd64/vmm/io/vatpit.c b/sys/amd64/vmm/io/vatpit.c
index 842253d..173ef1f 100644
--- a/sys/amd64/vmm/io/vatpit.c
+++ b/sys/amd64/vmm/io/vatpit.c
@@ -31,7 +31,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/queue.h>
-#include <sys/cpuset.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c
index a4c96cd..1db1c51 100644
--- a/sys/amd64/vmm/io/vhpet.c
+++ b/sys/amd64/vmm/io/vhpet.c
@@ -36,7 +36,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/systm.h>
-#include <sys/cpuset.h>
 
 #include <dev/acpica/acpi_hpet.h>
 
diff --git a/sys/amd64/vmm/io/vioapic.c b/sys/amd64/vmm/io/vioapic.c
index 411887d..e6b8b5a 100644
--- a/sys/amd64/vmm/io/vioapic.c
+++ b/sys/amd64/vmm/io/vioapic.c
@@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/queue.h>
-#include <sys/cpuset.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/systm.h>
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index b192735..039491f 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -548,6 +548,8 @@ vlapic_update_ppr(struct vlapic *vlapic)
 	VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
 }
 
+static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt");
+
 static void
 vlapic_process_eoi(struct vlapic *vlapic)
 {
@@ -558,11 +560,7 @@ vlapic_process_eoi(struct vlapic *vlapic)
 	isrptr = &lapic->isr0;
 	tmrptr = &lapic->tmr0;
 
-	/*
-	 * The x86 architecture reserves the the first 32 vectors for use
-	 * by the processor.
-	 */
-	for (i = 7; i > 0; i--) {
+	for (i = 7; i >= 0; i--) {
 		idx = i * 4;
 		bitpos = fls(isrptr[idx]);
 		if (bitpos-- != 0) {
@@ -571,17 +569,21 @@ vlapic_process_eoi(struct vlapic *vlapic)
 				      vlapic->isrvec_stk_top);
 			}
 			isrptr[idx] &= ~(1 << bitpos);
+			vector = i * 32 + bitpos;
+			VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d",
+			    vector);
 			VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
 			vlapic->isrvec_stk_top--;
 			vlapic_update_ppr(vlapic);
 			if ((tmrptr[idx] & (1 << bitpos)) != 0) {
-				vector = i * 32 + bitpos;
 				vioapic_process_eoi(vlapic->vm, vlapic->vcpuid,
 				    vector);
 			}
 			return;
 		}
 	}
+	VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI");
+	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1);
 }
 
 static __inline int
@@ -1093,11 +1095,7 @@ vlapic_pending_intr(struct vlapic *vlapic, int *vecptr)
 
 	irrptr = &lapic->irr0;
 
-	/*
-	 * The x86 architecture reserves the the first 32 vectors for use
-	 * by the processor.
-	 */
-	for (i = 7; i > 0; i--) {
+	for (i = 7; i >= 0; i--) {
 		idx = i * 4;
 		val = atomic_load_acq_int(&irrptr[idx]);
 		bitpos = fls(val);
diff --git a/sys/amd64/vmm/io/vpmtmr.c b/sys/amd64/vmm/io/vpmtmr.c
index 09f763f..1e7bb93 100644
--- a/sys/amd64/vmm/io/vpmtmr.c
+++ b/sys/amd64/vmm/io/vpmtmr.c
@@ -29,7 +29,6 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/queue.h>
-#include <sys/cpuset.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/systm.h>
diff --git a/sys/amd64/vmm/io/vrtc.c b/sys/amd64/vmm/io/vrtc.c
index 9d406c1..18ebc4b 100644
--- a/sys/amd64/vmm/io/vrtc.c
+++ b/sys/amd64/vmm/io/vrtc.c
@@ -30,7 +30,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/queue.h>
-#include <sys/cpuset.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/lock.h>
@@ -142,20 +141,23 @@ update_enabled(struct vrtc *vrtc)
 }
 
 static time_t
-vrtc_curtime(struct vrtc *vrtc)
+vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime)
 {
 	sbintime_t now, delta;
-	time_t t;
+	time_t t, secs;
 
 	KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
 
 	t = vrtc->base_rtctime;
+	*basetime = vrtc->base_uptime;
 	if (update_enabled(vrtc)) {
 		now = sbinuptime();
 		delta = now - vrtc->base_uptime;
 		KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: "
 		    "%#lx to %#lx", vrtc->base_uptime, now));
-		t += delta / SBT_1S;
+		secs = delta / SBT_1S;
+		t += secs;
+		*basetime += secs * SBT_1S;
 	}
 	return (t);
 }
@@ -390,9 +392,10 @@ fail:
 }
 
 static int
-vrtc_time_update(struct vrtc *vrtc, time_t newtime)
+vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase)
 {
 	struct rtcdev *rtc;
+	sbintime_t oldbase;
 	time_t oldtime;
 	uint8_t alarm_sec, alarm_min, alarm_hour;
 
@@ -404,16 +407,21 @@ vrtc_time_update(struct vrtc *vrtc, time_t newtime)
 	alarm_hour = rtc->alarm_hour;
 
 	oldtime = vrtc->base_rtctime;
-	VM_CTR2(vrtc->vm, "Updating RTC time from %#lx to %#lx",
+	VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx",
 	    oldtime, newtime);
 
+	oldbase = vrtc->base_uptime;
+	VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx",
+	    oldbase, newbase);
+	vrtc->base_uptime = newbase;
+
 	if (newtime == oldtime)
 		return (0);
 
 	/*
 	 * If 'newtime' indicates that RTC updates are disabled then just
 	 * record that and return. There is no need to do alarm interrupt
-	 * processing or update 'base_uptime' in this case.
+	 * processing in this case.
 	 */
 	if (newtime == VRTC_BROKEN_TIME) {
 		vrtc->base_rtctime = VRTC_BROKEN_TIME;
@@ -459,8 +467,6 @@ vrtc_time_update(struct vrtc *vrtc, time_t newtime)
 	if (uintr_enabled(vrtc))
 		vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE);
 
-	vrtc->base_uptime = sbinuptime();
-
 	return (0);
 }
 
@@ -531,7 +537,7 @@ static void
 vrtc_callout_handler(void *arg)
 {
 	struct vrtc *vrtc = arg;
-	sbintime_t freqsbt;
+	sbintime_t freqsbt, basetime;
 	time_t rtctime;
 	int error;
 
@@ -553,8 +559,8 @@ vrtc_callout_handler(void *arg)
 		vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD);
 
 	if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) {
-		rtctime = vrtc_curtime(vrtc);
-		error = vrtc_time_update(vrtc, rtctime);
+		rtctime = vrtc_curtime(vrtc, &basetime);
+		error = vrtc_time_update(vrtc, rtctime, basetime);
 		KASSERT(error == 0, ("%s: vrtc_time_update error %d",
 		    __func__, error));
 	}
@@ -619,7 +625,7 @@ static int
 vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval)
 {
 	struct rtcdev *rtc;
-	sbintime_t oldfreq, newfreq;
+	sbintime_t oldfreq, newfreq, basetime;
 	time_t curtime, rtctime;
 	int error;
 	uint8_t oldval, changed;
@@ -640,12 +646,13 @@ vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval)
 	if (changed & RTCSB_HALT) {
 		if ((newval & RTCSB_HALT) == 0) {
 			rtctime = rtc_to_secs(vrtc);
+			basetime = sbinuptime();
 			if (rtctime == VRTC_BROKEN_TIME) {
 				if (rtc_flag_broken_time)
 					return (-1);
 			}
 		} else {
-			curtime = vrtc_curtime(vrtc);
+			curtime = vrtc_curtime(vrtc, &basetime);
 			KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch "
 			    "between vrtc basetime (%#lx) and curtime (%#lx)",
 			    __func__, vrtc->base_rtctime, curtime));
@@ -664,7 +671,7 @@ vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval)
 			rtctime = VRTC_BROKEN_TIME;
 			rtc->reg_b &= ~RTCSB_UINTR;
 		}
-		error = vrtc_time_update(vrtc, rtctime);
+		error = vrtc_time_update(vrtc, rtctime, basetime);
 		KASSERT(error == 0, ("vrtc_time_update error %d", error));
 	}
 
@@ -744,7 +751,7 @@ vrtc_set_time(struct vm *vm, time_t secs)
 
 	vrtc = vm_rtc(vm);
 	VRTC_LOCK(vrtc);
-	error = vrtc_time_update(vrtc, secs);
+	error = vrtc_time_update(vrtc, secs, sbinuptime());
 	VRTC_UNLOCK(vrtc);
 
 	if (error) {
@@ -761,11 +768,12 @@ time_t
 vrtc_get_time(struct vm *vm)
 {
 	struct vrtc *vrtc;
+	sbintime_t basetime;
 	time_t t;
 
 	vrtc = vm_rtc(vm);
 	VRTC_LOCK(vrtc);
-	t = vrtc_curtime(vrtc);
+	t = vrtc_curtime(vrtc, &basetime);
 	VRTC_UNLOCK(vrtc);
 
 	return (t);
@@ -802,6 +810,7 @@ int
 vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval)
 {
 	struct vrtc *vrtc;
+	sbintime_t basetime;
 	time_t curtime;
 	uint8_t *ptr;
 
@@ -818,7 +827,7 @@ vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval)
 	 * Update RTC date/time fields if necessary.
 	 */
 	if (offset < 10 || offset == RTC_CENTURY) {
-		curtime = vrtc_curtime(vrtc);
+		curtime = vrtc_curtime(vrtc, &basetime);
 		secs_to_rtc(curtime, vrtc, 0);
 	}
 
@@ -858,6 +867,7 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
 {
 	struct vrtc *vrtc;
 	struct rtcdev *rtc;
+	sbintime_t basetime;
 	time_t curtime;
 	int error, offset;
 
@@ -875,8 +885,8 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
 	}
 
 	error = 0;
-	curtime = vrtc_curtime(vrtc);
-	vrtc_time_update(vrtc, curtime);
+	curtime = vrtc_curtime(vrtc, &basetime);
+	vrtc_time_update(vrtc, curtime, basetime);
 
 	/*
 	 * Update RTC date/time fields if necessary.
@@ -939,7 +949,7 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
 		 */
 		if (offset == RTC_CENTURY && !rtc_halted(vrtc)) {
 			curtime = rtc_to_secs(vrtc);
-			error = vrtc_time_update(vrtc, curtime);
+			error = vrtc_time_update(vrtc, curtime, sbinuptime());
 			KASSERT(!error, ("vrtc_time_update error %d", error));
 			if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time)
 				error = -1;
@@ -993,7 +1003,7 @@ vrtc_init(struct vm *vm)
 
 	VRTC_LOCK(vrtc);
 	vrtc->base_rtctime = VRTC_BROKEN_TIME;
-	vrtc_time_update(vrtc, curtime);
+	vrtc_time_update(vrtc, curtime, sbinuptime());
 	secs_to_rtc(curtime, vrtc, 0);
 	VRTC_UNLOCK(vrtc);
 
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 0e78272..c3c7bb1 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -96,6 +96,7 @@ struct vcpu {
 	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
 	enum vcpu_state	state;		/* (o) vcpu state */
 	int		hostcpu;	/* (o) vcpu's host cpu */
+	int		reqidle;	/* (i) request vcpu to idle */
 	struct vlapic	*vlapic;	/* (i) APIC device model */
 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
 	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
@@ -165,8 +166,8 @@ static struct vmm_ops *ops;
 #define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
 
 #define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
-#define	VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
-	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
+#define	VMRUN(vmi, vcpu, rip, pmap, evinfo) \
+	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO)
 #define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
 #define	VMSPACE_ALLOC(min, max) \
 	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
@@ -223,6 +224,28 @@ TUNABLE_INT("hw.vmm.force_iommu", &vmm_force_iommu);
 SYSCTL_INT(_hw_vmm, OID_AUTO, force_iommu, CTLFLAG_RDTUN, &vmm_force_iommu, 0,
     "Force use of I/O MMU even if no passthrough devices were found.");
 
+static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
+
+#ifdef KTR
+static const char *
+vcpu_state2str(enum vcpu_state state)
+{
+
+	switch (state) {
+	case VCPU_IDLE:
+		return ("idle");
+	case VCPU_FROZEN:
+		return ("frozen");
+	case VCPU_RUNNING:
+		return ("running");
+	case VCPU_SLEEPING:
+		return ("sleeping");
+	default:
+		return ("unknown");
+	}
+}
+#endif
+
 static void
 vcpu_cleanup(struct vm *vm, int i, bool destroy)
 {
@@ -257,6 +280,7 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create)
 
 	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
 	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
+	vcpu->reqidle = 0;
 	vcpu->exitintinfo = 0;
 	vcpu->nmi_pending = 0;
 	vcpu->extint_pending = 0;
@@ -982,11 +1006,13 @@ save_guest_fpustate(struct vcpu *vcpu)
 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
 
 static int
-vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
+vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
     bool from_idle)
 {
+	struct vcpu *vcpu;
 	int error;
 
+	vcpu = &vm->vcpu[vcpuid];
 	vcpu_assert_locked(vcpu);
 
 	/*
@@ -995,8 +1021,13 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
 	 * ioctl() operating on a vcpu at any point.
 	 */
 	if (from_idle) {
-		while (vcpu->state != VCPU_IDLE)
+		while (vcpu->state != VCPU_IDLE) {
+			vcpu->reqidle = 1;
+			vcpu_notify_event_locked(vcpu, false);
+			VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
+			    "idle requested", vcpu_state2str(vcpu->state));
 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
+		}
 	} else {
 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
 		    "vcpu idle state"));
@@ -1033,6 +1064,9 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
 	if (error)
 		return (EBUSY);
 
+	VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
+	    vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
+
 	vcpu->state = newstate;
 	if (newstate == VCPU_RUNNING)
 		vcpu->hostcpu = curcpu;
@@ -1055,11 +1089,11 @@ vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
 }
 
 static void
-vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
+vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
 {
 	int error;
 
-	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
+	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
 		panic("Error %d setting state to %d", error, newstate);
 }
 
@@ -1147,7 +1181,7 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
 		 * vcpu returned from VMRUN() and before it acquired the
 		 * vcpu lock above.
 		 */
-		if (vm->rendezvous_func != NULL || vm->suspend)
+		if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
 			break;
 		if (vm_nmi_pending(vm, vcpuid))
 			break;
@@ -1184,13 +1218,13 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
 		}
 
 		t = ticks;
-		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
+		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
 		/*
 		 * XXX msleep_spin() cannot be interrupted by signals so
 		 * wake up periodically to check pending signals.
 		 */
 		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
-		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
+		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
 		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
 	}
 
@@ -1258,11 +1292,14 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
 	mem_region_read_t mread;
 	mem_region_write_t mwrite;
 	enum vm_cpu_mode cpu_mode;
-	int cs_d, error, length;
+	int cs_d, error, fault;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 
+	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
+	    __func__, vme->inst_length));
+
 	gla = vme->u.inst_emul.gla;
 	gpa = vme->u.inst_emul.gpa;
 	cs_base = vme->u.inst_emul.cs_base;
@@ -1275,37 +1312,31 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
 
 	/* Fetch, decode and emulate the faulting instruction */
 	if (vie->num_valid == 0) {
-		/*
-		 * If the instruction length is not known then assume a
-		 * maximum size instruction.
-		 */
-		length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE;
 		error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
-		    cs_base, length, vie);
+		    cs_base, VIE_INST_SIZE, vie, &fault);
 	} else {
 		/*
 		 * The instruction bytes have already been copied into 'vie'
 		 */
-		error = 0;
+		error = fault = 0;
 	}
-	if (error == 1)
-		return (0);		/* Resume guest to handle page fault */
-	else if (error == -1)
-		return (EFAULT);
-	else if (error != 0)
-		panic("%s: vmm_fetch_instruction error %d", __func__, error);
+	if (error || fault)
+		return (error);
 
-	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
-		return (EFAULT);
+	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) {
+		VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx",
+		    vme->rip + cs_base);
+		*retu = true;	    /* dump instruction bytes in userspace */
+		return (0);
+	}
 
 	/*
-	 * If the instruction length was not specified then update it now
-	 * along with 'nextrip'.
+	 * Update 'nextrip' based on the length of the emulated instruction.
 	 */
-	if (vme->inst_length == 0) {
-		vme->inst_length = vie->num_processed;
-		vcpu->nextrip += vie->num_processed;
-	}
+	vme->inst_length = vie->num_processed;
+	vcpu->nextrip += vie->num_processed;
+	VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction "
+	    "decoding", vcpu->nextrip);
  
 	/* return to userland unless this is an in-kernel emulated device */
 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
@@ -1355,9 +1386,9 @@ vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
 
 		if (vm->rendezvous_func == NULL) {
 			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
-			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
+			vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
 			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
-			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
+			vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
 		} else {
 			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
 			vcpu_unlock(vcpu);
@@ -1380,6 +1411,19 @@ vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
 	return (0);
 }
 
+static int
+vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu)
+{
+	struct vcpu *vcpu = &vm->vcpu[vcpuid];
+
+	vcpu_lock(vcpu);
+	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
+	vcpu->reqidle = 0;
+	vcpu_unlock(vcpu);
+	*retu = true;
+	return (0);
+}
+
 int
 vm_suspend(struct vm *vm, enum vm_suspend_how how)
 {
@@ -1437,6 +1481,18 @@ vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
 }
 
 void
+vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
+{
+	struct vm_exit *vmexit;
+
+	vmexit = vm_exitinfo(vm, vcpuid);
+	vmexit->rip = rip;
+	vmexit->inst_length = 0;
+	vmexit->exitcode = VM_EXITCODE_REQIDLE;
+	vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
+}
+
+void
 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vmexit;
@@ -1451,6 +1507,7 @@ vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
 int
 vm_run(struct vm *vm, struct vm_run *vmrun)
 {
+	struct vm_eventinfo evinfo;
 	int error, vcpuid;
 	struct vcpu *vcpu;
 	struct pcb *pcb;
@@ -1458,7 +1515,6 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
 	struct vm_exit *vme;
 	bool retu, intr_disabled;
 	pmap_t pmap;
-	void *rptr, *sptr;
 
 	vcpuid = vmrun->cpuid;
 
@@ -1471,11 +1527,12 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
 		return (EINVAL);
 
-	rptr = &vm->rendezvous_func;
-	sptr = &vm->suspend;
 	pmap = vmspace_pmap(vm->vmspace);
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
+	evinfo.rptr = &vm->rendezvous_func;
+	evinfo.sptr = &vm->suspend;
+	evinfo.iptr = &vcpu->reqidle;
 restart:
 	critical_enter();
 
@@ -1490,7 +1547,7 @@ restart:
 	restore_guest_fpustate(vcpu);
 
 	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
-	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, rptr, sptr);
+	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
 	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
 
 	save_guest_fpustate(vcpu);
@@ -1503,6 +1560,9 @@ restart:
 		retu = false;
 		vcpu->nextrip = vme->rip + vme->inst_length;
 		switch (vme->exitcode) {
+		case VM_EXITCODE_REQIDLE:
+			error = vm_handle_reqidle(vm, vcpuid, &retu);
+			break;
 		case VM_EXITCODE_SUSPENDED:
 			error = vm_handle_suspend(vm, vcpuid, &retu);
 			break;
@@ -1541,6 +1601,8 @@ restart:
 	if (error == 0 && retu == false)
 		goto restart;
 
+	VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
+
 	/* copy the exit information */
 	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
 	return (error);
@@ -1790,6 +1852,7 @@ vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
     uint32_t errcode, int restart_instruction)
 {
 	struct vcpu *vcpu;
+	uint64_t regval;
 	int error;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
@@ -1814,6 +1877,16 @@ vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
 		return (EBUSY);
 	}
 
+	if (errcode_valid) {
+		/*
+		 * Exceptions don't deliver an error code in real mode.
+		 */
+		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
+		KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
+		if (!(regval & CR0_PE))
+			errcode_valid = 0;
+	}
+
 	/*
 	 * From section 26.6.1 "Interruptibility State" in Intel SDM:
 	 *
@@ -2066,7 +2139,7 @@ vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
-	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
+	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
 	vcpu_unlock(vcpu);
 
 	return (error);
@@ -2162,15 +2235,11 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
  * - If the vcpu is running on a different host_cpu then an IPI will be directed
  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
  */
-void
-vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
+static void
+vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
 {
 	int hostcpu;
-	struct vcpu *vcpu;
 
-	vcpu = &vm->vcpu[vcpuid];
-
-	vcpu_lock(vcpu);
 	hostcpu = vcpu->hostcpu;
 	if (vcpu->state == VCPU_RUNNING) {
 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
@@ -2195,6 +2264,15 @@ vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
 		if (vcpu->state == VCPU_SLEEPING)
 			wakeup_one(vcpu);
 	}
+}
+
+void
+vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
+{
+	struct vcpu *vcpu = &vm->vcpu[vcpuid];
+
+	vcpu_lock(vcpu);
+	vcpu_notify_event_locked(vcpu, lapic_intr);
 	vcpu_unlock(vcpu);
 }
 
@@ -2321,7 +2399,7 @@ vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
 int
 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
-    int num_copyinfo)
+    int num_copyinfo, int *fault)
 {
 	int error, idx, nused;
 	size_t n, off, remaining;
@@ -2334,8 +2412,8 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
 	remaining = len;
 	while (remaining > 0) {
 		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
-		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
-		if (error)
+		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
+		if (error || *fault)
 			return (error);
 		off = gpa & PAGE_MASK;
 		n = min(remaining, PAGE_SIZE - off);
@@ -2357,8 +2435,9 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
 
 	if (idx != nused) {
 		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
-		return (-1);
+		return (EFAULT);
 	} else {
+		*fault = 0;
 		return (0);
 	}
 }
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index 5be99cb..e3e140a 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -441,19 +441,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
 		gg = (struct vm_gla2gpa *)data;
 		error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
-		    gg->prot, &gg->gpa);
-		KASSERT(error == 0 || error == 1 || error == -1,
+		    gg->prot, &gg->gpa, &gg->fault);
+		KASSERT(error == 0 || error == EFAULT,
 		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
-		if (error >= 0) {
-			/*
-			 * error = 0: the translation was successful
-			 * error = 1: a fault was injected into the guest
-			 */
-			gg->fault = error;
-			error = 0;
-		} else {
-			error = EFAULT;
-		}
 		break;
 	}
 	case VM_ACTIVATE_CPU:
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index 6f75515..758b7e8 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -73,6 +73,7 @@ enum {
 	VIE_OP_TYPE_MOVS,
 	VIE_OP_TYPE_GROUP1,
 	VIE_OP_TYPE_STOS,
+	VIE_OP_TYPE_BITTEST,
 	VIE_OP_TYPE_LAST
 };
 
@@ -92,6 +93,11 @@ static const struct vie_op two_byte_opcodes[256] = {
 		.op_byte = 0xB7,
 		.op_type = VIE_OP_TYPE_MOVZX,
 	},
+	[0xBA] = {
+		.op_byte = 0xBA,
+		.op_type = VIE_OP_TYPE_BITTEST,
+		.op_flags = VIE_OP_F_IMM8,
+	},
 	[0xBE] = {
 		.op_byte = 0xBE,
 		.op_type = VIE_OP_TYPE_MOVSX,
@@ -107,6 +113,10 @@ static const struct vie_op one_byte_opcodes[256] = {
 		.op_byte = 0x2B,
 		.op_type = VIE_OP_TYPE_SUB,
 	},
+	[0x39] = {
+		.op_byte = 0x39,
+		.op_type = VIE_OP_TYPE_CMP,
+	},
 	[0x3B] = {
 		.op_byte = 0x3B,
 		.op_type = VIE_OP_TYPE_CMP,
@@ -172,14 +182,20 @@ static const struct vie_op one_byte_opcodes[256] = {
 		.op_byte = 0x23,
 		.op_type = VIE_OP_TYPE_AND,
 	},
+	[0x80] = {
+		/* Group 1 extended opcode */
+		.op_byte = 0x80,
+		.op_type = VIE_OP_TYPE_GROUP1,
+		.op_flags = VIE_OP_F_IMM8,
+	},
 	[0x81] = {
-		/* XXX Group 1 extended opcode */
+		/* Group 1 extended opcode */
 		.op_byte = 0x81,
 		.op_type = VIE_OP_TYPE_GROUP1,
 		.op_flags = VIE_OP_F_IMM,
 	},
 	[0x83] = {
-		/* XXX Group 1 extended opcode */
+		/* Group 1 extended opcode */
 		.op_byte = 0x83,
 		.op_type = VIE_OP_TYPE_GROUP1,
 		.op_flags = VIE_OP_F_IMM8,
@@ -585,13 +601,11 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 
 /*
  * Helper function to calculate and validate a linear address.
- *
- * Returns 0 on success and 1 if an exception was injected into the guest.
  */
 static int
 get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
     int opsize, int addrsize, int prot, enum vm_reg_name seg,
-    enum vm_reg_name gpr, uint64_t *gla)
+    enum vm_reg_name gpr, uint64_t *gla, int *fault)
 {
 	struct seg_desc desc;
 	uint64_t cr0, val, rflags;
@@ -617,7 +631,7 @@ get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
 			vm_inject_ss(vm, vcpuid, 0);
 		else
 			vm_inject_gp(vm, vcpuid);
-		return (1);
+		goto guest_fault;
 	}
 
 	if (vie_canonical_check(paging->cpu_mode, *gla)) {
@@ -625,14 +639,19 @@ get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
 			vm_inject_ss(vm, vcpuid, 0);
 		else
 			vm_inject_gp(vm, vcpuid);
-		return (1);
+		goto guest_fault;
 	}
 
 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
 		vm_inject_ac(vm, vcpuid, 0);
-		return (1);
+		goto guest_fault;
 	}
 
+	*fault = 0;
+	return (0);
+
+guest_fault:
+	*fault = 1;
 	return (0);
 }
 
@@ -648,7 +667,7 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 #endif
 	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
 	uint64_t rcx, rdi, rsi, rflags;
-	int error, opsize, seg, repeat;
+	int error, fault, opsize, seg, repeat;
 
 	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
 	val = 0;
@@ -671,8 +690,10 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 		 * The count register is %rcx, %ecx or %cx depending on the
 		 * address size of the instruction.
 		 */
-		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
-			return (0);
+		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
+			error = 0;
+			goto done;
+		}
 	}
 
 	/*
@@ -693,13 +714,16 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 
 	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
 	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
-	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr);
-	if (error)
+	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
+	if (error || fault)
 		goto done;
 
 	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
-	    copyinfo, nitems(copyinfo));
+	    copyinfo, nitems(copyinfo), &fault);
 	if (error == 0) {
+		if (fault)
+			goto done;	/* Resume guest to handle fault */
+
 		/*
 		 * case (2): read from system memory and write to mmio.
 		 */
@@ -708,11 +732,6 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
 		if (error)
 			goto done;
-	} else if (error > 0) {
-		/*
-		 * Resume guest execution to handle fault.
-		 */
-		goto done;
 	} else {
 		/*
 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
@@ -720,13 +739,17 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 		 */
 
 		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
-		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr);
-		if (error)
+		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
+		    &fault);
+		if (error || fault)
 			goto done;
 
 		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
-		    PROT_WRITE, copyinfo, nitems(copyinfo));
+		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
 		if (error == 0) {
+			if (fault)
+				goto done;    /* Resume guest to handle fault */
+
 			/*
 			 * case (3): read from MMIO and write to system memory.
 			 *
@@ -742,27 +765,29 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 
 			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
 			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
-		} else if (error > 0) {
-			/*
-			 * Resume guest execution to handle fault.
-			 */
-			goto done;
 		} else {
 			/*
 			 * Case (4): read from and write to mmio.
+			 *
+			 * Commit to the MMIO read/write (with potential
+			 * side-effects) only after we are sure that the
+			 * instruction is not going to be restarted due
+			 * to address translation faults.
 			 */
 			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
-			    PROT_READ, &srcgpa);
-			if (error)
-				goto done;
-			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
-			if (error)
+			    PROT_READ, &srcgpa, &fault);
+			if (error || fault)
 				goto done;
 
 			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
-			   PROT_WRITE, &dstgpa);
+			   PROT_WRITE, &dstgpa, &fault);
+			if (error || fault)
+				goto done;
+
+			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
 			if (error)
 				goto done;
+
 			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
 			if (error)
 				goto done;
@@ -807,10 +832,9 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 			vm_restart_instruction(vm, vcpuid);
 	}
 done:
-	if (error < 0)
-		return (EFAULT);
-	else
-		return (0);
+	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
+	    __func__, error));
+	return (error);
 }
 
 static int
@@ -1030,39 +1054,55 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
 	int error, size;
-	uint64_t op1, op2, rflags, rflags2;
+	uint64_t regop, memop, op1, op2, rflags, rflags2;
 	enum vm_reg_name reg;
 
 	size = vie->opsize;
 	switch (vie->op.op_byte) {
+	case 0x39:
 	case 0x3B:
 		/*
+		 * 39/r		CMP r/m16, r16
+		 * 39/r		CMP r/m32, r32
+		 * REX.W 39/r	CMP r/m64, r64
+		 *
 		 * 3B/r		CMP r16, r/m16
 		 * 3B/r		CMP r32, r/m32
 		 * REX.W + 3B/r	CMP r64, r/m64
 		 *
-		 * Compare first operand (reg) with second operand (r/m) and
+		 * Compare the first operand with the second operand and
 		 * set status flags in EFLAGS register. The comparison is
 		 * performed by subtracting the second operand from the first
 		 * operand and then setting the status flags.
 		 */
 
-		/* Get the first operand */
+		/* Get the register operand */
 		reg = gpr_map[vie->reg];
-		error = vie_read_register(vm, vcpuid, reg, &op1);
+		error = vie_read_register(vm, vcpuid, reg, &regop);
 		if (error)
 			return (error);
 
-		/* Get the second operand */
-		error = memread(vm, vcpuid, gpa, &op2, size, arg);
+		/* Get the memory operand */
+		error = memread(vm, vcpuid, gpa, &memop, size, arg);
 		if (error)
 			return (error);
 
+		if (vie->op.op_byte == 0x3B) {
+			op1 = regop;
+			op2 = memop;
+		} else {
+			op1 = memop;
+			op2 = regop;
+		}
 		rflags2 = getcc(size, op1, op2);
 		break;
+	case 0x80:
 	case 0x81:
 	case 0x83:
 		/*
+		 * 80 /7		cmp r/m8, imm8
+		 * REX + 80 /7		cmp r/m8, imm8
+		 *
 		 * 81 /7		cmp r/m16, imm16
 		 * 81 /7		cmp r/m32, imm32
 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
@@ -1078,6 +1118,8 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 		 * the status flags.
 		 *
 		 */
+		if (vie->op.op_byte == 0x80)
+			size = 1;
 
 		/* get the first operand */
                 error = memread(vm, vcpuid, gpa, &op1, size, arg);
@@ -1167,7 +1209,7 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
 #endif
 	struct seg_desc ss_desc;
 	uint64_t cr0, rflags, rsp, stack_gla, val;
-	int error, size, stackaddrsize, pushop;
+	int error, fault, size, stackaddrsize, pushop;
 
 	val = 0;
 	size = vie->opsize;
@@ -1233,18 +1275,10 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
 	}
 
 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
-	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo));
-	if (error == -1) {
-		/*
-		 * XXX cannot return a negative error value here because it
-		 * ends up being the return value of the VM_RUN() ioctl and
-		 * is interpreted as a pseudo-error (for e.g. ERESTART).
-		 */
-		return (EFAULT);
-	} else if (error == 1) {
-		/* Resume guest execution to handle page fault */
-		return (0);
-	}
+	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
+	    &fault);
+	if (error || fault)
+		return (error);
 
 	if (pushop) {
 		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
@@ -1335,6 +1369,48 @@ emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	return (error);
 }
 
+static int
+emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+    mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
+{
+	uint64_t val, rflags;
+	int error, bitmask, bitoff;
+
+	/*
+	 * 0F BA is a Group 8 extended opcode.
+	 *
+	 * Currently we only emulate the 'Bit Test' instruction which is
+	 * identified by a ModR/M:reg encoding of 100b.
+	 */
+	if ((vie->reg & 7) != 4)
+		return (EINVAL);
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+	error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
+	if (error)
+		return (error);
+
+	/*
+	 * Intel SDM, Vol 2, Table 3-2:
+	 * "Range of Bit Positions Specified by Bit Offset Operands"
+	 */
+	bitmask = vie->opsize * 8 - 1;
+	bitoff = vie->immediate & bitmask;
+
+	/* Copy the bit into the Carry flag in %rflags */
+	if (val & (1UL << bitoff))
+		rflags |= PSL_C;
+	else
+		rflags &= ~PSL_C;
+
+	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
+	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
+
+	return (0);
+}
+
 int
 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
     struct vm_guest_paging *paging, mem_region_read_t memread,
@@ -1391,6 +1467,10 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 		error = emulate_sub(vm, vcpuid, gpa, vie,
 				    memread, memwrite, memarg);
 		break;
+	case VIE_OP_TYPE_BITTEST:
+		error = emulate_bittest(vm, vcpuid, gpa, vie,
+		    memread, memwrite, memarg);
+		break;
 	default:
 		error = EINVAL;
 		break;
@@ -1608,7 +1688,7 @@ ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
 
 int
 vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
-    uint64_t gla, int prot, uint64_t *gpa)
+    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
 {
 	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
 	u_int retries;
@@ -1616,6 +1696,8 @@ vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
 	uint32_t *ptpbase32, pte32;
 	void *cookie;
 
+	*guest_fault = 0;
+
 	usermode = (paging->cpl == 3 ? 1 : 0);
 	writable = prot & VM_PROT_WRITE;
 	cookie = NULL;
@@ -1778,18 +1860,20 @@ restart:
 	*gpa = pte | (gla & (pgsize - 1));
 done:
 	ptp_release(&cookie);
+	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
+	    __func__, retval));
 	return (retval);
 error:
-	retval = -1;
+	retval = EFAULT;
 	goto done;
 fault:
-	retval = 1;
+	*guest_fault = 1;
 	goto done;
 }
 
 int
 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
-    uint64_t rip, int inst_length, struct vie *vie)
+    uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
 {
 	struct vm_copyinfo copyinfo[2];
 	int error, prot;
@@ -1799,13 +1883,14 @@ vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
 
 	prot = PROT_READ | PROT_EXEC;
 	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
-	    copyinfo, nitems(copyinfo));
-	if (error == 0) {
-		vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
-		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
-		vie->num_valid = inst_length;
-	}
-	return (error);
+	    copyinfo, nitems(copyinfo), faultptr);
+	if (error || *faultptr)
+		return (error);
+
+	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
+	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+	vie->num_valid = inst_length;
+	return (0);
 }
 
 static int
@@ -2230,19 +2315,6 @@ decode_moffset(struct vie *vie)
 }
 
 /*
- * Verify that all the bytes in the instruction buffer were consumed.
- */
-static int
-verify_inst_length(struct vie *vie)
-{
-
-	if (vie->num_processed)
-		return (0);
-	else
-		return (-1);
-}
-
-/*
  * Verify that the 'guest linear address' provided as collateral of the nested
  * page table fault matches with our instruction decoding.
  */
@@ -2270,7 +2342,7 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
 		 * instruction
 		 */
 		if (vie->base_register == VM_REG_GUEST_RIP)
-			base += vie->num_valid;
+			base += vie->num_processed;
 	}
 
 	idx = 0;
@@ -2323,9 +2395,6 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
 	if (decode_moffset(vie))
 		return (-1);
 
-	if (verify_inst_length(vie))
-		return (-1);
-
 	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
 		if (verify_gla(vm, cpuid, gla, vie))
 			return (-1);
diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c
index fc68a61..63044e8 100644
--- a/sys/amd64/vmm/vmm_ioport.c
+++ b/sys/amd64/vmm/vmm_ioport.c
@@ -28,16 +28,10 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
-#include <sys/types.h>
-#include <sys/queue.h>
-#include <sys/cpuset.h>
 #include <sys/systm.h>
 
-#include <vm/vm.h>
-
 #include <machine/vmm.h>
 #include <machine/vmm_instruction_emul.h>
-#include <x86/psl.h>
 
 #include "vatpic.h"
 #include "vatpit.h"
diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c
index 9ecf9af..7e2f64d 100644
--- a/sys/amd64/vmm/vmm_stat.c
+++ b/sys/amd64/vmm/vmm_stat.c
@@ -33,7 +33,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
-#include <sys/smp.h>
 
 #include <machine/vmm.h>
 #include "vmm_util.h"
@@ -165,6 +164,7 @@ VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault");
 VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation");
 VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason");
 VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit");
+VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit");
 VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace");
 VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit");
 VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions");
diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h
index 1640ba3..c695840 100644
--- a/sys/amd64/vmm/vmm_stat.h
+++ b/sys/amd64/vmm/vmm_stat.h
@@ -157,4 +157,5 @@ VMM_STAT_DECLARE(VMEXIT_ASTPENDING);
 VMM_STAT_DECLARE(VMEXIT_USERSPACE);
 VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS);
 VMM_STAT_DECLARE(VMEXIT_EXCEPTION);
+VMM_STAT_DECLARE(VMEXIT_REQIDLE);
 #endif
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
index c37d21c..525e1d9 100644
--- a/sys/amd64/vmm/x86.c
+++ b/sys/amd64/vmm/x86.c
@@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/pcpu.h>
 #include <sys/systm.h>
-#include <sys/cpuset.h>
 #include <sys/sysctl.h>
 
 #include <machine/clock.h>
@@ -231,10 +230,11 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
 			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
 
 			/*
-			 * Don't expose VMX, SpeedStep or TME capability.
+			 * Don't expose VMX, SpeedStep, TME or SMX capability.
 			 * Advertise x2APIC capability and Hypervisor guest.
 			 */
 			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
+			regs[2] &= ~(CPUID2_SMX);
 
 			regs[2] |= CPUID2_HV;
 
@@ -286,18 +286,20 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
 			 * Hide thermal monitoring
 			 */
 			regs[3] &= ~(CPUID_ACPI | CPUID_TM);
-			
+
 			/*
-			 * Machine check handling is done in the host.
-			 * Hide MTRR capability.
+			 * Hide the debug store capability.
 			 */
-			regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
-
-                        /*
-                        * Hide the debug store capability.
-                        */
 			regs[3] &= ~CPUID_DS;
 
+			/*
+			 * Advertise the Machine Check and MTRR capability.
+			 *
+			 * Some guest OSes (e.g. Windows) will not boot if
+			 * these features are absent.
+			 */
+			regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);
+
 			logical_cpus = threads_per_core * cores_per_package;
 			regs[1] &= ~CPUID_HTT_CORES;
 			regs[1] |= (logical_cpus & 0xff) << 16;
@@ -486,3 +488,34 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
 
 	return (1);
 }
+
+bool
+vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap)
+{
+	bool rv;
+
+	KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d",
+	    __func__, cap));
+
+	/*
+	 * Simply passthrough the capabilities of the host cpu for now.
+	 */
+	rv = false;
+	switch (cap) {
+	case VCC_NO_EXECUTE:
+		if (amd_feature & AMDID_NX)
+			rv = true;
+		break;
+	case VCC_FFXSR:
+		if (amd_feature & AMDID_FFXSR)
+			rv = true;
+		break;
+	case VCC_TCE:
+		if (amd_feature2 & AMDID2_TCE)
+			rv = true;
+		break;
+	default:
+		panic("%s: unknown vm_cpu_capability %d", __func__, cap);
+	}
+	return (rv);
+}
diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h
index 8401c15..6f99d52 100644
--- a/sys/amd64/vmm/x86.h
+++ b/sys/amd64/vmm/x86.h
@@ -62,4 +62,17 @@
 int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx,
 		      uint32_t *ecx, uint32_t *edx);
 
+enum vm_cpuid_capability {
+	VCC_NONE,
+	VCC_NO_EXECUTE,
+	VCC_FFXSR,
+	VCC_TCE,
+	VCC_LAST
+};
+
+/*
+ * Return 'true' if the capability 'cap' is enabled in this virtual cpu
+ * and 'false' otherwise.
+ */
+bool vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability);
 #endif
-- 
cgit v1.1