author    neel <neel@FreeBSD.org>  2014-12-28 21:27:13 +0000
committer neel <neel@FreeBSD.org>  2014-12-28 21:27:13 +0000
commit    88c1adb41738babfd568dce3befb4b0b1b9fd799 (patch)
tree      3935974ca9583376cc712bb4162a13e099846c96
parent    585f5c8ddaef5e9b9ba675ab11a5e8481aa0c425 (diff)
MFC r270326
Fix a recursive lock acquisition in vi_reset_dev().

MFC r270434
Return the spurious interrupt vector (IRQ7 or IRQ15) if the atpic cannot
find any unmasked pin with an interrupt asserted.

MFC r270436
Fix a bug in the emulation of CPUID leaf 0x4.

MFC r270437
Add "hw.vmm.topology.threads_per_core" and "hw.vmm.topology.cores_per_package"
tunables to modify the default cpu topology advertised by bhyve.

MFC r270855
Set the 'inst_length' to '0' early on before any error conditions are
detected in the emulation of the task switch. If any exceptions are
triggered then the guest %rip should point to the instruction that caused
the task switch as opposed to the one after it.

MFC r270857
The "SUB" instruction used in getcc() actually does 'x -= y' so use the
proper constraint for 'x'. The "+r" constraint indicates that 'x' is an
input and output register operand. While here, generate code for the
different variants of getcc() using a macro GETCC(sz) where 'sz' indicates
the operand size. Update the status bits in %rflags when emulating the AND
and OR opcodes.

MFC r271439
Initialize 'bc_rdonly' to the right value.

MFC r271451
Optimize the common case of injecting an interrupt into a vcpu after a HLT
by explicitly moving it out of the interrupt shadow.

MFC r271888
Restructure the MSR handling so it is entirely handled by
processor-specific code.

MFC r271890
MSR_KGSBASE is no longer saved and restored from the guest MSR save area.
This behavior was changed in r271888 so update the comment block to
reflect this.

MFC r271891
Add some more KTR events to help debugging.

MFC r272197
mmap(2) requires either MAP_PRIVATE or MAP_SHARED for non-anonymous
mappings.

MFC r272395
Get rid of code that dealt with the hardware not being able to
save/restore the PAT MSR on guest exit/entry. This workaround was done for
a beta release of VMware Fusion 5 but is no longer needed in later
versions. All Intel CPUs since Nehalem have supported saving and restoring
MSR_PAT in the VM exit and entry controls.

MFC r272670
Inject #UD into the guest when it executes either 'MONITOR' or 'MWAIT'.

MFC r272710
Implement the FLUSH operation in the virtio-block emulation.

MFC r272838
iasl(8) expects integer fields in data tables to be specified as
hexadecimal values. Therefore the bit width of the "PM Timer Block" was
actually being interpreted as 50 bits instead of the expected 32. This
eliminates an error message emitted by a Linux 3.17 guest during boot:
"Invalid length for FADT/PmTimerBlock: 50, using default 32"

MFC r272839
Support Intel-specific MSRs that are accessed when booting up Linux in
bhyve:
- MSR_PLATFORM_INFO
- MSR_TURBO_RATIO_LIMITx
- MSR_RAPL_POWER_UNIT

MFC r273108
Emulate "POP r/m". This is needed to boot the OpenBSD/i386 MP kernel in
bhyve.

MFC r273212
Support stopping and restarting the AHCI command list by toggling
PxCMD.ST from '1' to '0' and back. This allows the driver a chance to
recover if, for instance, a timeout occurred due to activity on the host.
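The getcc() constraint fix from r270857 (visible in the
vmm_instruction_emul.c hunk below) can be exercised in isolation. The
following is a minimal userland sketch, not bhyve code: it assumes an
amd64 host with a gcc/clang-style inline assembler, and the main() harness
is purely illustrative.

/*
 * Sketch of the r270857 fix: "sub %2,%1" computes 'x -= y', so 'x' must
 * be declared as an input/output operand ("+r").  With the old "=r"/"r"
 * pairing the compiler was free to treat 'x' as write-only.
 */
#include <stdint.h>
#include <stdio.h>

static unsigned long
getcc64(uint64_t x, uint64_t y)
{
	unsigned long rflags;

	__asm__ __volatile__("sub %2,%1; pushfq; popq %0" :
	    "=r" (rflags), "+r" (x) : "m" (y));
	return (rflags);
}

int
main(void)
{
	/* Equal operands: expect ZF (bit 6) set in the returned flags. */
	printf("getcc64(5, 5) = %#lx\n", getcc64(5, 5));
	/* 4 - 5 borrows: expect CF (bit 0) and SF (bit 7) set. */
	printf("getcc64(4, 5) = %#lx\n", getcc64(4, 5));
	return (0);
}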
-rw-r--r--sys/amd64/include/vmm.h4
-rw-r--r--sys/amd64/vmm/intel/ept.c1
-rw-r--r--sys/amd64/vmm/intel/vmcs.h5
-rw-r--r--sys/amd64/vmm/intel/vmx.c224
-rw-r--r--sys/amd64/vmm/intel/vmx.h15
-rw-r--r--sys/amd64/vmm/intel/vmx_msr.c215
-rw-r--r--sys/amd64/vmm/intel/vmx_msr.h15
-rw-r--r--sys/amd64/vmm/io/vatpic.c10
-rw-r--r--sys/amd64/vmm/io/vlapic.c1
-rw-r--r--sys/amd64/vmm/vmm.c43
-rw-r--r--sys/amd64/vmm/vmm_instruction_emul.c205
-rw-r--r--sys/amd64/vmm/vmm_msr.c273
-rw-r--r--sys/amd64/vmm/vmm_msr.h44
-rw-r--r--sys/amd64/vmm/x86.c103
-rw-r--r--sys/modules/vmm/Makefile1
-rw-r--r--sys/x86/include/specialreg.h8
-rw-r--r--usr.sbin/bhyve/acpi.c4
-rw-r--r--usr.sbin/bhyve/bhyverun.c6
-rw-r--r--usr.sbin/bhyve/block_if.c28
-rw-r--r--usr.sbin/bhyve/pci_ahci.c146
-rw-r--r--usr.sbin/bhyve/pci_virtio_block.c6
-rw-r--r--usr.sbin/bhyve/task_switch.c18
-rw-r--r--usr.sbin/bhyve/virtio.c8
-rw-r--r--usr.sbin/bhyve/xmsr.c78
-rw-r--r--usr.sbin/bhyve/xmsr.h1
-rw-r--r--usr.sbin/bhyvectl/bhyvectl.c2
26 files changed, 896 insertions, 568 deletions
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 58af2a5..0879ba2 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -82,6 +82,7 @@ enum vm_reg_name {
VM_REG_GUEST_PDPTE1,
VM_REG_GUEST_PDPTE2,
VM_REG_GUEST_PDPTE3,
+ VM_REG_GUEST_INTR_SHADOW,
VM_REG_LAST
};
@@ -194,7 +195,6 @@ void vm_nmi_clear(struct vm *vm, int vcpuid);
int vm_inject_extint(struct vm *vm, int vcpu);
int vm_extint_pending(struct vm *vm, int vcpuid);
void vm_extint_clear(struct vm *vm, int vcpuid);
-uint64_t *vm_guest_msrs(struct vm *vm, int cpu);
struct vlapic *vm_lapic(struct vm *vm, int cpu);
struct vioapic *vm_ioapic(struct vm *vm);
struct vhpet *vm_hpet(struct vm *vm);
@@ -485,6 +485,8 @@ enum vm_exitcode {
VM_EXITCODE_SUSPENDED,
VM_EXITCODE_INOUT_STR,
VM_EXITCODE_TASK_SWITCH,
+ VM_EXITCODE_MONITOR,
+ VM_EXITCODE_MWAIT,
VM_EXITCODE_MAX
};
diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c
index 5f6c4d0..13c9788 100644
--- a/sys/amd64/vmm/intel/ept.c
+++ b/sys/amd64/vmm/intel/ept.c
@@ -44,7 +44,6 @@ __FBSDID("$FreeBSD$");
#include "vmx_cpufunc.h"
#include "vmm_ipi.h"
-#include "vmx_msr.h"
#include "ept.h"
#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0))
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index 4e9557c..6122de5 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -54,6 +54,10 @@ int vmcs_getdesc(struct vmcs *vmcs, int running, int ident,
int vmcs_setdesc(struct vmcs *vmcs, int running, int ident,
struct seg_desc *desc);
+/*
+ * Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h
+ */
+#ifdef _VMX_CPUFUNC_H_
static __inline uint64_t
vmcs_read(uint32_t encoding)
{
@@ -73,6 +77,7 @@ vmcs_write(uint32_t encoding, uint64_t val)
error = vmwrite(encoding, val);
KASSERT(error == 0, ("vmcs_write(%u) error %d", encoding, error));
}
+#endif /* _VMX_CPUFUNC_H_ */
#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH)
#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP)
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index b2c5702..2fe5a27 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -52,20 +52,20 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
+#include "vmm_lapic.h"
#include "vmm_host.h"
#include "vmm_ioport.h"
#include "vmm_ipi.h"
-#include "vmm_msr.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"
#include "vatpic.h"
#include "vlapic.h"
#include "vlapic_priv.h"
-#include "vmx_msr.h"
#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
+#include "vmx_msr.h"
#include "x86.h"
#include "vmx_controls.h"
@@ -81,6 +81,8 @@ __FBSDID("$FreeBSD$");
#define PROCBASED_CTLS_ONE_SETTING \
(PROCBASED_SECONDARY_CONTROLS | \
+ PROCBASED_MWAIT_EXITING | \
+ PROCBASED_MONITOR_EXITING | \
PROCBASED_IO_EXITING | \
PROCBASED_MSR_BITMAPS | \
PROCBASED_CTLS_WINDOW_SETTING | \
@@ -94,34 +96,23 @@ __FBSDID("$FreeBSD$");
#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT
#define PROCBASED_CTLS2_ZERO_SETTING 0
-#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \
+#define VM_EXIT_CTLS_ONE_SETTING \
(VM_EXIT_HOST_LMA | \
VM_EXIT_SAVE_EFER | \
- VM_EXIT_LOAD_EFER)
-
-#define VM_EXIT_CTLS_ONE_SETTING \
- (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \
+ VM_EXIT_LOAD_EFER | \
VM_EXIT_ACKNOWLEDGE_INTERRUPT | \
VM_EXIT_SAVE_PAT | \
VM_EXIT_LOAD_PAT)
+
#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS
-#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER
+#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER | VM_ENTRY_LOAD_PAT)
-#define VM_ENTRY_CTLS_ONE_SETTING \
- (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \
- VM_ENTRY_LOAD_PAT)
#define VM_ENTRY_CTLS_ZERO_SETTING \
(VM_ENTRY_LOAD_DEBUG_CONTROLS | \
VM_ENTRY_INTO_SMM | \
VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
-#define guest_msr_rw(vmx, msr) \
- msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
-
-#define guest_msr_ro(vmx, msr) \
- msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ)
-
#define HANDLED 1
#define UNHANDLED 0
@@ -158,10 +149,6 @@ SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
*/
static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
-static int vmx_patmsr;
-SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, patmsr, CTLFLAG_RD, &vmx_patmsr, 0,
- "PAT MSR saved and restored in VCMS");
-
static int cap_halt_exit;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
"HLT triggers a VM-exit");
@@ -208,6 +195,7 @@ SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
+static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
static void vmx_inject_pir(struct vlapic *vlapic);
#ifdef KTR
@@ -475,22 +463,6 @@ vpid_init(void)
}
static void
-msr_save_area_init(struct msr_entry *g_area, int *g_count)
-{
- int cnt;
-
- static struct msr_entry guest_msrs[] = {
- { MSR_KGSBASE, 0, 0 },
- };
-
- cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
- if (cnt > GUEST_MSR_MAX_ENTRIES)
- panic("guest msr save area overrun");
- bcopy(guest_msrs, g_area, sizeof(guest_msrs));
- *g_count = cnt;
-}
-
-static void
vmx_disable(void *arg __unused)
{
struct invvpid_desc invvpid_desc = { 0 };
@@ -636,49 +608,24 @@ vmx_init(int ipinum)
}
/* Check support for VM-exit controls */
- vmx_patmsr = 1;
error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
VM_EXIT_CTLS_ONE_SETTING,
VM_EXIT_CTLS_ZERO_SETTING,
&exit_ctls);
if (error) {
- /* Try again without the PAT MSR bits */
- error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
- MSR_VMX_TRUE_EXIT_CTLS,
- VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
- VM_EXIT_CTLS_ZERO_SETTING,
- &exit_ctls);
- if (error) {
- printf("vmx_init: processor does not support desired "
- "exit controls\n");
- return (error);
- } else {
- if (bootverbose)
- printf("vmm: PAT MSR access not supported\n");
- guest_msr_valid(MSR_PAT);
- vmx_patmsr = 0;
- }
+ printf("vmx_init: processor does not support desired "
+ "exit controls\n");
+ return (error);
}
/* Check support for VM-entry controls */
- if (vmx_patmsr) {
- error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
- MSR_VMX_TRUE_ENTRY_CTLS,
- VM_ENTRY_CTLS_ONE_SETTING,
- VM_ENTRY_CTLS_ZERO_SETTING,
- &entry_ctls);
- } else {
- error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
- MSR_VMX_TRUE_ENTRY_CTLS,
- VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
- VM_ENTRY_CTLS_ZERO_SETTING,
- &entry_ctls);
- }
-
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
if (error) {
printf("vmx_init: processor does not support desired "
- "entry controls\n");
- return (error);
+ "entry controls\n");
+ return (error);
}
/*
@@ -800,6 +747,8 @@ vmx_init(int ipinum)
vpid_init();
+ vmx_msr_init();
+
/* enable VMX operation */
smp_rendezvous(NULL, vmx_enable, NULL, NULL);
@@ -869,7 +818,7 @@ static void *
vmx_vminit(struct vm *vm, pmap_t pmap)
{
uint16_t vpid[VM_MAXCPU];
- int i, error, guest_msr_count;
+ int i, error;
struct vmx *vmx;
struct vmcs *vmcs;
@@ -905,16 +854,14 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
* how they are saved/restored so can be directly accessed by the
* guest.
*
- * Guest KGSBASE is saved and restored in the guest MSR save area.
- * Host KGSBASE is restored before returning to userland from the pcb.
- * There will be a window of time when we are executing in the host
- * kernel context with a value of KGSBASE from the guest. This is ok
- * because the value of KGSBASE is inconsequential in kernel context.
- *
* MSR_EFER is saved and restored in the guest VMCS area on a
* VM exit and entry respectively. It is also restored from the
* host VMCS area on a VM exit.
*
+ * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
+ * and entry respectively. It is also restored from the host VMCS
+ * area on a VM exit.
+ *
* The TSC MSR is exposed read-only. Writes are disallowed as that
* will impact the host TSC.
* XXX Writes would be implemented with a wrmsr trap, and
@@ -925,21 +872,11 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
- guest_msr_rw(vmx, MSR_KGSBASE) ||
guest_msr_rw(vmx, MSR_EFER) ||
+ guest_msr_rw(vmx, MSR_PAT) ||
guest_msr_ro(vmx, MSR_TSC))
panic("vmx_vminit: error setting guest msr access");
- /*
- * MSR_PAT is saved and restored in the guest VMCS are on a VM exit
- * and entry respectively. It is also restored from the host VMCS
- * area on a VM exit. However, if running on a system with no
- * MSR_PAT save/restore support, leave access disabled so accesses
- * will be trapped.
- */
- if (vmx_patmsr && guest_msr_rw(vmx, MSR_PAT))
- panic("vmx_vminit: error setting guest pat msr access");
-
vpid_alloc(vpid, VM_MAXCPU);
if (virtual_interrupt_delivery) {
@@ -958,6 +895,8 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
error, i);
}
+ vmx_msr_guest_init(vmx, i);
+
error = vmcs_init(vmcs);
KASSERT(error == 0, ("vmcs_init error %d", error));
@@ -996,13 +935,6 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
vmx->state[i].lastcpu = NOCPU;
vmx->state[i].vpid = vpid[i];
- msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
-
- error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
- guest_msr_count);
- if (error != 0)
- panic("vmcs_set_msr_save error %d", error);
-
/*
* Set up the CR0/4 shadows, and init the read shadow
* to the power-on register value from the Intel Sys Arch.
@@ -2078,6 +2010,46 @@ vmx_task_switch_reason(uint64_t qual)
}
static int
+emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
+{
+ int error;
+
+ if (lapic_msr(num))
+ error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu);
+ else
+ error = vmx_wrmsr(vmx, vcpuid, num, val, retu);
+
+ return (error);
+}
+
+static int
+emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu)
+{
+ struct vmxctx *vmxctx;
+ uint64_t result;
+ uint32_t eax, edx;
+ int error;
+
+ if (lapic_msr(num))
+ error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu);
+ else
+ error = vmx_rdmsr(vmx, vcpuid, num, &result, retu);
+
+ if (error == 0) {
+ eax = result;
+ vmxctx = &vmx->ctx[vcpuid];
+ error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax);
+ KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error));
+
+ edx = result >> 32;
+ error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx);
+ KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error));
+ }
+
+ return (error);
+}
+
+static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
int error, handled, in;
@@ -2215,7 +2187,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
retu = false;
ecx = vmxctx->guest_rcx;
VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
- error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
+ error = emulate_rdmsr(vmx, vcpu, ecx, &retu);
if (error) {
vmexit->exitcode = VM_EXITCODE_RDMSR;
vmexit->u.msr.code = ecx;
@@ -2224,7 +2196,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
} else {
/* Return to userspace with a valid exitcode */
KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
- ("emulate_wrmsr retu with bogus exitcode"));
+ ("emulate_rdmsr retu with bogus exitcode"));
}
break;
case EXIT_REASON_WRMSR:
@@ -2235,7 +2207,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
edx = vmxctx->guest_rdx;
VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
ecx, (uint64_t)edx << 32 | eax);
- error = emulate_wrmsr(vmx->vm, vcpu, ecx,
+ error = emulate_wrmsr(vmx, vcpu, ecx,
(uint64_t)edx << 32 | eax, &retu);
if (error) {
vmexit->exitcode = VM_EXITCODE_WRMSR;
@@ -2403,6 +2375,12 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
case EXIT_REASON_XSETBV:
handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
break;
+ case EXIT_REASON_MONITOR:
+ vmexit->exitcode = VM_EXITCODE_MONITOR;
+ break;
+ case EXIT_REASON_MWAIT:
+ vmexit->exitcode = VM_EXITCODE_MWAIT;
+ break;
default:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
break;
@@ -2523,6 +2501,8 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
KASSERT(vmxctx->pmap == pmap,
("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
+ vmx_msr_guest_enter(vmx, vcpu);
+
VMPTRLD(vmcs);
/*
@@ -2624,6 +2604,8 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
vmexit->exitcode);
VMCLEAR(vmcs);
+ vmx_msr_guest_exit(vmx, vcpu);
+
return (0);
}
@@ -2712,6 +2694,46 @@ vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
}
static int
+vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval)
+{
+ uint64_t gi;
+ int error;
+
+ error = vmcs_getreg(&vmx->vmcs[vcpu], running,
+ VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);
+ *retval = (gi & HWINTR_BLOCKING) ? 1 : 0;
+ return (error);
+}
+
+static int
+vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val)
+{
+ struct vmcs *vmcs;
+ uint64_t gi;
+ int error, ident;
+
+ /*
+ * Forcing the vcpu into an interrupt shadow is not supported.
+ */
+ if (val) {
+ error = EINVAL;
+ goto done;
+ }
+
+ vmcs = &vmx->vmcs[vcpu];
+ ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY);
+ error = vmcs_getreg(vmcs, running, ident, &gi);
+ if (error == 0) {
+ gi &= ~HWINTR_BLOCKING;
+ error = vmcs_setreg(vmcs, running, ident, gi);
+ }
+done:
+ VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val,
+ error ? "failed" : "succeeded");
+ return (error);
+}
+
+static int
vmx_shadow_reg(int reg)
{
int shreg;
@@ -2742,6 +2764,9 @@ vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
if (running && hostcpu != curcpu)
panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
+ if (reg == VM_REG_GUEST_INTR_SHADOW)
+ return (vmx_get_intr_shadow(vmx, vcpu, running, retval));
+
if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
return (0);
@@ -2760,6 +2785,9 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
if (running && hostcpu != curcpu)
panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
+ if (reg == VM_REG_GUEST_INTR_SHADOW)
+ return (vmx_modify_intr_shadow(vmx, vcpu, running, val));
+
if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
return (0);
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
index 208fcee..2124554 100644
--- a/sys/amd64/vmm/intel/vmx.h
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -33,8 +33,6 @@
struct pmap;
-#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
-
struct vmxctx {
register_t guest_rdi; /* Guest state */
register_t guest_rsi;
@@ -97,13 +95,23 @@ struct pir_desc {
} __aligned(64);
CTASSERT(sizeof(struct pir_desc) == 64);
+/* Index into the 'guest_msrs[]' array */
+enum {
+ IDX_MSR_LSTAR,
+ IDX_MSR_CSTAR,
+ IDX_MSR_STAR,
+ IDX_MSR_SF_MASK,
+ IDX_MSR_KGSBASE,
+ GUEST_MSR_NUM /* must be the last enumeration */
+};
+
/* virtual machine softc */
struct vmx {
struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */
char msr_bitmap[PAGE_SIZE];
struct pir_desc pir_desc[VM_MAXCPU];
- struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
+ uint64_t guest_msrs[VM_MAXCPU][GUEST_MSR_NUM];
struct vmxctx ctx[VM_MAXCPU];
struct vmxcap cap[VM_MAXCPU];
struct vmxstate state[VM_MAXCPU];
@@ -113,7 +121,6 @@ struct vmx {
};
CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
-CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0);
#define VMX_GUEST_VMEXIT 0
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
index a3428db..746ca73 100644
--- a/sys/amd64/vmm/intel/vmx_msr.c
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -31,10 +31,15 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/cpuset.h>
+#include <machine/clock.h>
#include <machine/cpufunc.h>
+#include <machine/md_var.h>
#include <machine/specialreg.h>
+#include <machine/vmm.h>
+#include "vmx.h"
#include "vmx_msr.h"
static boolean_t
@@ -171,3 +176,213 @@ msr_bitmap_change_access(char *bitmap, u_int msr, int access)
return (0);
}
+
+static uint64_t misc_enable;
+static uint64_t platform_info;
+static uint64_t turbo_ratio_limit;
+static uint64_t host_msrs[GUEST_MSR_NUM];
+
+static bool
+nehalem_cpu(void)
+{
+ u_int family, model;
+
+ /*
+ * The family:model numbers belonging to the Nehalem microarchitecture
+ * are documented in Section 35.5, Intel SDM dated Feb 2014.
+ */
+ family = CPUID_TO_FAMILY(cpu_id);
+ model = CPUID_TO_MODEL(cpu_id);
+ if (family == 0x6) {
+ switch (model) {
+ case 0x1A:
+ case 0x1E:
+ case 0x1F:
+ case 0x2E:
+ return (true);
+ default:
+ break;
+ }
+ }
+ return (false);
+}
+
+static bool
+westmere_cpu(void)
+{
+ u_int family, model;
+
+ /*
+ * The family:model numbers belonging to the Westmere microarchitecture
+ * are documented in Section 35.6, Intel SDM dated Feb 2014.
+ */
+ family = CPUID_TO_FAMILY(cpu_id);
+ model = CPUID_TO_MODEL(cpu_id);
+ if (family == 0x6) {
+ switch (model) {
+ case 0x25:
+ case 0x2C:
+ return (true);
+ default:
+ break;
+ }
+ }
+ return (false);
+}
+
+void
+vmx_msr_init(void)
+{
+ uint64_t bus_freq, ratio;
+ int i;
+
+ /*
+ * It is safe to cache the values of the following MSRs because
+ * they don't change based on curcpu, curproc or curthread.
+ */
+ host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
+ host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
+ host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
+ host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
+
+ /*
+ * Initialize emulated MSRs
+ */
+ misc_enable = rdmsr(MSR_IA32_MISC_ENABLE);
+ /*
+ * Set mandatory bits
+ * 11: branch trace disabled
+ * 12: PEBS unavailable
+ * Clear unsupported features
+ * 16: SpeedStep enable
+ * 18: enable MONITOR FSM
+ */
+ misc_enable |= (1 << 12) | (1 << 11);
+ misc_enable &= ~((1 << 18) | (1 << 16));
+
+ if (nehalem_cpu() || westmere_cpu())
+ bus_freq = 133330000; /* 133Mhz */
+ else
+ bus_freq = 100000000; /* 100Mhz */
+
+ /*
+ * XXXtime
+ * The ratio should really be based on the virtual TSC frequency as
+ * opposed to the host TSC.
+ */
+ ratio = (tsc_freq / bus_freq) & 0xff;
+
+ /*
+ * The register definition is based on the micro-architecture
+ * but the following bits are always the same:
+ * [15:8] Maximum Non-Turbo Ratio
+ * [28] Programmable Ratio Limit for Turbo Mode
+ * [29] Programmable TDC-TDP Limit for Turbo Mode
+ * [47:40] Maximum Efficiency Ratio
+ *
+ * The other bits can be safely set to 0 on all
+ * micro-architectures up to Haswell.
+ */
+ platform_info = (ratio << 8) | (ratio << 40);
+
+ /*
+ * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is
+ * dependent on the maximum cores per package supported by the micro-
+ * architecture. For e.g., Westmere supports 6 cores per package and
+ * uses the low 48 bits. Sandybridge support 8 cores per package and
+ * uses up all 64 bits.
+ *
+ * However, the unused bits are reserved so we pretend that all bits
+ * in this MSR are valid.
+ */
+ for (i = 0; i < 8; i++)
+ turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;
+}
+
+void
+vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
+{
+ /*
+ * The permissions bitmap is shared between all vcpus so initialize it
+ * once when initializing the vBSP.
+ */
+ if (vcpuid == 0) {
+ guest_msr_rw(vmx, MSR_LSTAR);
+ guest_msr_rw(vmx, MSR_CSTAR);
+ guest_msr_rw(vmx, MSR_STAR);
+ guest_msr_rw(vmx, MSR_SF_MASK);
+ guest_msr_rw(vmx, MSR_KGSBASE);
+ }
+ return;
+}
+
+void
+vmx_msr_guest_enter(struct vmx *vmx, int vcpuid)
+{
+ uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
+
+ /* Save host MSRs (if any) and restore guest MSRs */
+ wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]);
+ wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]);
+ wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]);
+ wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]);
+ wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]);
+}
+
+void
+vmx_msr_guest_exit(struct vmx *vmx, int vcpuid)
+{
+ uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
+
+ /* Save guest MSRs */
+ guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
+ guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
+ guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
+ guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
+ guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE);
+
+ /* Restore host MSRs */
+ wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
+ wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
+ wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
+ wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
+
+ /* MSR_KGSBASE will be restored on the way back to userspace */
+}
+
+int
+vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
+{
+ int error = 0;
+
+ switch (num) {
+ case MSR_IA32_MISC_ENABLE:
+ *val = misc_enable;
+ break;
+ case MSR_PLATFORM_INFO:
+ *val = platform_info;
+ break;
+ case MSR_TURBO_RATIO_LIMIT:
+ case MSR_TURBO_RATIO_LIMIT1:
+ *val = turbo_ratio_limit;
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+int
+vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
+{
+ int error = 0;
+
+ switch (num) {
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
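For concreteness, the MSR_PLATFORM_INFO and MSR_TURBO_RATIO_LIMITx values
synthesized by vmx_msr_init() above can be reproduced with ordinary
integer arithmetic. A standalone sketch follows; the 2.4 GHz tsc_freq is
an illustrative stand-in for the kernel's tsc_freq global.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t tsc_freq = 2400000000;	/* illustrative 2.4 GHz host */
	uint64_t bus_freq = 100000000;	/* 100 MHz, non-Nehalem/Westmere */
	uint64_t ratio, platform_info, turbo_ratio_limit = 0;
	int i;

	/* Maximum non-turbo ratio, as computed in vmx_msr_init(). */
	ratio = (tsc_freq / bus_freq) & 0xff;

	/* Bits [15:8] and [47:40] carry the same ratio. */
	platform_info = (ratio << 8) | (ratio << 40);

	/* Advertise the same turbo limit in every byte of the MSR. */
	for (i = 0; i < 8; i++)
		turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;

	printf("PLATFORM_INFO     = %#018jx\n", (uintmax_t)platform_info);
	printf("TURBO_RATIO_LIMIT = %#018jx\n",
	    (uintmax_t)turbo_ratio_limit);
	return (0);
}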
diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h
index 340b0f7..e77881c 100644
--- a/sys/amd64/vmm/intel/vmx_msr.h
+++ b/sys/amd64/vmm/intel/vmx_msr.h
@@ -29,6 +29,15 @@
#ifndef _VMX_MSR_H_
#define _VMX_MSR_H_
+struct vmx;
+
+void vmx_msr_init(void);
+void vmx_msr_guest_init(struct vmx *vmx, int vcpuid);
+void vmx_msr_guest_enter(struct vmx *vmx, int vcpuid);
+void vmx_msr_guest_exit(struct vmx *vmx, int vcpuid);
+int vmx_rdmsr(struct vmx *, int vcpuid, u_int num, uint64_t *val, bool *retu);
+int vmx_wrmsr(struct vmx *, int vcpuid, u_int num, uint64_t val, bool *retu);
+
uint32_t vmx_revision(void);
int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
@@ -52,4 +61,10 @@ int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
void msr_bitmap_initialize(char *bitmap);
int msr_bitmap_change_access(char *bitmap, u_int msr, int access);
+#define guest_msr_rw(vmx, msr) \
+ msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
+
+#define guest_msr_ro(vmx, msr) \
+ msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ)
+
#endif
diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c
index 15620d5..b710a84 100644
--- a/sys/amd64/vmm/io/vatpic.c
+++ b/sys/amd64/vmm/io/vatpic.c
@@ -500,13 +500,19 @@ vatpic_pending_intr(struct vm *vm, int *vecptr)
VATPIC_LOCK(vatpic);
pin = vatpic_get_highest_irrpin(atpic);
- if (pin == -1)
- pin = 7;
if (pin == 2) {
atpic = &vatpic->atpic[1];
pin = vatpic_get_highest_irrpin(atpic);
}
+ /*
+ * If there are no pins active at this moment then return the spurious
+ * interrupt vector instead.
+ */
+ if (pin == -1)
+ pin = 7;
+
+ KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin));
*vecptr = atpic->irq_base + pin;
VATPIC_UNLOCK(vatpic);
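The reordered logic above resolves the master/slave cascade before falling
back to the spurious pin, so an idle master reports IRQ7 and an idle slave
IRQ15. Below is a hypothetical stand-alone helper mirroring that flow; the
0x20/0x28 vector bases are the conventional 8259 defaults, not values
taken from this code.

#include <stdio.h>

static int
atpic_vector(int master_pin, int slave_pin)
{
	int base = 0x20;		/* typical master vector base */

	if (master_pin == 2) {		/* pin 2 cascades to the slave */
		base = 0x28;		/* typical slave vector base */
		master_pin = slave_pin;
	}
	if (master_pin == -1)		/* no asserted, unmasked pin */
		master_pin = 7;		/* spurious IRQ7 or IRQ15 */
	return (base + master_pin);
}

int
main(void)
{
	printf("idle master -> vector %#x\n", atpic_vector(-1, -1));
	printf("idle slave  -> vector %#x\n", atpic_vector(2, -1));
	return (0);
}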
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index 3c93463..d684dba 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -633,6 +633,7 @@ vlapic_fire_timer(struct vlapic *vlapic)
// The timer LVT always uses the fixed delivery mode.
lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) {
+ VLAPIC_CTR0(vlapic, "vlapic timer fired");
vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1);
}
}
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index fa0200e..ddf875b 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -74,7 +74,6 @@ __FBSDID("$FreeBSD$");
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
-#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"
@@ -105,7 +104,6 @@ struct vcpu {
struct savefpu *guestfpu; /* (a,i) guest fpu state */
uint64_t guest_xcr0; /* (i) guest %xcr0 register */
void *stats; /* (a,i) statistics */
- uint64_t guest_msrs[VMM_MSR_NUM]; /* (i) emulated MSRs */
struct vm_exit exitinfo; /* (x) exit reason and collateral */
};
@@ -188,7 +186,6 @@ static struct vmm_ops *ops;
#define fpu_stop_emulating() clts()
static MALLOC_DEFINE(M_VM, "vm", "vm");
-CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */
/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
@@ -250,7 +247,6 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create)
vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
fpu_save_area_reset(vcpu->guestfpu);
vmm_stat_init(vcpu->stats);
- guest_msrs_init(vm, vcpu_id);
}
struct vm_exit *
@@ -294,7 +290,6 @@ vmm_init(void)
else
return (ENXIO);
- vmm_msr_init();
vmm_resume_p = vmm_resume;
return (VMM_INIT(vmm_ipinum));
@@ -1091,7 +1086,7 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
struct vcpu *vcpu;
const char *wmesg;
- int t, vcpu_halted, vm_halted;
+ int error, t, vcpu_halted, vm_halted;
KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
@@ -1099,6 +1094,22 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
vcpu_halted = 0;
vm_halted = 0;
+ /*
+ * The typical way to halt a cpu is to execute: "sti; hlt"
+ *
+ * STI sets RFLAGS.IF to enable interrupts. However, the processor
+ * remains in an "interrupt shadow" for an additional instruction
+ * following the STI. This guarantees that "sti; hlt" sequence is
+ * atomic and a pending interrupt will be recognized after the HLT.
+ *
+ * After the HLT emulation is done the vcpu is no longer in an
+ * interrupt shadow and a pending interrupt can be injected on
+ * the next entry into the guest.
+ */
+ error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
+ KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
+ __func__, error));
+
vcpu_lock(vcpu);
while (1) {
/*
@@ -1187,8 +1198,12 @@ vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
vme->u.paging.gpa, ftype);
- if (rv == 0)
+ if (rv == 0) {
+ VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
+ ftype == VM_PROT_READ ? "accessed" : "dirty",
+ vme->u.paging.gpa);
goto done;
+ }
}
map = &vm->vmspace->vm_map;
@@ -1229,6 +1244,8 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
paging = &vme->u.inst_emul.paging;
cpu_mode = paging->cpu_mode;
+ VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);
+
vie_init(vie);
/* Fetch, decode and emulate the faulting instruction */
@@ -1425,7 +1442,6 @@ restart:
pcb = PCPU_GET(curpcb);
set_pcb_flags(pcb, PCB_FULL_IRET);
- restore_guest_msrs(vm, vcpuid);
restore_guest_fpustate(vcpu);
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
@@ -1433,7 +1449,6 @@ restart:
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
- restore_host_msrs(vm, vcpuid);
vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
@@ -1467,6 +1482,10 @@ restart:
case VM_EXITCODE_INOUT_STR:
error = vm_handle_inout(vm, vcpuid, vme, &retu);
break;
+ case VM_EXITCODE_MONITOR:
+ case VM_EXITCODE_MWAIT:
+ vm_inject_ud(vm, vcpuid);
+ break;
default:
retu = true; /* handled in userland */
break;
@@ -1875,12 +1894,6 @@ vm_set_capability(struct vm *vm, int vcpu, int type, int val)
return (VMSETCAP(vm->cookie, vcpu, type, val));
}
-uint64_t *
-vm_guest_msrs(struct vm *vm, int cpu)
-{
- return (vm->vcpu[cpu].guest_msrs);
-}
-
struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index 09453a2..c6ba01e 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -69,6 +69,7 @@ enum {
VIE_OP_TYPE_TWO_BYTE,
VIE_OP_TYPE_PUSH,
VIE_OP_TYPE_CMP,
+ VIE_OP_TYPE_POP,
VIE_OP_TYPE_LAST
};
@@ -159,6 +160,11 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_type = VIE_OP_TYPE_OR,
.op_flags = VIE_OP_F_IMM8,
},
+ [0x8F] = {
+ /* XXX Group 1A extended opcode - not just POP */
+ .op_byte = 0x8F,
+ .op_type = VIE_OP_TYPE_POP,
+ },
[0xFF] = {
/* XXX Group 5 extended opcode - not just PUSH */
.op_byte = 0xFF,
@@ -316,46 +322,36 @@ vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
return (error);
}
+#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
+
/*
* Return the status flags that would result from doing (x - y).
*/
-static u_long
-getcc16(uint16_t x, uint16_t y)
-{
- u_long rflags;
-
- __asm __volatile("sub %1,%2; pushfq; popq %0" :
- "=r" (rflags) : "m" (y), "r" (x));
- return (rflags);
-}
-
-static u_long
-getcc32(uint32_t x, uint32_t y)
-{
- u_long rflags;
-
- __asm __volatile("sub %1,%2; pushfq; popq %0" :
- "=r" (rflags) : "m" (y), "r" (x));
- return (rflags);
-}
-
-static u_long
-getcc64(uint64_t x, uint64_t y)
-{
- u_long rflags;
-
- __asm __volatile("sub %1,%2; pushfq; popq %0" :
- "=r" (rflags) : "m" (y), "r" (x));
- return (rflags);
-}
+#define GETCC(sz) \
+static u_long \
+getcc##sz(uint##sz##_t x, uint##sz##_t y) \
+{ \
+ u_long rflags; \
+ \
+ __asm __volatile("sub %2,%1; pushfq; popq %0" : \
+ "=r" (rflags), "+r" (x) : "m" (y)); \
+ return (rflags); \
+} struct __hack
+
+GETCC(8);
+GETCC(16);
+GETCC(32);
+GETCC(64);
static u_long
getcc(int opsize, uint64_t x, uint64_t y)
{
- KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
+ KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
("getcc: invalid operand size %d", opsize));
- if (opsize == 2)
+ if (opsize == 1)
+ return (getcc8(x, y));
+ else if (opsize == 2)
return (getcc16(x, y));
else if (opsize == 4)
return (getcc32(x, y));
@@ -569,7 +565,7 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
{
int error, size;
enum vm_reg_name reg;
- uint64_t val1, val2;
+ uint64_t result, rflags, rflags2, val1, val2;
size = vie->opsize;
error = EINVAL;
@@ -597,8 +593,8 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
break;
/* perform the operation and write the result */
- val1 &= val2;
- error = vie_update_register(vm, vcpuid, reg, val1, size);
+ result = val1 & val2;
+ error = vie_update_register(vm, vcpuid, reg, result, size);
break;
case 0x81:
/*
@@ -625,11 +621,11 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
switch (vie->reg & 7) {
case 0x4:
/* modrm:reg == b100, AND */
- val1 &= vie->immediate;
+ result = val1 & vie->immediate;
break;
case 0x1:
/* modrm:reg == b001, OR */
- val1 |= vie->immediate;
+ result = val1 | vie->immediate;
break;
default:
error = EINVAL;
@@ -638,11 +634,29 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
if (error)
break;
- error = memwrite(vm, vcpuid, gpa, val1, size, arg);
+ error = memwrite(vm, vcpuid, gpa, result, size, arg);
break;
default:
break;
}
+ if (error)
+ return (error);
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ if (error)
+ return (error);
+
+ /*
+ * OF and CF are cleared; the SF, ZF and PF flags are set according
+ * to the result; AF is undefined.
+ *
+ * The updated status flags are obtained by subtracting 0 from 'result'.
+ */
+ rflags2 = getcc(size, result, 0);
+ rflags &= ~RFLAGS_STATUS_BITS;
+ rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
return (error);
}
@@ -651,7 +665,7 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
int error, size;
- uint64_t val1;
+ uint64_t val1, result, rflags, rflags2;
size = vie->opsize;
error = EINVAL;
@@ -681,17 +695,33 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* perform the operation with the pre-fetched immediate
* operand and write the result
*/
- val1 |= vie->immediate;
- error = memwrite(vm, vcpuid, gpa, val1, size, arg);
+ result = val1 | vie->immediate;
+ error = memwrite(vm, vcpuid, gpa, result, size, arg);
break;
default:
break;
}
+ if (error)
+ return (error);
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ if (error)
+ return (error);
+
+ /*
+ * OF and CF are cleared; the SF, ZF and PF flags are set according
+ * to the result; AF is undefined.
+ *
+ * The updated status flags are obtained by subtracting 0 from 'result'.
+ */
+ rflags2 = getcc(size, result, 0);
+ rflags &= ~RFLAGS_STATUS_BITS;
+ rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
return (error);
}
-#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
-
static int
emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
@@ -797,7 +827,7 @@ emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
}
static int
-emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
struct vm_guest_paging *paging, mem_region_read_t memread,
mem_region_write_t memwrite, void *arg)
{
@@ -808,18 +838,12 @@ emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
#endif
struct seg_desc ss_desc;
uint64_t cr0, rflags, rsp, stack_gla, val;
- int error, size, stackaddrsize;
-
- /*
- * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
- *
- * PUSH is part of the group 5 extended opcodes and is identified
- * by ModRM:reg = b110.
- */
- if ((vie->reg & 7) != 6)
- return (EINVAL);
+ int error, size, stackaddrsize, pushop;
+ val = 0;
size = vie->opsize;
+ pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
+
/*
* From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1
*/
@@ -858,10 +882,13 @@ emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
+ if (pushop) {
+ rsp -= size;
+ }
- rsp -= size;
if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
- rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) {
+ rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
+ &stack_gla)) {
vm_inject_ss(vm, vcpuid, 0);
return (0);
}
@@ -876,8 +903,8 @@ emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
return (0);
}
- error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE,
- copyinfo, nitems(copyinfo));
+ error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
+ pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo));
if (error == -1) {
/*
* XXX cannot return a negative error value here because it
@@ -890,16 +917,66 @@ emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
return (0);
}
- error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
+ if (pushop) {
+ error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
+ if (error == 0)
+ vm_copyout(vm, vcpuid, &val, copyinfo, size);
+ } else {
+ vm_copyin(vm, vcpuid, copyinfo, &val, size);
+ error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
+ rsp += size;
+ }
+#ifdef _KERNEL
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+#endif
+
if (error == 0) {
- vm_copyout(vm, vcpuid, &val, copyinfo, size);
error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
stackaddrsize);
KASSERT(error == 0, ("error %d updating rsp", error));
}
-#ifdef _KERNEL
- vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
-#endif
+ return (error);
+}
+
+static int
+emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *arg)
+{
+ int error;
+
+ /*
+ * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
+ *
+ * PUSH is part of the group 5 extended opcodes and is identified
+ * by ModRM:reg = b110.
+ */
+ if ((vie->reg & 7) != 6)
+ return (EINVAL);
+
+ error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
+ memwrite, arg);
+ return (error);
+}
+
+static int
+emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *arg)
+{
+ int error;
+
+ /*
+ * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
+ *
+ * POP is part of the group 1A extended opcodes and is identified
+ * by ModRM:reg = b000.
+ */
+ if ((vie->reg & 7) != 0)
+ return (EINVAL);
+
+ error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
+ memwrite, arg);
return (error);
}
@@ -914,6 +991,10 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (EINVAL);
switch (vie->op.op_type) {
+ case VIE_OP_TYPE_POP:
+ error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
+ memwrite, memarg);
+ break;
case VIE_OP_TYPE_PUSH:
error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
memwrite, memarg);
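The AND/OR flag update above leans on the observation that subtracting 0
from the result yields the correct PF, ZF and SF. A userland sketch of the
same computation (the PSL_* constants are copied from the x86
<machine/psl.h> values; amd64 only):

#include <stdint.h>
#include <stdio.h>

#define	PSL_C	0x001	/* carry */
#define	PSL_PF	0x004	/* parity */
#define	PSL_AF	0x010	/* adjust */
#define	PSL_Z	0x040	/* zero */
#define	PSL_N	0x080	/* sign */
#define	PSL_V	0x800	/* overflow */
#define	RFLAGS_STATUS_BITS	(PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

static unsigned long
getcc64(uint64_t x, uint64_t y)
{
	unsigned long rflags;

	__asm__ __volatile__("sub %2,%1; pushfq; popq %0" :
	    "=r" (rflags), "+r" (x) : "m" (y));
	return (rflags);
}

int
main(void)
{
	uint64_t result = 0x10 & 0x01;	/* AND with a zero result */
	unsigned long rflags = 0x2;	/* guest %rflags, reserved bit set */

	/* OF/CF cleared; PF/ZF/SF taken from (result - 0), as above. */
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= getcc64(result, 0) & (PSL_PF | PSL_Z | PSL_N);
	printf("rflags = %#lx (ZF %s)\n", rflags,
	    (rflags & PSL_Z) ? "set" : "clear");
	return (0);
}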
diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c
deleted file mode 100644
index 03e0071..0000000
--- a/sys/amd64/vmm/vmm_msr.c
+++ /dev/null
@@ -1,273 +0,0 @@
-/*-
- * Copyright (c) 2011 NetApp, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/smp.h>
-
-#include <machine/specialreg.h>
-
-#include <machine/vmm.h>
-#include "vmm_lapic.h"
-#include "vmm_msr.h"
-
-#define VMM_MSR_F_EMULATE 0x01
-#define VMM_MSR_F_READONLY 0x02
-#define VMM_MSR_F_INVALID 0x04 /* guest_msr_valid() can override this */
-
-struct vmm_msr {
- int num;
- int flags;
- uint64_t hostval;
-};
-
-static struct vmm_msr vmm_msr[] = {
- { MSR_LSTAR, 0 },
- { MSR_CSTAR, 0 },
- { MSR_STAR, 0 },
- { MSR_SF_MASK, 0 },
- { MSR_PAT, VMM_MSR_F_EMULATE | VMM_MSR_F_INVALID },
- { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE },
- { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
- { MSR_IA32_PLATFORM_ID, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
- { MSR_IA32_MISC_ENABLE, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
-};
-
-#define vmm_msr_num (sizeof(vmm_msr) / sizeof(vmm_msr[0]))
-CTASSERT(VMM_MSR_NUM >= vmm_msr_num);
-
-#define readonly_msr(idx) \
- ((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0)
-
-#define emulated_msr(idx) \
- ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0)
-
-#define invalid_msr(idx) \
- ((vmm_msr[(idx)].flags & VMM_MSR_F_INVALID) != 0)
-
-void
-vmm_msr_init(void)
-{
- int i;
-
- for (i = 0; i < vmm_msr_num; i++) {
- if (emulated_msr(i))
- continue;
- /*
- * XXX this assumes that the value of the host msr does not
- * change after we have cached it.
- */
- vmm_msr[i].hostval = rdmsr(vmm_msr[i].num);
- }
-}
-
-void
-guest_msrs_init(struct vm *vm, int cpu)
-{
- int i;
- uint64_t *guest_msrs, misc;
-
- guest_msrs = vm_guest_msrs(vm, cpu);
-
- for (i = 0; i < vmm_msr_num; i++) {
- switch (vmm_msr[i].num) {
- case MSR_LSTAR:
- case MSR_CSTAR:
- case MSR_STAR:
- case MSR_SF_MASK:
- case MSR_BIOS_SIGN:
- case MSR_MCG_CAP:
- guest_msrs[i] = 0;
- break;
- case MSR_PAT:
- guest_msrs[i] = PAT_VALUE(0, PAT_WRITE_BACK) |
- PAT_VALUE(1, PAT_WRITE_THROUGH) |
- PAT_VALUE(2, PAT_UNCACHED) |
- PAT_VALUE(3, PAT_UNCACHEABLE) |
- PAT_VALUE(4, PAT_WRITE_BACK) |
- PAT_VALUE(5, PAT_WRITE_THROUGH) |
- PAT_VALUE(6, PAT_UNCACHED) |
- PAT_VALUE(7, PAT_UNCACHEABLE);
- break;
- case MSR_IA32_MISC_ENABLE:
- misc = rdmsr(MSR_IA32_MISC_ENABLE);
- /*
- * Set mandatory bits
- * 11: branch trace disabled
- * 12: PEBS unavailable
- * Clear unsupported features
- * 16: SpeedStep enable
- * 18: enable MONITOR FSM
- */
- misc |= (1 << 12) | (1 << 11);
- misc &= ~((1 << 18) | (1 << 16));
- guest_msrs[i] = misc;
- break;
- case MSR_IA32_PLATFORM_ID:
- guest_msrs[i] = 0;
- break;
- default:
- panic("guest_msrs_init: missing initialization for msr "
- "0x%0x", vmm_msr[i].num);
- }
- }
-}
-
-static int
-msr_num_to_idx(u_int num)
-{
- int i;
-
- for (i = 0; i < vmm_msr_num; i++)
- if (vmm_msr[i].num == num)
- return (i);
-
- return (-1);
-}
-
-int
-emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val, bool *retu)
-{
- int idx;
- uint64_t *guest_msrs;
-
- if (lapic_msr(num))
- return (lapic_wrmsr(vm, cpu, num, val, retu));
-
- idx = msr_num_to_idx(num);
- if (idx < 0 || invalid_msr(idx))
- return (EINVAL);
-
- if (!readonly_msr(idx)) {
- guest_msrs = vm_guest_msrs(vm, cpu);
-
- /* Stash the value */
- guest_msrs[idx] = val;
-
- /* Update processor state for non-emulated MSRs */
- if (!emulated_msr(idx))
- wrmsr(vmm_msr[idx].num, val);
- }
-
- return (0);
-}
-
-int
-emulate_rdmsr(struct vm *vm, int cpu, u_int num, bool *retu)
-{
- int error, idx;
- uint32_t eax, edx;
- uint64_t result, *guest_msrs;
-
- if (lapic_msr(num)) {
- error = lapic_rdmsr(vm, cpu, num, &result, retu);
- goto done;
- }
-
- idx = msr_num_to_idx(num);
- if (idx < 0 || invalid_msr(idx)) {
- error = EINVAL;
- goto done;
- }
-
- guest_msrs = vm_guest_msrs(vm, cpu);
- result = guest_msrs[idx];
-
- /*
- * If this is not an emulated msr register make sure that the processor
- * state matches our cached state.
- */
- if (!emulated_msr(idx) && (rdmsr(num) != result)) {
- panic("emulate_rdmsr: msr 0x%0x has inconsistent cached "
- "(0x%016lx) and actual (0x%016lx) values", num,
- result, rdmsr(num));
- }
-
- error = 0;
-
-done:
- if (error == 0) {
- eax = result;
- edx = result >> 32;
- error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax);
- if (error)
- panic("vm_set_register(rax) error %d", error);
- error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx);
- if (error)
- panic("vm_set_register(rdx) error %d", error);
- }
- return (error);
-}
-
-void
-restore_guest_msrs(struct vm *vm, int cpu)
-{
- int i;
- uint64_t *guest_msrs;
-
- guest_msrs = vm_guest_msrs(vm, cpu);
-
- for (i = 0; i < vmm_msr_num; i++) {
- if (emulated_msr(i))
- continue;
- else
- wrmsr(vmm_msr[i].num, guest_msrs[i]);
- }
-}
-
-void
-restore_host_msrs(struct vm *vm, int cpu)
-{
- int i;
-
- for (i = 0; i < vmm_msr_num; i++) {
- if (emulated_msr(i))
- continue;
- else
- wrmsr(vmm_msr[i].num, vmm_msr[i].hostval);
- }
-}
-
-/*
- * Must be called by the CPU-specific code before any guests are
- * created
- */
-void
-guest_msr_valid(int msr)
-{
- int i;
-
- for (i = 0; i < vmm_msr_num; i++) {
- if (vmm_msr[i].num == msr && invalid_msr(i)) {
- vmm_msr[i].flags &= ~VMM_MSR_F_INVALID;
- }
- }
-}
diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h
deleted file mode 100644
index e070037..0000000
--- a/sys/amd64/vmm/vmm_msr.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*-
- * Copyright (c) 2011 NetApp, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _VMM_MSR_H_
-#define _VMM_MSR_H_
-
-#define VMM_MSR_NUM 16
-struct vm;
-
-void vmm_msr_init(void);
-int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val,
- bool *retu);
-int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr, bool *retu);
-void guest_msrs_init(struct vm *vm, int cpu);
-void guest_msr_valid(int msr);
-void restore_host_msrs(struct vm *vm, int cpu);
-void restore_guest_msrs(struct vm *vm, int cpu);
-
-#endif
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
index ef1557f..c7515cf 100644
--- a/sys/amd64/vmm/x86.c
+++ b/sys/amd64/vmm/x86.c
@@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$");
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
+#include <sys/sysctl.h>
#include <machine/clock.h>
#include <machine/cpufunc.h>
@@ -45,20 +46,49 @@ __FBSDID("$FreeBSD$");
#include "vmm_host.h"
#include "x86.h"
+SYSCTL_DECL(_hw_vmm);
+static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);
+
#define CPUID_VM_HIGH 0x40000000
static const char bhyve_id[12] = "bhyve bhyve ";
static uint64_t bhyve_xcpuids;
+/*
+ * The default CPU topology is a single thread per package.
+ */
+static u_int threads_per_core = 1;
+SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
+ &threads_per_core, 0, NULL);
+
+static u_int cores_per_package = 1;
+SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
+ &cores_per_package, 0, NULL);
+
+static int cpuid_leaf_b = 1;
+SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
+ &cpuid_leaf_b, 0, NULL);
+
+/*
+ * Round up to the next power of two, if necessary, and then take log2.
+ * Returns -1 if argument is zero.
+ */
+static __inline int
+log2(u_int x)
+{
+
+ return (fls(x << (1 - powerof2(x))) - 1);
+}
+
int
x86_emulate_cpuid(struct vm *vm, int vcpu_id,
uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
const struct xsave_limits *limits;
uint64_t cr4;
- int error, enable_invpcid;
- unsigned int func, regs[4];
+ int error, enable_invpcid, level, width, x2apic_id;
+ unsigned int func, regs[4], logical_cpus;
enum x2apic_state x2apic_state;
/*
@@ -207,30 +237,31 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
*/
regs[3] &= ~CPUID_DS;
- /*
- * Disable multi-core.
- */
+ logical_cpus = threads_per_core * cores_per_package;
regs[1] &= ~CPUID_HTT_CORES;
- regs[3] &= ~CPUID_HTT;
+ regs[1] |= (logical_cpus & 0xff) << 16;
+ regs[3] |= CPUID_HTT;
break;
case CPUID_0000_0004:
- do_cpuid(4, regs);
+ cpuid_count(*eax, *ecx, regs);
- /*
- * Do not expose topology.
- *
- * The maximum number of processor cores in
- * this physical processor package and the
- * maximum number of threads sharing this
- * cache are encoded with "plus 1" encoding.
- * Adding one to the value in this register
- * field to obtains the actual value.
- *
- * Therefore 0 for both indicates 1 core per
- * package and no cache sharing.
- */
- regs[0] &= 0xffff8000;
+ if (regs[0] || regs[1] || regs[2] || regs[3]) {
+ regs[0] &= 0x3ff;
+ regs[0] |= (cores_per_package - 1) << 26;
+ /*
+ * Cache topology:
+ * - L1 and L2 are shared only by the logical
+ * processors in a single core.
+ * - L3 and above are shared by all logical
+ * processors in the package.
+ */
+ logical_cpus = threads_per_core;
+ level = (regs[0] >> 5) & 0x7;
+ if (level >= 3)
+ logical_cpus *= cores_per_package;
+ regs[0] |= (logical_cpus - 1) << 14;
+ }
break;
case CPUID_0000_0007:
@@ -284,10 +315,32 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
/*
* Processor topology enumeration
*/
- regs[0] = 0;
- regs[1] = 0;
- regs[2] = *ecx & 0xff;
- regs[3] = vcpu_id;
+ if (*ecx == 0) {
+ logical_cpus = threads_per_core;
+ width = log2(logical_cpus);
+ level = CPUID_TYPE_SMT;
+ x2apic_id = vcpu_id;
+ }
+
+ if (*ecx == 1) {
+ logical_cpus = threads_per_core *
+ cores_per_package;
+ width = log2(logical_cpus);
+ level = CPUID_TYPE_CORE;
+ x2apic_id = vcpu_id;
+ }
+
+ if (!cpuid_leaf_b || *ecx >= 2) {
+ width = 0;
+ logical_cpus = 0;
+ level = 0;
+ x2apic_id = 0;
+ }
+
+ regs[0] = width & 0x1f;
+ regs[1] = logical_cpus & 0xffff;
+ regs[2] = (level << 8) | (*ecx & 0xff);
+ regs[3] = x2apic_id;
break;
case CPUID_0000_000D:
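The log2() helper introduced above relies on a small trick: when 'x' is
not a power of two, shifting it left by one before fls() rounds the result
up. A quick standalone check (fls() is the BSD routine from <strings.h>;
powerof2() is reproduced from <sys/param.h>):

#include <stdio.h>
#include <strings.h>

#define	powerof2(x)	((((x) - 1) & (x)) == 0)

/* Round x up to a power of two, then take log2; -1 for x == 0. */
static int
log2_roundup(unsigned int x)
{
	return (fls(x << (1 - powerof2(x))) - 1);
}

int
main(void)
{
	unsigned int x;

	for (x = 0; x <= 8; x++)
		printf("log2_roundup(%u) = %d\n", x, log2_roundup(x));
	return (0);
}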
diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile
index 76f9364..702587b 100644
--- a/sys/modules/vmm/Makefile
+++ b/sys/modules/vmm/Makefile
@@ -19,7 +19,6 @@ SRCS+= vmm.c \
vmm_ipi.c \
vmm_lapic.c \
vmm_mem.c \
- vmm_msr.c \
vmm_stat.c \
vmm_util.c \
x86.c \
diff --git a/sys/x86/include/specialreg.h b/sys/x86/include/specialreg.h
index c920e82..7298a2e 100644
--- a/sys/x86/include/specialreg.h
+++ b/sys/x86/include/specialreg.h
@@ -381,6 +381,7 @@
#define MSR_BIOS_SIGN 0x08b
#define MSR_PERFCTR0 0x0c1
#define MSR_PERFCTR1 0x0c2
+#define MSR_PLATFORM_INFO 0x0ce
#define MSR_MPERF 0x0e7
#define MSR_APERF 0x0e8
#define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */
@@ -404,6 +405,8 @@
#define MSR_THERM_STATUS 0x19c
#define MSR_IA32_MISC_ENABLE 0x1a0
#define MSR_IA32_TEMPERATURE_TARGET 0x1a2
+#define MSR_TURBO_RATIO_LIMIT 0x1ad
+#define MSR_TURBO_RATIO_LIMIT1 0x1ae
#define MSR_DEBUGCTLMSR 0x1d9
#define MSR_LASTBRANCHFROMIP 0x1db
#define MSR_LASTBRANCHTOIP 0x1dc
@@ -437,6 +440,11 @@
#define MSR_MC4_STATUS 0x411
#define MSR_MC4_ADDR 0x412
#define MSR_MC4_MISC 0x413
+#define MSR_RAPL_POWER_UNIT 0x606
+#define MSR_PKG_ENERGY_STATUS 0x611
+#define MSR_DRAM_ENERGY_STATUS 0x619
+#define MSR_PP0_ENERGY_STATUS 0x639
+#define MSR_PP1_ENERGY_STATUS 0x641
/*
* VMX MSRs
diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c
index 5dea300..c1f5f13 100644
--- a/usr.sbin/bhyve/acpi.c
+++ b/usr.sbin/bhyve/acpi.c
@@ -489,7 +489,7 @@ basl_fwrite_fadt(FILE *fp)
EFPRINTF(fp,
"[0012]\t\tPM Timer Block : [Generic Address Structure]\n");
EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
- EFPRINTF(fp, "[0001]\t\tBit Width : 32\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 20\n");
EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
EFPRINTF(fp,
"[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n");
@@ -499,7 +499,7 @@ basl_fwrite_fadt(FILE *fp)
EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n");
EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
- EFPRINTF(fp, "[0001]\t\tBit Width : 80\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
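Because iasl(8) parses these integer fields as hexadecimal, the old text
"32" actually encoded 0x32 == 50 bits, which is what the Linux guest
complained about; "20" encodes the intended 0x20 == 32 bits (and the GPE0
width drops from 0x80 to 0). A trivial demonstration of the parse:

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	/* iasl reads data-table integer fields as base 16. */
	printf("\"32\" -> %ld bits\n", strtol("32", NULL, 16));	/* 50 */
	printf("\"20\" -> %ld bits\n", strtol("20", NULL, 16));	/* 32 */
	return (0);
}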
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index 7dcf6d0..b2b36bb 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -803,6 +803,12 @@ main(int argc, char *argv[])
exit(1);
}
+ error = init_msr();
+ if (error) {
+ fprintf(stderr, "init_msr error %d", error);
+ exit(1);
+ }
+
init_mem();
init_inout();
pci_irq_init(ctx);
diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c
index 1ec0344..cbe5ac3 100644
--- a/usr.sbin/bhyve/block_if.c
+++ b/usr.sbin/bhyve/block_if.c
@@ -55,8 +55,7 @@ __FBSDID("$FreeBSD$");
enum blockop {
BOP_READ,
BOP_WRITE,
- BOP_FLUSH,
- BOP_CANCEL
+ BOP_FLUSH
};
enum blockstat {
@@ -159,9 +158,6 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
break;
case BOP_FLUSH:
break;
- case BOP_CANCEL:
- err = EINTR;
- break;
default:
err = EINVAL;
break;
@@ -278,6 +274,7 @@ blockif_open(const char *optstr, const char *ident)
bc->bc_magic = BLOCKIF_SIG;
bc->bc_fd = fd;
+ bc->bc_rdonly = ro;
bc->bc_size = size;
bc->bc_sectsz = sectsz;
pthread_mutex_init(&bc->bc_mtx, NULL);
@@ -355,9 +352,28 @@ blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
+ struct blockif_elem *be;
assert(bc->bc_magic == BLOCKIF_SIG);
- return (blockif_request(bc, breq, BOP_CANCEL));
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ TAILQ_FOREACH(be, &bc->bc_inuseq, be_link) {
+ if (be->be_req == breq)
+ break;
+ }
+ if (be == NULL) {
+ pthread_mutex_unlock(&bc->bc_mtx);
+ return (EINVAL);
+ }
+
+ TAILQ_REMOVE(&bc->bc_inuseq, be, be_link);
+ be->be_status = BST_FREE;
+ be->be_req = NULL;
+ TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
+ bc->bc_req_count--;
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ return (0);
}
int
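blockif_cancel() above succeeds only while the request is still sitting on the in-use queue; once a worker has dequeued it, the request runs to completion. A simplified sketch of this scan-and-recycle pattern, with illustrative names and assuming <sys/queue.h>:

#include <sys/queue.h>
#include <errno.h>
#include <stddef.h>

struct req;				/* opaque caller-owned request */

struct elem {
	TAILQ_ENTRY(elem)	link;
	struct req		*req;
};

TAILQ_HEAD(elemq, elem);

int
cancel(struct elemq *inuseq, struct elemq *freeq, struct req *r)
{
	struct elem *e;

	TAILQ_FOREACH(e, inuseq, link)
		if (e->req == r)
			break;
	if (e == NULL)			/* not queued; cannot cancel */
		return (EINVAL);
	TAILQ_REMOVE(inuseq, e, link);
	e->req = NULL;
	TAILQ_INSERT_TAIL(freeq, e, link);	/* recycle the element */
	return (0);
}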
diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c
index 214237d..42aa0b3 100644
--- a/usr.sbin/bhyve/pci_ahci.c
+++ b/usr.sbin/bhyve/pci_ahci.c
@@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$");
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
+#include <pthread_np.h>
#include <inttypes.h>
#include "bhyverun.h"
@@ -115,7 +116,8 @@ static FILE *dbg;
struct ahci_ioreq {
struct blockif_req io_req;
struct ahci_port *io_pr;
- STAILQ_ENTRY(ahci_ioreq) io_list;
+ STAILQ_ENTRY(ahci_ioreq) io_flist;
+ TAILQ_ENTRY(ahci_ioreq) io_blist;
uint8_t *cfis;
uint32_t len;
uint32_t done;
@@ -160,6 +162,7 @@ struct ahci_port {
struct ahci_ioreq *ioreq;
int ioqsz;
STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd;
+ TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd;
};
struct ahci_cmd_hdr {
@@ -360,6 +363,69 @@ ahci_write_reset_fis_d2h(struct ahci_port *p)
}
static void
+ahci_check_stopped(struct ahci_port *p)
+{
+ /*
+ * If we are no longer processing the command list and nothing
+ * is in-flight, clear the running bit.
+ */
+ if (!(p->cmd & AHCI_P_CMD_ST)) {
+ if (p->pending == 0)
+ p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK);
+ }
+}
+
+static void
+ahci_port_stop(struct ahci_port *p)
+{
+ struct ahci_ioreq *aior;
+ uint8_t *cfis;
+ int slot;
+ int ncq;
+ int error;
+
+ assert(pthread_mutex_isowned_np(&p->pr_sc->mtx));
+
+ TAILQ_FOREACH(aior, &p->iobhd, io_blist) {
+ /*
+ * Try to cancel the outstanding blockif request.
+ */
+ error = blockif_cancel(p->bctx, &aior->io_req);
+ if (error != 0)
+ continue;
+
+ slot = aior->slot;
+ cfis = aior->cfis;
+ ncq = 0;
+ if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
+ cfis[2] == ATA_READ_FPDMA_QUEUED)
+ ncq = 1;
+
+ if (ncq)
+ p->sact &= ~(1 << slot);
+ else
+ p->ci &= ~(1 << slot);
+
+ /*
+ * This command is now done.
+ */
+ p->pending &= ~(1 << slot);
+
+ /*
+ * Delete the blockif request from the busy list
+ */
+ TAILQ_REMOVE(&p->iobhd, aior, io_blist);
+
+ /*
+ * Move the blockif request back to the free list
+ */
+ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
+ }
+
+ ahci_check_stopped(p);
+}
+
+static void
ahci_port_reset(struct ahci_port *pr)
{
pr->sctl = 0;
@@ -492,7 +557,7 @@ ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done,
*/
aior = STAILQ_FIRST(&p->iofhd);
assert(aior != NULL);
- STAILQ_REMOVE_HEAD(&p->iofhd, io_list);
+ STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
aior->cfis = cfis;
aior->slot = slot;
aior->len = len;
@@ -503,15 +568,21 @@ ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done,
if (iovcnt > BLOCKIF_IOV_MAX) {
aior->prdtl = iovcnt - BLOCKIF_IOV_MAX;
iovcnt = BLOCKIF_IOV_MAX;
- /*
- * Mark this command in-flight.
- */
- p->pending |= 1 << slot;
} else
aior->prdtl = 0;
breq->br_iovcnt = iovcnt;
/*
+ * Mark this command in-flight.
+ */
+ p->pending |= 1 << slot;
+
+ /*
+ * Stuff request onto busy list
+ */
+ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
+ /*
* Build up the iovec based on the prdt
*/
for (i = 0; i < iovcnt; i++) {
@@ -546,7 +617,7 @@ ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis)
*/
aior = STAILQ_FIRST(&p->iofhd);
assert(aior != NULL);
- STAILQ_REMOVE_HEAD(&p->iofhd, io_list);
+ STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
aior->cfis = cfis;
aior->slot = slot;
aior->len = 0;
@@ -554,6 +625,16 @@ ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis)
aior->prdtl = 0;
breq = &aior->io_req;
+ /*
+ * Mark this command in-flight.
+ */
+ p->pending |= 1 << slot;
+
+ /*
+ * Stuff request onto busy list
+ */
+ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
err = blockif_flush(p->bctx, breq);
assert(err == 0);
}
@@ -961,7 +1042,7 @@ atapi_read(struct ahci_port *p, int slot, uint8_t *cfis,
*/
aior = STAILQ_FIRST(&p->iofhd);
assert(aior != NULL);
- STAILQ_REMOVE_HEAD(&p->iofhd, io_list);
+ STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
aior->cfis = cfis;
aior->slot = slot;
aior->len = len;
@@ -977,6 +1058,16 @@ atapi_read(struct ahci_port *p, int slot, uint8_t *cfis,
breq->br_iovcnt = iovcnt;
/*
+ * Mark this command in-flight.
+ */
+ p->pending |= 1 << slot;
+
+ /*
+ * Stuff request onto busy list
+ */
+ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
+ /*
* Build up the iovec based on the prdt
*/
for (i = 0; i < iovcnt; i++) {
@@ -1415,9 +1506,14 @@ ata_ioreq_cb(struct blockif_req *br, int err)
pthread_mutex_lock(&sc->mtx);
/*
+ * Delete the blockif request from the busy list
+ */
+ TAILQ_REMOVE(&p->iobhd, aior, io_blist);
+
+ /*
* Move the blockif request back to the free list
*/
- STAILQ_INSERT_TAIL(&p->iofhd, aior, io_list);
+ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
if (pending && !err) {
ahci_handle_dma(p, slot, cfis, aior->done,
@@ -1438,17 +1534,18 @@ ata_ioreq_cb(struct blockif_req *br, int err)
p->serr |= (1 << slot);
}
- /*
- * This command is now complete.
- */
- p->pending &= ~(1 << slot);
-
if (ncq) {
p->sact &= ~(1 << slot);
ahci_write_fis_sdb(p, slot, tfd);
} else
ahci_write_fis_d2h(p, slot, cfis, tfd);
+ /*
+ * This command is now complete.
+ */
+ p->pending &= ~(1 << slot);
+
+ ahci_check_stopped(p);
out:
pthread_mutex_unlock(&sc->mtx);
DPRINTF("%s exit\n", __func__);
@@ -1478,9 +1575,14 @@ atapi_ioreq_cb(struct blockif_req *br, int err)
pthread_mutex_lock(&sc->mtx);
/*
+ * Delete the blockif request from the busy list
+ */
+ TAILQ_REMOVE(&p->iobhd, aior, io_blist);
+
+ /*
* Move the blockif request back to the free list
*/
- STAILQ_INSERT_TAIL(&p->iofhd, aior, io_list);
+ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
if (pending && !err) {
atapi_read(p, slot, cfis, aior->done, hdr->prdtl - pending);
@@ -1500,6 +1602,12 @@ atapi_ioreq_cb(struct blockif_req *br, int err)
cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
ahci_write_fis_d2h(p, slot, cfis, tfd);
+ /*
+ * This command is now complete.
+ */
+ p->pending &= ~(1 << slot);
+
+ ahci_check_stopped(p);
out:
pthread_mutex_unlock(&sc->mtx);
DPRINTF("%s exit\n", __func__);
@@ -1526,8 +1634,10 @@ pci_ahci_ioreq_init(struct ahci_port *pr)
else
vr->io_req.br_callback = atapi_ioreq_cb;
vr->io_req.br_param = vr;
- STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_list);
+ STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist);
}
+
+ TAILQ_INIT(&pr->iobhd);
}
static void
@@ -1565,9 +1675,7 @@ pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
p->cmd = value;
if (!(value & AHCI_P_CMD_ST)) {
- p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK);
- p->ci = 0;
- p->sact = 0;
+ ahci_port_stop(p);
} else {
uint64_t clb;
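With ahci_port_stop() wired into the PxCMD write path, a guest driver can use the standard AHCI recovery sequence: clear PxCMD.ST, wait for PxCMD.CR to drop, reprogram the port, then restart. A sketch of that driver-side sequence, not part of the patch; the bit positions follow the AHCI spec but the macros are illustrative:

#include <stdint.h>

#define	AHCI_P_CMD_ST	0x0001		/* start command processing */
#define	AHCI_P_CMD_CR	0x8000		/* command list engine running */

void
ahci_port_restart(volatile uint32_t *pxcmd)
{
	*pxcmd &= ~AHCI_P_CMD_ST;	/* ask the port to stop */
	while (*pxcmd & AHCI_P_CMD_CR)	/* wait for the engine to idle */
		;
	/* re-program PxCLB/PxFB here if needed, then restart */
	*pxcmd |= AHCI_P_CMD_ST;
}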
diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c
index 394b116..c66ad68 100644
--- a/usr.sbin/bhyve/pci_virtio_block.c
+++ b/usr.sbin/bhyve/pci_virtio_block.c
@@ -94,6 +94,8 @@ struct vtblk_config {
struct virtio_blk_hdr {
#define VBH_OP_READ 0
#define VBH_OP_WRITE 1
+#define VBH_OP_FLUSH 4
+#define VBH_OP_FLUSH_OUT 5
#define VBH_OP_IDENT 8
#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */
uint32_t vbh_type;
@@ -217,6 +219,10 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
err = 0;
break;
+ case VBH_OP_FLUSH:
+ case VBH_OP_FLUSH_OUT:
+ err = fsync(sc->vbsc_fd);
+ break;
default:
err = -ENOSYS;
break;
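VBH_OP_FLUSH/VBH_OP_FLUSH_OUT map straight to fsync(2) on the backing file. For reference, a sketch of the legacy virtio-blk request header the guest submits; a flush carries only this header plus a trailing status byte. The layout follows the virtio spec, but the names here are illustrative:

#include <stdint.h>
#include <string.h>

struct vtblk_hdr {
	uint32_t type;		/* 0=read, 1=write, 4=flush, 8=ident */
	uint32_t ioprio;
	uint64_t sector;	/* ignored for flush */
};

void
vtblk_make_flush(struct vtblk_hdr *hdr)
{
	memset(hdr, 0, sizeof(*hdr));
	hdr->type = 4;		/* VBH_OP_FLUSH */
}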
diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c
index 0002da8..b939c1a 100644
--- a/usr.sbin/bhyve/task_switch.c
+++ b/usr.sbin/bhyve/task_switch.c
@@ -725,6 +725,21 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
assert(paging->cpu_mode == CPU_MODE_PROTECTED);
/*
+ * Calculate the %eip to store in the old TSS before modifying the
+ * 'inst_length'.
+ */
+ eip = vmexit->rip + vmexit->inst_length;
+
+ /*
+ * Set the 'inst_length' to '0'.
+ *
+ * If an exception is triggered during emulation of the task switch
+ * then the exception handler should return to the instruction that
+ * caused the task switch as opposed to the subsequent instruction.
+ */
+ vmexit->inst_length = 0;
+
+ /*
* Section 4.6, "Access Rights" in Intel SDM Vol 3.
* The following page table accesses are implicitly supervisor mode:
* - accesses to GDT or LDT to load segment descriptors
@@ -839,7 +854,6 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
}
/* Save processor state in old TSS */
- eip = vmexit->rip + vmexit->inst_length;
tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);
/*
@@ -870,7 +884,7 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
* the saved instruction pointer will belong to the new task.
*/
vmexit->rip = newtss.tss_eip;
- vmexit->inst_length = 0;
+ assert(vmexit->inst_length == 0);
/* Load processor state from new TSS */
error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov);
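The reordering above matters because vmexit->rip + inst_length is the guest's next instruction: the old-TSS %eip must be computed from the original length, while any exception raised mid-switch must resume at the task-switching instruction itself. A toy model of that bookkeeping, with illustrative types and not part of the patch:

#include <stdint.h>

struct exit_state {
	uint64_t rip;		/* address of the switching instruction */
	int inst_length;	/* bytes to skip on successful emulation */
};

uint64_t
begin_task_switch(struct exit_state *e)
{
	uint64_t eip = e->rip + e->inst_length;	/* saved into the old TSS */

	e->inst_length = 0;	/* exceptions now return to e->rip */
	return (eip);
}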
diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c
index 9581fb0..19c0d47 100644
--- a/usr.sbin/bhyve/virtio.c
+++ b/usr.sbin/bhyve/virtio.c
@@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$");
#include <stdio.h>
#include <stdint.h>
#include <pthread.h>
+#include <pthread_np.h>
#include "bhyverun.h"
#include "pci_emul.h"
@@ -89,6 +90,9 @@ vi_reset_dev(struct virtio_softc *vs)
struct vqueue_info *vq;
int i, nvq;
+ if (vs->vs_mtx)
+ assert(pthread_mutex_isowned_np(vs->vs_mtx));
+
nvq = vs->vs_vc->vc_nvq;
for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) {
vq->vq_flags = 0;
@@ -99,11 +103,9 @@ vi_reset_dev(struct virtio_softc *vs)
vs->vs_negotiated_caps = 0;
vs->vs_curq = 0;
/* vs->vs_status = 0; -- redundant */
- VS_LOCK(vs);
if (vs->vs_isr)
pci_lintr_deassert(vs->vs_pi);
vs->vs_isr = 0;
- VS_UNLOCK(vs);
vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR;
}
@@ -137,7 +139,9 @@ vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix)
if (use_msix) {
vs->vs_flags |= VIRTIO_USE_MSIX;
+ VS_LOCK(vs);
vi_reset_dev(vs); /* set all vectors to NO_VECTOR */
+ VS_UNLOCK(vs);
nvec = vs->vs_vc->vc_nvq + 1;
if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum))
return (1);
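The recursive-acquisition fix follows a common convention: the internal routine asserts lock ownership instead of taking the lock, and the outermost caller locks exactly once. A minimal sketch of the pattern, assuming FreeBSD's pthread_mutex_isowned_np() from pthread_np(3):

#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>

static pthread_mutex_t dev_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Internal helper: the caller must already hold dev_mtx. */
void
dev_reset_locked(void)
{
	assert(pthread_mutex_isowned_np(&dev_mtx));
	/* ... mutate device state ... */
}

/* Public entry point takes the lock exactly once. */
void
dev_reset(void)
{
	pthread_mutex_lock(&dev_mtx);
	dev_reset_locked();
	pthread_mutex_unlock(&dev_mtx);
}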
diff --git a/usr.sbin/bhyve/xmsr.c b/usr.sbin/bhyve/xmsr.c
index 63522bf..1ed1ea1 100644
--- a/usr.sbin/bhyve/xmsr.c
+++ b/usr.sbin/bhyve/xmsr.c
@@ -31,33 +31,91 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
+#include <machine/cpufunc.h>
#include <machine/vmm.h>
+#include <machine/specialreg.h>
+
#include <vmmapi.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include "xmsr.h"
+static int cpu_vendor_intel, cpu_vendor_amd;
+
int
emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val)
{
- switch (code) {
- case 0xd04: /* Sandy Bridge uncore PMC MSRs */
- case 0xc24:
- return (0);
- case 0x79:
- return (0); /* IA32_BIOS_UPDT_TRIG MSR */
- default:
- break;
+ if (cpu_vendor_intel) {
+ switch (code) {
+ case 0xd04: /* Sandy Bridge uncore PMCs */
+ case 0xc24:
+ return (0);
+ case MSR_BIOS_UPDT_TRIG:
+ return (0);
+ case MSR_BIOS_SIGN:
+ return (0);
+ default:
+ break;
+ }
}
return (-1);
}
int
-emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t *val)
+emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val)
{
+ int error = 0;
- return (-1);
+ if (cpu_vendor_intel) {
+ switch (num) {
+ case MSR_BIOS_SIGN:
+ case MSR_IA32_PLATFORM_ID:
+ case MSR_PKG_ENERGY_STATUS:
+ case MSR_PP0_ENERGY_STATUS:
+ case MSR_PP1_ENERGY_STATUS:
+ case MSR_DRAM_ENERGY_STATUS:
+ *val = 0;
+ break;
+ case MSR_RAPL_POWER_UNIT:
+ /*
+ * Use the default value documented in section
+ * "RAPL Interfaces" in Intel SDM vol3.
+ */
+ *val = 0x000a1003;
+ break;
+ default:
+ error = -1;
+ break;
+ }
+ }
+ return (error);
+}
+
+int
+init_msr(void)
+{
+ int error;
+ u_int regs[4];
+ char cpu_vendor[13];
+
+ do_cpuid(0, regs);
+ ((u_int *)&cpu_vendor)[0] = regs[1];
+ ((u_int *)&cpu_vendor)[1] = regs[3];
+ ((u_int *)&cpu_vendor)[2] = regs[2];
+ cpu_vendor[12] = '\0';
+
+ error = 0;
+ if (strcmp(cpu_vendor, "AuthenticAMD") == 0) {
+ cpu_vendor_amd = 1;
+ } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) {
+ cpu_vendor_intel = 1;
+ } else {
+ fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor);
+ error = -1;
+ }
+ return (error);
}
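The default 0x000a1003 advertised for MSR_RAPL_POWER_UNIT decodes to the SDM's stock units. A standalone decode of the three fields, not part of the patch; the bit layout follows Intel SDM vol. 3:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t v = 0x000a1003;

	/* Each field N means 1/2^N of the base unit. */
	printf("power unit:  1/%u W\n", 1u << (v & 0xf));	   /* 1/8 W */
	printf("energy unit: 1/%u J\n", 1u << ((v >> 8) & 0x1f)); /* 1/65536 J */
	printf("time unit:   1/%u s\n", 1u << ((v >> 16) & 0xf)); /* 1/1024 s */
	return (0);
}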
diff --git a/usr.sbin/bhyve/xmsr.h b/usr.sbin/bhyve/xmsr.h
index b097cf8..bcf65b7 100644
--- a/usr.sbin/bhyve/xmsr.h
+++ b/usr.sbin/bhyve/xmsr.h
@@ -29,6 +29,7 @@
#ifndef _XMSR_H_
#define _XMSR_H_
+int init_msr(void);
int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);
int emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t *val);
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
index b6006b7..f5e50d3 100644
--- a/usr.sbin/bhyvectl/bhyvectl.c
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -309,7 +309,7 @@ dump_vmcs_msr_bitmap(int vcpu, u_long addr)
if (fd < 0)
goto done;
- bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, 0, fd, addr);
+ bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, addr);
if (bitmap == MAP_FAILED)
goto done;
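The bhyvectl fix supplies the MAP_SHARED that mmap(2) now requires for file-backed mappings; a flags value of 0 fails with EINVAL. A minimal demonstration, where the file path is just an arbitrary readable file assumed to exist:

#include <sys/mman.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/etc/motd", O_RDONLY);
	void *p;

	/* flags must include MAP_SHARED or MAP_PRIVATE for file mappings */
	p = mmap(NULL, 4096, PROT_READ, 0, fd, 0);
	if (p == MAP_FAILED)
		printf("flags=0: errno %d (EINVAL expected)\n", errno);
	p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	if (p != MAP_FAILED)
		puts("MAP_SHARED mapping succeeded");
	close(fd);
	return (0);
}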