author     neel <neel@FreeBSD.org>    2015-06-27 22:48:22 +0000
committer  neel <neel@FreeBSD.org>    2015-06-27 22:48:22 +0000
commit     115742fae3f7a7c52d6d5f4894f37e68dff4fd5c (patch)
tree       cb845c4cb2d3a3b67b3e1134742c3c5b250ae954 /sys/amd64/vmm
parent     02efaba1d135756ed65855bdc99e7d83f46cc4a2 (diff)
MFC r276428:
Replace bhyve's minimal RTC emulation with a fully featured one in vmm.ko.

MFC r276432:
Initialize all fields of 'struct vm_exception exception' before passing it to vm_inject_exception().

MFC r276763:
Clear blocking due to STI or MOV SS in the hypervisor when an instruction is emulated or when the vcpu incurs an exception.

MFC r277149:
Clean up usage of 'struct vm_exception' to only communicate information from userspace to vmm.ko when injecting an exception.

MFC r277168:
Fix typo (missing comma).

MFC r277309:
Make the error message explicit instead of just printing the usage if the virtual machine name is not specified.

MFC r277310:
Simplify instruction restart logic in bhyve.

MFC r277359:
Fix a bug in libvmmapi 'vm_copy_setup()' where it would return success even if the 'gpa' was in the guest MMIO region.

MFC r277360:
MOVS instruction emulation.

MFC r277626:
Add macro to identify AVIC capability (advanced virtual interrupt controller) in AMD processors.

MFC r279220:
Don't close a block context if it couldn't be opened, avoiding a null deref.

MFC r279225:
Add "-u" option to bhyve(8) to indicate that the RTC should maintain UTC time.

MFC r279227:
Emulate MSR 0xC0011024 when running on AMD processors.

MFC r279228:
Always emulate MSR_PAT on Intel processors and don't rely on the PAT save/restore capability of VT-x. This lets bhyve run nested in older VMware versions that don't support the PAT save/restore capability.

MFC r279540:
Fix warnings/errors when building vmm.ko with gcc.
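A rough standalone sketch of the IA32_PAT validity check underlying the MSR_PAT emulation described above (it mirrors the pat_valid() helper added to vmx_msr.c in this diff; the main() harness and sample values are illustrative only and are not part of the change):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Each 8-bit PAn field of IA32_PAT must encode a defined memory type.
 * Encodings 2, 3 and anything >= 8 are reserved, so a guest WRMSR with
 * such a value is rejected (the hypervisor injects #GP instead of
 * accepting the write).
 */
static bool
pat_valid(uint64_t val)
{
	int i, pa;

	for (i = 0; i < 8; i++) {
		pa = (val >> (i * 8)) & 0xff;
		if (pa == 2 || pa == 3 || pa >= 8)
			return (false);
	}
	return (true);
}

int
main(void)
{
	/* Power-on default PAT: WB, WT, UC-, UC repeated twice. */
	uint64_t def_pat = 0x0007040600070406ULL;

	printf("default PAT valid:  %d\n", pat_valid(def_pat));
	printf("reserved PA0 valid: %d\n", pat_valid(0x0000000000000002ULL));
	return (0);
}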
Diffstat (limited to 'sys/amd64/vmm')
-rw-r--r--  sys/amd64/vmm/amd/svm.c                  32
-rw-r--r--  sys/amd64/vmm/amd/svm_softc.h             1
-rw-r--r--  sys/amd64/vmm/amd/svm_support.S          12
-rw-r--r--  sys/amd64/vmm/intel/vmcs.c               12
-rw-r--r--  sys/amd64/vmm/intel/vmx.c                68
-rw-r--r--  sys/amd64/vmm/intel/vmx.h                 2
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.c            54
-rw-r--r--  sys/amd64/vmm/io/vhpet.c                 70
-rw-r--r--  sys/amd64/vmm/io/vrtc.c                 952
-rw-r--r--  sys/amd64/vmm/io/vrtc.h                  50
-rw-r--r--  sys/amd64/vmm/vmm.c                     179
-rw-r--r--  sys/amd64/vmm/vmm_dev.c                  30
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c    271
-rw-r--r--  sys/amd64/vmm/vmm_ioport.c                5
14 files changed, 1556 insertions, 182 deletions
diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c
index ab47041..88a846d 100644
--- a/sys/amd64/vmm/amd/svm.c
+++ b/sys/amd64/vmm/amd/svm.c
@@ -80,6 +80,7 @@ SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL);
#define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */
#define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */
#define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */
+#define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */
#define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \
VMCB_CACHE_IOPM | \
@@ -554,6 +555,7 @@ svm_vminit(struct vm *vm, pmap_t pmap)
pml4_pa = svm_sc->nptp;
for (i = 0; i < VM_MAXCPU; i++) {
vcpu = svm_get_vcpu(svm_sc, i);
+ vcpu->nextrip = ~0;
vcpu->lastcpu = NOCPU;
vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
@@ -1200,7 +1202,6 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
struct vmcb_state *state;
struct vmcb_ctrl *ctrl;
struct svm_regctx *ctx;
- struct vm_exception exception;
uint64_t code, info1, info2, val;
uint32_t eax, ecx, edx;
int error, errcode_valid, handled, idtvec, reflect;
@@ -1314,6 +1315,7 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
/* fallthru */
default:
errcode_valid = 0;
+ info1 = 0;
break;
}
KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) "
@@ -1322,14 +1324,10 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
if (reflect) {
/* Reflect the exception back into the guest */
- exception.vector = idtvec;
- exception.error_code_valid = errcode_valid;
- exception.error_code = errcode_valid ? info1 : 0;
VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception "
- "%d/%#x into the guest", exception.vector,
- exception.error_code);
- error = vm_inject_exception(svm_sc->vm, vcpu,
- &exception);
+ "%d/%#x into the guest", idtvec, (int)info1);
+ error = vm_inject_exception(svm_sc->vm, vcpu, idtvec,
+ errcode_valid, info1, 0);
KASSERT(error == 0, ("%s: vm_inject_exception error %d",
__func__, error));
}
@@ -1476,15 +1474,24 @@ svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic)
{
struct vmcb_ctrl *ctrl;
struct vmcb_state *state;
+ struct svm_vcpu *vcpustate;
uint8_t v_tpr;
int vector, need_intr_window, pending_apic_vector;
state = svm_get_vmcb_state(sc, vcpu);
ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+ vcpustate = svm_get_vcpu(sc, vcpu);
need_intr_window = 0;
pending_apic_vector = 0;
+ if (vcpustate->nextrip != state->rip) {
+ ctrl->intr_shadow = 0;
+ VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking "
+ "cleared due to rip change: %#lx/%#lx",
+ vcpustate->nextrip, state->rip);
+ }
+
/*
* Inject pending events or exceptions for this vcpu.
*
@@ -1634,7 +1641,7 @@ done:
* VMRUN.
*/
v_tpr = vlapic_get_cr8(vlapic);
- KASSERT(v_tpr >= 0 && v_tpr <= 15, ("invalid v_tpr %#x", v_tpr));
+ KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr));
if (ctrl->v_tpr != v_tpr) {
VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x",
ctrl->v_tpr, v_tpr);
@@ -1801,14 +1808,14 @@ static __inline void
disable_gintr(void)
{
- __asm __volatile("clgi" : : :);
+ __asm __volatile("clgi");
}
static __inline void
enable_gintr(void)
{
- __asm __volatile("stgi" : : :);
+ __asm __volatile("stgi");
}
/*
@@ -1955,6 +1962,9 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
/* #VMEXIT disables interrupts so re-enable them here. */
enable_gintr();
+ /* Update 'nextrip' */
+ vcpustate->nextrip = state->rip;
+
/* Handle #VMEXIT and if required return to user space. */
handled = svm_vmexit(svm_sc, vcpu, vmexit);
} while (handled);
diff --git a/sys/amd64/vmm/amd/svm_softc.h b/sys/amd64/vmm/amd/svm_softc.h
index a5bb57c..de0c3f7 100644
--- a/sys/amd64/vmm/amd/svm_softc.h
+++ b/sys/amd64/vmm/amd/svm_softc.h
@@ -45,6 +45,7 @@ struct svm_vcpu {
struct vmcb vmcb; /* hardware saved vcpu context */
struct svm_regctx swctx; /* software saved vcpu context */
uint64_t vmcb_pa; /* VMCB physical address */
+ uint64_t nextrip; /* next instruction to be executed by guest */
int lastcpu; /* host cpu that the vcpu last ran on */
uint32_t dirty; /* state cache bits that must be cleared */
long eptgen; /* pmap->pm_eptgen when the vcpu last ran */
diff --git a/sys/amd64/vmm/amd/svm_support.S b/sys/amd64/vmm/amd/svm_support.S
index 72327bd..b363101 100644
--- a/sys/amd64/vmm/amd/svm_support.S
+++ b/sys/amd64/vmm/amd/svm_support.S
@@ -22,6 +22,8 @@
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
*/
#include <machine/asmacros.h>
@@ -35,6 +37,10 @@
#define VENTER push %rbp ; mov %rsp,%rbp
#define VLEAVE pop %rbp
+#define VMLOAD .byte 0x0f, 0x01, 0xda
+#define VMRUN .byte 0x0f, 0x01, 0xd8
+#define VMSAVE .byte 0x0f, 0x01, 0xdb
+
/*
* svm_launch(uint64_t vmcb, struct svm_regctx *gctx)
* %rdi: physical address of VMCB
@@ -79,9 +85,9 @@ ENTRY(svm_launch)
movq SCTX_RDI(%rsi), %rdi
movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */
- vmload %rax
- vmrun %rax
- vmsave %rax
+ VMLOAD
+ VMRUN
+ VMSAVE
pop %rax /* pop guest context pointer from the stack */
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
index ae4d9db..5962526 100644
--- a/sys/amd64/vmm/intel/vmcs.c
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -342,18 +342,6 @@ vmcs_init(struct vmcs *vmcs)
*/
VMPTRLD(vmcs);
- /* Initialize guest IA32_PAT MSR with the default value */
- pat = PAT_VALUE(0, PAT_WRITE_BACK) |
- PAT_VALUE(1, PAT_WRITE_THROUGH) |
- PAT_VALUE(2, PAT_UNCACHED) |
- PAT_VALUE(3, PAT_UNCACHEABLE) |
- PAT_VALUE(4, PAT_WRITE_BACK) |
- PAT_VALUE(5, PAT_WRITE_THROUGH) |
- PAT_VALUE(6, PAT_UNCACHED) |
- PAT_VALUE(7, PAT_UNCACHEABLE);
- if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0)
- goto done;
-
/* Host state */
/* Initialize host IA32_PAT MSR */
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index c3dd04e..b81e48b 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -100,13 +100,11 @@ __FBSDID("$FreeBSD$");
(VM_EXIT_HOST_LMA | \
VM_EXIT_SAVE_EFER | \
VM_EXIT_LOAD_EFER | \
- VM_EXIT_ACKNOWLEDGE_INTERRUPT | \
- VM_EXIT_SAVE_PAT | \
- VM_EXIT_LOAD_PAT)
+ VM_EXIT_ACKNOWLEDGE_INTERRUPT)
#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS
-#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER | VM_ENTRY_LOAD_PAT)
+#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER)
#define VM_ENTRY_CTLS_ZERO_SETTING \
(VM_ENTRY_LOAD_DEBUG_CONTROLS | \
@@ -859,10 +857,6 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
* VM exit and entry respectively. It is also restored from the
* host VMCS area on a VM exit.
*
- * MSR_PAT is saved and restored in the guest VMCS are on a VM exit
- * and entry respectively. It is also restored from the host VMCS
- * area on a VM exit.
- *
* The TSC MSR is exposed read-only. Writes are disallowed as that
* will impact the host TSC.
* XXX Writes would be implemented with a wrmsr trap, and
@@ -874,7 +868,6 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
guest_msr_rw(vmx, MSR_EFER) ||
- guest_msr_rw(vmx, MSR_PAT) ||
guest_msr_ro(vmx, MSR_TSC))
panic("vmx_vminit: error setting guest msr access");
@@ -941,6 +934,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
vmx->cap[i].proc_ctls = procbased_ctls;
vmx->cap[i].proc_ctls2 = procbased_ctls2;
+ vmx->state[i].nextrip = ~0;
vmx->state[i].lastcpu = NOCPU;
vmx->state[i].vpid = vpid[i];
@@ -1169,12 +1163,24 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu)
}
static void
-vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
+vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
+ uint64_t guestrip)
{
int vector, need_nmi_exiting, extint_pending;
uint64_t rflags, entryinfo;
uint32_t gi, info;
+ if (vmx->state[vcpu].nextrip != guestrip) {
+ gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
+ if (gi & HWINTR_BLOCKING) {
+ VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
+ "cleared due to rip change: %#lx/%#lx",
+ vmx->state[vcpu].nextrip, guestrip);
+ gi &= ~HWINTR_BLOCKING;
+ vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
+ }
+ }
+
if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
"intinfo is not valid: %#lx", __func__, entryinfo));
@@ -1771,7 +1777,7 @@ vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
{
struct vm_guest_paging *paging;
uint32_t csar;
-
+
paging = &vmexit->u.inst_emul.paging;
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
@@ -2060,12 +2066,11 @@ emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu)
static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
- int error, handled, in;
+ int error, errcode, errcode_valid, handled, in;
struct vmxctx *vmxctx;
struct vlapic *vlapic;
struct vm_inout_str *vis;
struct vm_task_switch *ts;
- struct vm_exception vmexc;
uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
uint32_t intr_type, intr_vec, reason;
uint64_t exitintinfo, qual, gpa;
@@ -2250,6 +2255,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
case EXIT_REASON_MTF:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
vmexit->exitcode = VM_EXITCODE_MTRAP;
+ vmexit->inst_length = 0;
break;
case EXIT_REASON_PAUSE:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
@@ -2376,15 +2382,15 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
/* Reflect all other exceptions back into the guest */
- bzero(&vmexc, sizeof(struct vm_exception));
- vmexc.vector = intr_vec;
+ errcode_valid = errcode = 0;
if (intr_info & VMCS_INTR_DEL_ERRCODE) {
- vmexc.error_code_valid = 1;
- vmexc.error_code = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
+ errcode_valid = 1;
+ errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
}
VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into "
- "the guest", vmexc.vector, vmexc.error_code);
- error = vm_inject_exception(vmx->vm, vcpu, &vmexc);
+ "the guest", intr_vec, errcode);
+ error = vm_inject_exception(vmx->vm, vcpu, intr_vec,
+ errcode_valid, errcode, 0);
KASSERT(error == 0, ("%s: vm_inject_exception error %d",
__func__, error));
return (1);
@@ -2399,6 +2405,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
if (vm_mem_allocated(vmx->vm, gpa) ||
apic_access_fault(vmx, vcpu, gpa)) {
vmexit->exitcode = VM_EXITCODE_PAGING;
+ vmexit->inst_length = 0;
vmexit->u.paging.gpa = gpa;
vmexit->u.paging.fault_type = ept_fault_type(qual);
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
@@ -2540,7 +2547,7 @@ vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
}
static int
-vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
+vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
void *rendezvous_cookie, void *suspend_cookie)
{
int rc, handled, launched;
@@ -2550,7 +2557,6 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
struct vmcs *vmcs;
struct vm_exit *vmexit;
struct vlapic *vlapic;
- uint64_t rip;
uint32_t exit_reason;
vmx = arg;
@@ -2578,11 +2584,13 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
*/
vmcs_write(VMCS_HOST_CR3, rcr3());
- vmcs_write(VMCS_GUEST_RIP, startrip);
+ vmcs_write(VMCS_GUEST_RIP, rip);
vmx_set_pcpu_defaults(vmx, vcpu, pmap);
do {
- handled = UNHANDLED;
+ KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch "
+ "%#lx/%#lx", __func__, vmcs_guest_rip(), rip));
+ handled = UNHANDLED;
/*
* Interrupts are disabled from this point on until the
* guest starts executing. This is done for the following
@@ -2602,7 +2610,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
* pmap_invalidate_ept().
*/
disable_intr();
- vmx_inject_interrupts(vmx, vcpu, vlapic);
+ vmx_inject_interrupts(vmx, vcpu, vlapic, rip);
/*
* Check for vcpu suspension after injecting events because
@@ -2611,20 +2619,20 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
*/
if (vcpu_suspended(suspend_cookie)) {
enable_intr();
- vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip());
+ vm_exit_suspended(vmx->vm, vcpu, rip);
break;
}
if (vcpu_rendezvous_pending(rendezvous_cookie)) {
enable_intr();
- vm_exit_rendezvous(vmx->vm, vcpu, vmcs_guest_rip());
+ vm_exit_rendezvous(vmx->vm, vcpu, rip);
break;
}
if (vcpu_should_yield(vm, vcpu)) {
enable_intr();
- vm_exit_astpending(vmx->vm, vcpu, vmcs_guest_rip());
- vmx_astpending_trace(vmx, vcpu, vmexit->rip);
+ vm_exit_astpending(vmx->vm, vcpu, rip);
+ vmx_astpending_trace(vmx, vcpu, rip);
handled = HANDLED;
break;
}
@@ -2638,6 +2646,9 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
+ /* Update 'nextrip' */
+ vmx->state[vcpu].nextrip = rip;
+
if (rc == VMX_GUEST_VMEXIT) {
vmx_exit_handle_nmi(vmx, vcpu, vmexit);
enable_intr();
@@ -2648,6 +2659,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
}
launched = 1;
vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
+ rip = vmexit->rip;
} while (handled);
/*
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
index 2124554..bc48861 100644
--- a/sys/amd64/vmm/intel/vmx.h
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -78,6 +78,7 @@ struct vmxcap {
};
struct vmxstate {
+ uint64_t nextrip; /* next instruction to be executed by guest */
int lastcpu; /* host cpu that this 'vcpu' last ran on */
uint16_t vpid;
};
@@ -102,6 +103,7 @@ enum {
IDX_MSR_STAR,
IDX_MSR_SF_MASK,
IDX_MSR_KGSBASE,
+ IDX_MSR_PAT,
GUEST_MSR_NUM /* must be the last enumeration */
};
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
index f6bbf2a..e517778 100644
--- a/sys/amd64/vmm/intel/vmx_msr.c
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -230,6 +230,25 @@ westmere_cpu(void)
return (false);
}
+static bool
+pat_valid(uint64_t val)
+{
+ int i, pa;
+
+ /*
+ * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT"
+ *
+ * Extract PA0 through PA7 and validate that each one encodes a
+ * valid memory type.
+ */
+ for (i = 0; i < 8; i++) {
+ pa = (val >> (i * 8)) & 0xff;
+ if (pa == 2 || pa == 3 || pa >= 8)
+ return (false);
+ }
+ return (true);
+}
+
void
vmx_msr_init(void)
{
@@ -302,6 +321,10 @@ vmx_msr_init(void)
void
vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
{
+ uint64_t *guest_msrs;
+
+ guest_msrs = vmx->guest_msrs[vcpuid];
+
/*
* The permissions bitmap is shared between all vcpus so initialize it
* once when initializing the vBSP.
@@ -313,6 +336,19 @@ vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
guest_msr_rw(vmx, MSR_SF_MASK);
guest_msr_rw(vmx, MSR_KGSBASE);
}
+
+ /*
+ * Initialize guest IA32_PAT MSR with default value after reset.
+ */
+ guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+
return;
}
@@ -353,7 +389,11 @@ vmx_msr_guest_exit(struct vmx *vmx, int vcpuid)
int
vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
{
- int error = 0;
+ const uint64_t *guest_msrs;
+ int error;
+
+ guest_msrs = vmx->guest_msrs[vcpuid];
+ error = 0;
switch (num) {
case MSR_IA32_MISC_ENABLE:
@@ -366,6 +406,9 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
case MSR_TURBO_RATIO_LIMIT1:
*val = turbo_ratio_limit;
break;
+ case MSR_PAT:
+ *val = guest_msrs[IDX_MSR_PAT];
+ break;
default:
error = EINVAL;
break;
@@ -376,10 +419,13 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
int
vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
{
+ uint64_t *guest_msrs;
uint64_t changed;
int error;
+ guest_msrs = vmx->guest_msrs[vcpuid];
error = 0;
+
switch (num) {
case MSR_IA32_MISC_ENABLE:
changed = val ^ misc_enable;
@@ -401,6 +447,12 @@ vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
error = EINVAL;
break;
+ case MSR_PAT:
+ if (pat_valid(val))
+ guest_msrs[IDX_MSR_PAT] = val;
+ else
+ vm_inject_gp(vmx->vm, vcpuid);
+ break;
default:
error = EINVAL;
break;
diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c
index 46e5ca7..a4c96cd 100644
--- a/sys/amd64/vmm/io/vhpet.c
+++ b/sys/amd64/vmm/io/vhpet.c
@@ -104,7 +104,6 @@ vhpet_capabilities(void)
uint64_t cap = 0;
cap |= 0x8086 << 16; /* vendor id */
- cap |= HPET_CAP_LEG_RT; /* legacy routing capable */
cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */
cap |= 1; /* revision */
cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */
@@ -127,15 +126,6 @@ vhpet_timer_msi_enabled(struct vhpet *vhpet, int n)
{
const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN;
- /*
- * LegacyReplacement Route configuration takes precedence over MSI
- * for timers 0 and 1.
- */
- if (n == 0 || n == 1) {
- if (vhpet->config & HPET_CNF_LEG_RT)
- return (false);
- }
-
if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable)
return (true);
else
@@ -152,41 +142,9 @@ vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n)
if (vhpet_timer_msi_enabled(vhpet, n))
return (0);
- if (vhpet->config & HPET_CNF_LEG_RT) {
- /*
- * In "legacy routing" timers 0 and 1 are connected to
- * ioapic pins 2 and 8 respectively.
- */
- switch (n) {
- case 0:
- return (2);
- case 1:
- return (8);
- }
- }
-
return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9);
}
-static __inline int
-vhpet_timer_atpic_pin(struct vhpet *vhpet, int n)
-{
- if (vhpet->config & HPET_CNF_LEG_RT) {
- /*
- * In "legacy routing" timers 0 and 1 are connected to
- * 8259 master pin 0 and slave pin 0 respectively.
- */
- switch (n) {
- case 0:
- return (0);
- case 1:
- return (8);
- }
- }
-
- return (-1);
-}
-
static uint32_t
vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr)
{
@@ -216,17 +174,12 @@ vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr)
static void
vhpet_timer_clear_isr(struct vhpet *vhpet, int n)
{
- int pin, legacy_pin;
+ int pin;
if (vhpet->isr & (1 << n)) {
pin = vhpet_timer_ioapic_pin(vhpet, n);
KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n));
vioapic_deassert_irq(vhpet->vm, pin);
-
- legacy_pin = vhpet_timer_atpic_pin(vhpet, n);
- if (legacy_pin != -1)
- vatpic_deassert_irq(vhpet->vm, legacy_pin);
-
vhpet->isr &= ~(1 << n);
}
}
@@ -252,12 +205,6 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n)
KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: "
"timer %d is using MSI", n));
- /* The legacy replacement interrupts are always edge triggered */
- if (vhpet->config & HPET_CNF_LEG_RT) {
- if (n == 0 || n == 1)
- return (true);
- }
-
if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0)
return (true);
else
@@ -267,7 +214,7 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n)
static void
vhpet_timer_interrupt(struct vhpet *vhpet, int n)
{
- int pin, legacy_pin;
+ int pin;
/* If interrupts are not enabled for this timer then just return. */
if (!vhpet_timer_interrupt_enabled(vhpet, n))
@@ -293,17 +240,11 @@ vhpet_timer_interrupt(struct vhpet *vhpet, int n)
return;
}
- legacy_pin = vhpet_timer_atpic_pin(vhpet, n);
-
if (vhpet_timer_edge_trig(vhpet, n)) {
vioapic_pulse_irq(vhpet->vm, pin);
- if (legacy_pin != -1)
- vatpic_pulse_irq(vhpet->vm, legacy_pin);
} else {
vhpet->isr |= 1 << n;
vioapic_assert_irq(vhpet->vm, pin);
- if (legacy_pin != -1)
- vatpic_assert_irq(vhpet->vm, legacy_pin);
}
}
@@ -579,6 +520,13 @@ vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size,
counter = vhpet_counter(vhpet, nowptr);
oldval = vhpet->config;
update_register(&vhpet->config, data, mask);
+
+ /*
+ * LegacyReplacement Routing is not supported so clear the
+ * bit explicitly.
+ */
+ vhpet->config &= ~HPET_CNF_LEG_RT;
+
if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) {
if (vhpet_counter_enabled(vhpet)) {
vhpet_start_counting(vhpet);
diff --git a/sys/amd64/vmm/io/vrtc.c b/sys/amd64/vmm/io/vrtc.c
new file mode 100644
index 0000000..d5e93dc
--- /dev/null
+++ b/sys/amd64/vmm/io/vrtc.c
@@ -0,0 +1,952 @@
+/*-
+ * Copyright (c) 2014, Neel Natu (neel@freebsd.org)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/queue.h>
+#include <sys/cpuset.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/clock.h>
+#include <sys/sysctl.h>
+
+#include <machine/vmm.h>
+
+#include <isa/rtc.h>
+
+#include "vmm_ktr.h"
+#include "vatpic.h"
+#include "vioapic.h"
+#include "vrtc.h"
+
+/* Register layout of the RTC */
+struct rtcdev {
+ uint8_t sec;
+ uint8_t alarm_sec;
+ uint8_t min;
+ uint8_t alarm_min;
+ uint8_t hour;
+ uint8_t alarm_hour;
+ uint8_t day_of_week;
+ uint8_t day_of_month;
+ uint8_t month;
+ uint8_t year;
+ uint8_t reg_a;
+ uint8_t reg_b;
+ uint8_t reg_c;
+ uint8_t reg_d;
+ uint8_t nvram[128 - 14];
+} __packed;
+CTASSERT(sizeof(struct rtcdev) == 128);
+
+struct vrtc {
+ struct vm *vm;
+ struct mtx mtx;
+ struct callout callout;
+ u_int addr; /* RTC register to read or write */
+ sbintime_t base_uptime;
+ time_t base_rtctime;
+ struct rtcdev rtcdev;
+};
+
+#define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx))
+#define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx))
+#define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx))
+
+/*
+ * RTC time is considered "broken" if:
+ * - RTC updates are halted by the guest
+ * - RTC date/time fields have invalid values
+ */
+#define VRTC_BROKEN_TIME ((time_t)-1)
+
+#define RTC_IRQ 8
+#define RTCSB_BIN 0x04
+#define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR)
+#define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0)
+#define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0)
+#define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0)
+#define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0)
+
+static void vrtc_callout_handler(void *arg);
+static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval);
+
+static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc");
+
+SYSCTL_DECL(_hw_vmm);
+SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW, NULL, NULL);
+
+static int rtc_flag_broken_time = 1;
+SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN,
+ &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected");
+
+static __inline bool
+divider_enabled(int reg_a)
+{
+ /*
+ * The RTC is counting only when dividers are not held in reset.
+ */
+ return ((reg_a & 0x70) == 0x20);
+}
+
+static __inline bool
+update_enabled(struct vrtc *vrtc)
+{
+ /*
+ * RTC date/time can be updated only if:
+ * - divider is not held in reset
+ * - guest has not disabled updates
+ * - the date/time fields have valid contents
+ */
+ if (!divider_enabled(vrtc->rtcdev.reg_a))
+ return (false);
+
+ if (rtc_halted(vrtc))
+ return (false);
+
+ if (vrtc->base_rtctime == VRTC_BROKEN_TIME)
+ return (false);
+
+ return (true);
+}
+
+static time_t
+vrtc_curtime(struct vrtc *vrtc)
+{
+ sbintime_t now, delta;
+ time_t t;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ t = vrtc->base_rtctime;
+ if (update_enabled(vrtc)) {
+ now = sbinuptime();
+ delta = now - vrtc->base_uptime;
+ KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: "
+ "%#lx to %#lx", vrtc->base_uptime, now));
+ t += delta / SBT_1S;
+ }
+ return (t);
+}
+
+static __inline uint8_t
+rtcset(struct rtcdev *rtc, int val)
+{
+
+ KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d",
+ __func__, val));
+
+ return ((rtc->reg_b & RTCSB_BIN) ? val : bin2bcd_data[val]);
+}
+
+static void
+secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update)
+{
+ struct clocktime ct;
+ struct timespec ts;
+ struct rtcdev *rtc;
+ int hour;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ if (rtctime < 0) {
+ KASSERT(rtctime == VRTC_BROKEN_TIME,
+ ("%s: invalid vrtc time %#lx", __func__, rtctime));
+ return;
+ }
+
+ /*
+ * If the RTC is halted then the guest has "ownership" of the
+ * date/time fields. Don't update the RTC date/time fields in
+ * this case (unless forced).
+ */
+ if (rtc_halted(vrtc) && !force_update)
+ return;
+
+ ts.tv_sec = rtctime;
+ ts.tv_nsec = 0;
+ clock_ts_to_ct(&ts, &ct);
+
+ KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d",
+ ct.sec));
+ KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d",
+ ct.min));
+ KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d",
+ ct.hour));
+ KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d",
+ ct.dow));
+ KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d",
+ ct.day));
+ KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d",
+ ct.mon));
+ KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d",
+ ct.year));
+
+ rtc = &vrtc->rtcdev;
+ rtc->sec = rtcset(rtc, ct.sec);
+ rtc->min = rtcset(rtc, ct.min);
+
+ hour = ct.hour;
+ if ((rtc->reg_b & RTCSB_24HR) == 0)
+ hour = (hour % 12) + 1; /* convert to a 12-hour format */
+
+ rtc->hour = rtcset(rtc, hour);
+
+ if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12)
+ rtc->hour |= 0x80; /* set MSB to indicate PM */
+
+ rtc->day_of_week = rtcset(rtc, ct.dow + 1);
+ rtc->day_of_month = rtcset(rtc, ct.day);
+ rtc->month = rtcset(rtc, ct.mon);
+ rtc->year = rtcset(rtc, ct.year % 100);
+}
+
+static int
+rtcget(struct rtcdev *rtc, int val, int *retval)
+{
+ uint8_t upper, lower;
+
+ if (rtc->reg_b & RTCSB_BIN) {
+ *retval = val;
+ return (0);
+ }
+
+ lower = val & 0xf;
+ upper = (val >> 4) & 0xf;
+
+ if (lower > 9 || upper > 9)
+ return (-1);
+
+ *retval = upper * 10 + lower;
+ return (0);
+}
+
+static time_t
+rtc_to_secs(struct vrtc *vrtc)
+{
+ struct clocktime ct;
+ struct timespec ts;
+ struct rtcdev *rtc;
+ struct vm *vm;
+ int error, hour, pm, year;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ vm = vrtc->vm;
+ rtc = &vrtc->rtcdev;
+
+ bzero(&ct, sizeof(struct clocktime));
+
+ error = rtcget(rtc, rtc->sec, &ct.sec);
+ if (error || ct.sec < 0 || ct.sec > 59) {
+ VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec);
+ goto fail;
+ }
+
+ error = rtcget(rtc, rtc->min, &ct.min);
+ if (error || ct.min < 0 || ct.min > 59) {
+ VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min);
+ goto fail;
+ }
+
+ pm = 0;
+ hour = rtc->hour;
+ if ((rtc->reg_b & RTCSB_24HR) == 0) {
+ if (hour & 0x80) {
+ hour &= ~0x80;
+ pm = 1;
+ }
+ }
+ error = rtcget(rtc, hour, &ct.hour);
+ if ((rtc->reg_b & RTCSB_24HR) == 0) {
+ ct.hour -= 1;
+ if (pm)
+ ct.hour += 12;
+ }
+
+ if (error || ct.hour < 0 || ct.hour > 23) {
+ VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour);
+ goto fail;
+ }
+
+ /*
+ * Ignore 'rtc->dow' because some guests like Linux don't bother
+ * setting it at all while others like OpenBSD/i386 set it incorrectly.
+ *
+ * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it.
+ */
+ ct.dow = -1;
+
+ error = rtcget(rtc, rtc->day_of_month, &ct.day);
+ if (error || ct.day < 1 || ct.day > 31) {
+ VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month,
+ ct.day);
+ goto fail;
+ }
+
+ error = rtcget(rtc, rtc->month, &ct.mon);
+ if (error || ct.mon < 1 || ct.mon > 12) {
+ VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon);
+ goto fail;
+ }
+
+ error = rtcget(rtc, rtc->year, &year);
+ if (error || year < 0 || year > 99) {
+ VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year);
+ goto fail;
+ }
+ if (year >= 70)
+ ct.year = 1900 + year;
+ else
+ ct.year = 2000 + year;
+
+ error = clock_ct_to_ts(&ct, &ts);
+ if (error || ts.tv_sec < 0) {
+ VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d",
+ ct.year, ct.mon, ct.day);
+ VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d",
+ ct.hour, ct.min, ct.sec);
+ goto fail;
+ }
+ return (ts.tv_sec); /* success */
+fail:
+ return (VRTC_BROKEN_TIME); /* failure */
+}
+
+static int
+vrtc_time_update(struct vrtc *vrtc, time_t newtime)
+{
+ struct rtcdev *rtc;
+ time_t oldtime;
+ uint8_t alarm_sec, alarm_min, alarm_hour;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ rtc = &vrtc->rtcdev;
+ alarm_sec = rtc->alarm_sec;
+ alarm_min = rtc->alarm_min;
+ alarm_hour = rtc->alarm_hour;
+
+ oldtime = vrtc->base_rtctime;
+ VM_CTR2(vrtc->vm, "Updating RTC time from %#lx to %#lx",
+ oldtime, newtime);
+
+ if (newtime == oldtime)
+ return (0);
+
+ /*
+ * If 'newtime' indicates that RTC updates are disabled then just
+ * record that and return. There is no need to do alarm interrupt
+ * processing or update 'base_uptime' in this case.
+ */
+ if (newtime == VRTC_BROKEN_TIME) {
+ vrtc->base_rtctime = VRTC_BROKEN_TIME;
+ return (0);
+ }
+
+ /*
+ * Return an error if RTC updates are halted by the guest.
+ */
+ if (rtc_halted(vrtc)) {
+ VM_CTR0(vrtc->vm, "RTC update halted by guest");
+ return (EBUSY);
+ }
+
+ do {
+ /*
+ * If the alarm interrupt is enabled and 'oldtime' is valid
+ * then visit all the seconds between 'oldtime' and 'newtime'
+ * to check for the alarm condition.
+ *
+ * Otherwise move the RTC time forward directly to 'newtime'.
+ */
+ if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME)
+ vrtc->base_rtctime++;
+ else
+ vrtc->base_rtctime = newtime;
+
+ if (aintr_enabled(vrtc)) {
+ /*
+ * Update the RTC date/time fields before checking
+ * if the alarm conditions are satisfied.
+ */
+ secs_to_rtc(vrtc->base_rtctime, vrtc, 0);
+
+ if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) &&
+ (alarm_min >= 0xC0 || alarm_min == rtc->min) &&
+ (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) {
+ vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM);
+ }
+ }
+ } while (vrtc->base_rtctime != newtime);
+
+ if (uintr_enabled(vrtc))
+ vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE);
+
+ vrtc->base_uptime = sbinuptime();
+
+ return (0);
+}
+
+static sbintime_t
+vrtc_freq(struct vrtc *vrtc)
+{
+ int ratesel;
+
+ static sbintime_t pf[16] = {
+ 0,
+ SBT_1S / 256,
+ SBT_1S / 128,
+ SBT_1S / 8192,
+ SBT_1S / 4096,
+ SBT_1S / 2048,
+ SBT_1S / 1024,
+ SBT_1S / 512,
+ SBT_1S / 256,
+ SBT_1S / 128,
+ SBT_1S / 64,
+ SBT_1S / 32,
+ SBT_1S / 16,
+ SBT_1S / 8,
+ SBT_1S / 4,
+ SBT_1S / 2,
+ };
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ /*
+ * If both periodic and alarm interrupts are enabled then use the
+ * periodic frequency to drive the callout. The minimum periodic
+ * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so
+ * piggyback the alarm on top of it. The same argument applies to
+ * the update interrupt.
+ */
+ if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) {
+ ratesel = vrtc->rtcdev.reg_a & 0xf;
+ return (pf[ratesel]);
+ } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) {
+ return (SBT_1S);
+ } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) {
+ return (SBT_1S);
+ } else {
+ return (0);
+ }
+}
+
+static void
+vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt)
+{
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ if (freqsbt == 0) {
+ if (callout_active(&vrtc->callout)) {
+ VM_CTR0(vrtc->vm, "RTC callout stopped");
+ callout_stop(&vrtc->callout);
+ }
+ return;
+ }
+ VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt);
+ callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler,
+ vrtc, 0);
+}
+
+static void
+vrtc_callout_handler(void *arg)
+{
+ struct vrtc *vrtc = arg;
+ sbintime_t freqsbt;
+ time_t rtctime;
+ int error;
+
+ VM_CTR0(vrtc->vm, "vrtc callout fired");
+
+ VRTC_LOCK(vrtc);
+ if (callout_pending(&vrtc->callout)) /* callout was reset */
+ goto done;
+
+ if (!callout_active(&vrtc->callout)) /* callout was stopped */
+ goto done;
+
+ callout_deactivate(&vrtc->callout);
+
+ KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0,
+ ("gratuitous vrtc callout"));
+
+ if (pintr_enabled(vrtc))
+ vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD);
+
+ if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) {
+ rtctime = vrtc_curtime(vrtc);
+ error = vrtc_time_update(vrtc, rtctime);
+ KASSERT(error == 0, ("%s: vrtc_time_update error %d",
+ __func__, error));
+ }
+
+ freqsbt = vrtc_freq(vrtc);
+ KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__));
+ vrtc_callout_reset(vrtc, freqsbt);
+done:
+ VRTC_UNLOCK(vrtc);
+}
+
+static __inline void
+vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq)
+{
+ int active;
+
+ active = callout_active(&vrtc->callout) ? 1 : 0;
+ KASSERT((freq == 0 && !active) || (freq != 0 && active),
+ ("vrtc callout %s with frequency %#lx",
+ active ? "active" : "inactive", freq));
+}
+
+static void
+vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval)
+{
+ struct rtcdev *rtc;
+ int oldirqf, newirqf;
+ uint8_t oldval, changed;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ rtc = &vrtc->rtcdev;
+ newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE;
+
+ oldirqf = rtc->reg_c & RTCIR_INT;
+ if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) ||
+ (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) ||
+ (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) {
+ newirqf = RTCIR_INT;
+ } else {
+ newirqf = 0;
+ }
+
+ oldval = rtc->reg_c;
+ rtc->reg_c = newirqf | newval;
+ changed = oldval ^ rtc->reg_c;
+ if (changed) {
+ VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x",
+ oldval, rtc->reg_c);
+ }
+
+ if (!oldirqf && newirqf) {
+ VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ);
+ vatpic_pulse_irq(vrtc->vm, RTC_IRQ);
+ vioapic_pulse_irq(vrtc->vm, RTC_IRQ);
+ } else if (oldirqf && !newirqf) {
+ VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ);
+ }
+}
+
+static int
+vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval)
+{
+ struct rtcdev *rtc;
+ sbintime_t oldfreq, newfreq;
+ time_t curtime, rtctime;
+ int error;
+ uint8_t oldval, changed;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ rtc = &vrtc->rtcdev;
+ oldval = rtc->reg_b;
+ oldfreq = vrtc_freq(vrtc);
+
+ rtc->reg_b = newval;
+ changed = oldval ^ newval;
+ if (changed) {
+ VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x",
+ oldval, newval);
+ }
+
+ if (changed & RTCSB_HALT) {
+ if ((newval & RTCSB_HALT) == 0) {
+ rtctime = rtc_to_secs(vrtc);
+ if (rtctime == VRTC_BROKEN_TIME) {
+ /*
+ * Stop updating the RTC if the date/time
+ * programmed by the guest is not correct.
+ */
+ VM_CTR0(vrtc->vm, "Invalid RTC date/time "
+ "programming detected");
+
+ if (rtc_flag_broken_time)
+ return (-1);
+ }
+ } else {
+ curtime = vrtc_curtime(vrtc);
+ KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch "
+ "between vrtc basetime (%#lx) and curtime (%#lx)",
+ __func__, vrtc->base_rtctime, curtime));
+
+ /*
+ * Force a refresh of the RTC date/time fields so
+ * they reflect the time right before the guest set
+ * the HALT bit.
+ */
+ secs_to_rtc(curtime, vrtc, 1);
+
+ /*
+ * Updates are halted so mark 'base_rtctime' to denote
+ * that the RTC date/time is in flux.
+ */
+ rtctime = VRTC_BROKEN_TIME;
+ rtc->reg_b &= ~RTCSB_UINTR;
+ }
+ error = vrtc_time_update(vrtc, rtctime);
+ KASSERT(error == 0, ("vrtc_time_update error %d", error));
+ }
+
+ /*
+ * Side effect of changes to the interrupt enable bits.
+ */
+ if (changed & RTCSB_ALL_INTRS)
+ vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c);
+
+ /*
+ * Change the callout frequency if it has changed.
+ */
+ newfreq = vrtc_freq(vrtc);
+ if (newfreq != oldfreq)
+ vrtc_callout_reset(vrtc, newfreq);
+ else
+ vrtc_callout_check(vrtc, newfreq);
+
+ /*
+ * The side effect of bits that control the RTC date/time format
+ * is handled lazily when those fields are actually read.
+ */
+ return (0);
+}
+
+static void
+vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval)
+{
+ sbintime_t oldfreq, newfreq;
+ uint8_t oldval, changed;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ newval &= ~RTCSA_TUP;
+ oldval = vrtc->rtcdev.reg_a;
+ oldfreq = vrtc_freq(vrtc);
+
+ if (divider_enabled(oldval) && !divider_enabled(newval)) {
+ VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx",
+ vrtc->base_rtctime, vrtc->base_uptime);
+ } else if (!divider_enabled(oldval) && divider_enabled(newval)) {
+ /*
+ * If the dividers are coming out of reset then update
+ * 'base_uptime' before this happens. This is done to
+ * maintain the illusion that the RTC date/time was frozen
+ * while the dividers were disabled.
+ */
+ vrtc->base_uptime = sbinuptime();
+ VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx",
+ vrtc->base_rtctime, vrtc->base_uptime);
+ } else {
+ /* NOTHING */
+ }
+
+ vrtc->rtcdev.reg_a = newval;
+ changed = oldval ^ newval;
+ if (changed) {
+ VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x",
+ oldval, newval);
+ }
+
+ /*
+ * Side effect of changes to rate select and divider enable bits.
+ */
+ newfreq = vrtc_freq(vrtc);
+ if (newfreq != oldfreq)
+ vrtc_callout_reset(vrtc, newfreq);
+ else
+ vrtc_callout_check(vrtc, newfreq);
+}
+
+int
+vrtc_set_time(struct vm *vm, time_t secs)
+{
+ struct vrtc *vrtc;
+ int error;
+
+ vrtc = vm_rtc(vm);
+ VRTC_LOCK(vrtc);
+ error = vrtc_time_update(vrtc, secs);
+ VRTC_UNLOCK(vrtc);
+
+ if (error) {
+ VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error,
+ secs);
+ } else {
+ VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs);
+ }
+
+ return (error);
+}
+
+time_t
+vrtc_get_time(struct vm *vm)
+{
+ struct vrtc *vrtc;
+ time_t t;
+
+ vrtc = vm_rtc(vm);
+ VRTC_LOCK(vrtc);
+ t = vrtc_curtime(vrtc);
+ VRTC_UNLOCK(vrtc);
+
+ return (t);
+}
+
+int
+vrtc_nvram_write(struct vm *vm, int offset, uint8_t value)
+{
+ struct vrtc *vrtc;
+ uint8_t *ptr;
+
+ vrtc = vm_rtc(vm);
+
+ /*
+ * Don't allow writes to RTC control registers or the date/time fields.
+ */
+ if (offset < offsetof(struct rtcdev, nvram[0]) ||
+ offset >= sizeof(struct rtcdev)) {
+ VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d",
+ offset);
+ return (EINVAL);
+ }
+
+ VRTC_LOCK(vrtc);
+ ptr = (uint8_t *)(&vrtc->rtcdev);
+ ptr[offset] = value;
+ VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset);
+ VRTC_UNLOCK(vrtc);
+
+ return (0);
+}
+
+int
+vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval)
+{
+ struct vrtc *vrtc;
+ time_t curtime;
+ uint8_t *ptr;
+
+ /*
+ * Allow all offsets in the RTC to be read.
+ */
+ if (offset < 0 || offset >= sizeof(struct rtcdev))
+ return (EINVAL);
+
+ vrtc = vm_rtc(vm);
+ VRTC_LOCK(vrtc);
+
+ /*
+ * Update RTC date/time fields if necessary.
+ */
+ if (offset < 10) {
+ curtime = vrtc_curtime(vrtc);
+ secs_to_rtc(curtime, vrtc, 0);
+ }
+
+ ptr = (uint8_t *)(&vrtc->rtcdev);
+ *retval = ptr[offset];
+
+ VRTC_UNLOCK(vrtc);
+ return (0);
+}
+
+int
+vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *val)
+{
+ struct vrtc *vrtc;
+
+ vrtc = vm_rtc(vm);
+
+ if (bytes != 1)
+ return (-1);
+
+ if (in) {
+ *val = 0xff;
+ return (0);
+ }
+
+ VRTC_LOCK(vrtc);
+ vrtc->addr = *val & 0x7f;
+ VRTC_UNLOCK(vrtc);
+
+ return (0);
+}
+
+int
+vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *val)
+{
+ struct vrtc *vrtc;
+ struct rtcdev *rtc;
+ time_t curtime;
+ int error, offset;
+
+ vrtc = vm_rtc(vm);
+ rtc = &vrtc->rtcdev;
+
+ if (bytes != 1)
+ return (-1);
+
+ VRTC_LOCK(vrtc);
+ offset = vrtc->addr;
+ if (offset >= sizeof(struct rtcdev)) {
+ VRTC_UNLOCK(vrtc);
+ return (-1);
+ }
+
+ error = 0;
+ curtime = vrtc_curtime(vrtc);
+ vrtc_time_update(vrtc, curtime);
+
+ if (in) {
+ /*
+ * Update RTC date/time fields if necessary.
+ */
+ if (offset < 10)
+ secs_to_rtc(curtime, vrtc, 0);
+
+ if (offset == 12) {
+ /*
+ * XXX
+ * reg_c interrupt flags are updated only if the
+ * corresponding interrupt enable bit in reg_b is set.
+ */
+ *val = vrtc->rtcdev.reg_c;
+ vrtc_set_reg_c(vrtc, 0);
+ } else {
+ *val = *((uint8_t *)rtc + offset);
+ }
+ VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x",
+ *val, offset);
+ } else {
+ switch (offset) {
+ case 10:
+ VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val);
+ vrtc_set_reg_a(vrtc, *val);
+ break;
+ case 11:
+ VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val);
+ error = vrtc_set_reg_b(vrtc, *val);
+ break;
+ case 12:
+ VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)",
+ *val);
+ break;
+ case 13:
+ VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)",
+ *val);
+ break;
+ case 0:
+ /*
+ * High order bit of 'seconds' is readonly.
+ */
+ *val &= 0x7f;
+ /* FALLTHRU */
+ default:
+ VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x",
+ offset, *val);
+ *((uint8_t *)rtc + offset) = *val;
+ break;
+ }
+ }
+ VRTC_UNLOCK(vrtc);
+ return (error);
+}
+
+void
+vrtc_reset(struct vrtc *vrtc)
+{
+ struct rtcdev *rtc;
+
+ VRTC_LOCK(vrtc);
+
+ rtc = &vrtc->rtcdev;
+ vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE));
+ vrtc_set_reg_c(vrtc, 0);
+ KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active"));
+
+ VRTC_UNLOCK(vrtc);
+}
+
+struct vrtc *
+vrtc_init(struct vm *vm)
+{
+ struct vrtc *vrtc;
+ struct rtcdev *rtc;
+ time_t curtime;
+
+ vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO);
+ vrtc->vm = vm;
+ mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF);
+ callout_init(&vrtc->callout, 1);
+
+ /* Allow dividers to keep time but disable everything else */
+ rtc = &vrtc->rtcdev;
+ rtc->reg_a = 0x20;
+ rtc->reg_b = RTCSB_24HR;
+ rtc->reg_c = 0;
+ rtc->reg_d = RTCSD_PWR;
+
+ /* Reset the index register to a safe value. */
+ vrtc->addr = RTC_STATUSD;
+
+ /*
+ * Initialize RTC time to 00:00:00 Jan 1, 1970.
+ */
+ curtime = 0;
+
+ VRTC_LOCK(vrtc);
+ vrtc->base_rtctime = VRTC_BROKEN_TIME;
+ vrtc_time_update(vrtc, curtime);
+ secs_to_rtc(curtime, vrtc, 0);
+ VRTC_UNLOCK(vrtc);
+
+ return (vrtc);
+}
+
+void
+vrtc_cleanup(struct vrtc *vrtc)
+{
+
+ callout_drain(&vrtc->callout);
+ free(vrtc, M_VRTC);
+}
diff --git a/sys/amd64/vmm/io/vrtc.h b/sys/amd64/vmm/io/vrtc.h
new file mode 100644
index 0000000..6fbbc9c
--- /dev/null
+++ b/sys/amd64/vmm/io/vrtc.h
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2014 Neel Natu (neel@freebsd.org)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VRTC_H_
+#define _VRTC_H_
+
+#include <isa/isareg.h>
+
+struct vrtc;
+
+struct vrtc *vrtc_init(struct vm *vm);
+void vrtc_cleanup(struct vrtc *vrtc);
+void vrtc_reset(struct vrtc *vrtc);
+
+time_t vrtc_get_time(struct vm *vm);
+int vrtc_set_time(struct vm *vm, time_t secs);
+int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value);
+int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval);
+
+int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *val);
+int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *val);
+
+#endif
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 4739a86..7f90c61 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -75,6 +75,7 @@ __FBSDID("$FreeBSD$");
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
+#include "vrtc.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"
@@ -100,12 +101,15 @@ struct vcpu {
uint64_t exitintinfo; /* (i) events pending at VM exit */
int nmi_pending; /* (i) NMI pending */
int extint_pending; /* (i) INTR pending */
- struct vm_exception exception; /* (x) exception collateral */
int exception_pending; /* (i) exception pending */
+ int exc_vector; /* (x) exception collateral */
+ int exc_errcode_valid;
+ uint32_t exc_errcode;
struct savefpu *guestfpu; /* (a,i) guest fpu state */
uint64_t guest_xcr0; /* (i) guest %xcr0 register */
void *stats; /* (a,i) statistics */
struct vm_exit exitinfo; /* (x) exit reason and collateral */
+ uint64_t nextrip; /* (x) next instruction to execute */
};
#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
@@ -136,6 +140,7 @@ struct vm {
struct vatpic *vatpic; /* (i) virtual atpic */
struct vatpit *vatpit; /* (i) virtual atpit */
struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
+ struct vrtc *vrtc; /* (o) virtual RTC */
volatile cpuset_t active_cpus; /* (i) active vcpus */
int suspend; /* (i) stop VM execution */
volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
@@ -376,6 +381,8 @@ vm_init(struct vm *vm, bool create)
vm->vatpic = vatpic_init(vm);
vm->vatpit = vatpit_init(vm);
vm->vpmtmr = vpmtmr_init(vm);
+ if (create)
+ vm->vrtc = vrtc_init(vm);
CPU_ZERO(&vm->active_cpus);
@@ -438,6 +445,10 @@ vm_cleanup(struct vm *vm, bool destroy)
if (vm->iommu != NULL)
iommu_destroy_domain(vm->iommu);
+ if (destroy)
+ vrtc_cleanup(vm->vrtc);
+ else
+ vrtc_reset(vm->vrtc);
vpmtmr_cleanup(vm->vpmtmr);
vatpit_cleanup(vm->vatpit);
vhpet_cleanup(vm->vhpet);
@@ -841,16 +852,26 @@ vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
}
int
-vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
+vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
+ struct vcpu *vcpu;
+ int error;
- if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
if (reg >= VM_REG_LAST)
return (EINVAL);
- return (VMSETREG(vm->cookie, vcpu, reg, val));
+ error = VMSETREG(vm->cookie, vcpuid, reg, val);
+ if (error || reg != VM_REG_GUEST_RIP)
+ return (error);
+
+ /* Set 'nextrip' to match the value of %rip */
+ VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
+ vcpu = &vm->vcpu[vcpuid];
+ vcpu->nextrip = val;
+ return (0);
}
static boolean_t
@@ -1102,7 +1123,7 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
struct vcpu *vcpu;
const char *wmesg;
- int error, t, vcpu_halted, vm_halted;
+ int t, vcpu_halted, vm_halted;
KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
@@ -1110,22 +1131,6 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
vcpu_halted = 0;
vm_halted = 0;
- /*
- * The typical way to halt a cpu is to execute: "sti; hlt"
- *
- * STI sets RFLAGS.IF to enable interrupts. However, the processor
- * remains in an "interrupt shadow" for an additional instruction
- * following the STI. This guarantees that "sti; hlt" sequence is
- * atomic and a pending interrupt will be recognized after the HLT.
- *
- * After the HLT emulation is done the vcpu is no longer in an
- * interrupt shadow and a pending interrupt can be injected on
- * the next entry into the guest.
- */
- error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
- KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
- __func__, error));
-
vcpu_lock(vcpu);
while (1) {
/*
@@ -1206,6 +1211,9 @@ vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
+ KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
+ __func__, vme->inst_length));
+
ftype = vme->u.paging.fault_type;
KASSERT(ftype == VM_PROT_READ ||
ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
@@ -1231,9 +1239,6 @@ vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
if (rv != KERN_SUCCESS)
return (EFAULT);
done:
- /* restart execution at the faulting instruction */
- vme->inst_length = 0;
-
return (0);
}
@@ -1288,10 +1293,13 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
return (EFAULT);
/*
- * If the instruction length is not specified the update it now.
+ * If the instruction length was not specified then update it now
+ * along with 'nextrip'.
*/
- if (vme->inst_length == 0)
+ if (vme->inst_length == 0) {
vme->inst_length = vie->num_processed;
+ vcpu->nextrip += vie->num_processed;
+ }
/* return to userland unless this is an in-kernel emulated device */
if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
@@ -1440,7 +1448,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
int error, vcpuid;
struct vcpu *vcpu;
struct pcb *pcb;
- uint64_t tscval, rip;
+ uint64_t tscval;
struct vm_exit *vme;
bool retu, intr_disabled;
pmap_t pmap;
@@ -1462,7 +1470,6 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
- rip = vmrun->rip;
restart:
critical_enter();
@@ -1477,7 +1484,7 @@ restart:
restore_guest_fpustate(vcpu);
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
- error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
+ error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, rptr, sptr);
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
@@ -1488,6 +1495,7 @@ restart:
if (error == 0) {
retu = false;
+ vcpu->nextrip = vme->rip + vme->inst_length;
switch (vme->exitcode) {
case VM_EXITCODE_SUSPENDED:
error = vm_handle_suspend(vm, vcpuid, &retu);
@@ -1524,10 +1532,8 @@ restart:
}
}
- if (error == 0 && retu == false) {
- rip = vme->rip + vme->inst_length;
+ if (error == 0 && retu == false)
goto restart;
- }
/* copy the exit information */
bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
@@ -1535,6 +1541,49 @@ restart:
}
int
+vm_restart_instruction(void *arg, int vcpuid)
+{
+ struct vm *vm;
+ struct vcpu *vcpu;
+ enum vcpu_state state;
+ uint64_t rip;
+ int error;
+
+ vm = arg;
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ state = vcpu_get_state(vm, vcpuid, NULL);
+ if (state == VCPU_RUNNING) {
+ /*
+ * When a vcpu is "running" the next instruction is determined
+ * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
+ * Thus setting 'inst_length' to zero will cause the current
+ * instruction to be restarted.
+ */
+ vcpu->exitinfo.inst_length = 0;
+ VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
+ "setting inst_length to zero", vcpu->exitinfo.rip);
+ } else if (state == VCPU_FROZEN) {
+ /*
+ * When a vcpu is "frozen" it is outside the critical section
+ * around VMRUN() and 'nextrip' points to the next instruction.
+ * Thus instruction restart is achieved by setting 'nextrip'
+ * to the vcpu's %rip.
+ */
+ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
+ KASSERT(!error, ("%s: error %d getting rip", __func__, error));
+ VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
+ "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
+ vcpu->nextrip = rip;
+ } else {
+ panic("%s: invalid state %d", __func__, state);
+ }
+ return (0);
+}
+
+int
vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
{
struct vcpu *vcpu;
@@ -1664,11 +1713,11 @@ vcpu_exception_intinfo(struct vcpu *vcpu)
uint64_t info = 0;
if (vcpu->exception_pending) {
- info = vcpu->exception.vector & 0xff;
+ info = vcpu->exc_vector & 0xff;
info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
- if (vcpu->exception.error_code_valid) {
+ if (vcpu->exc_errcode_valid) {
info |= VM_INTINFO_DEL_ERRCODE;
- info |= (uint64_t)vcpu->exception.error_code << 32;
+ info |= (uint64_t)vcpu->exc_errcode << 32;
}
}
return (info);
@@ -1693,7 +1742,7 @@ vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
info2 = vcpu_exception_intinfo(vcpu);
vcpu->exception_pending = 0;
VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
- vcpu->exception.vector, info2);
+ vcpu->exc_vector, info2);
}
if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
@@ -1731,14 +1780,16 @@ vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
}
int
-vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
+vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
+ uint32_t errcode, int restart_instruction)
{
struct vcpu *vcpu;
+ int error;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
- if (exception->vector < 0 || exception->vector >= 32)
+ if (vector < 0 || vector >= 32)
return (EINVAL);
/*
@@ -1746,21 +1797,35 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
* the guest. It is a derived exception that results from specific
* combinations of nested faults.
*/
- if (exception->vector == IDT_DF)
+ if (vector == IDT_DF)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
if (vcpu->exception_pending) {
VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
- "pending exception %d", exception->vector,
- vcpu->exception.vector);
+ "pending exception %d", vector, vcpu->exc_vector);
return (EBUSY);
}
+ /*
+ * From section 26.6.1 "Interruptibility State" in Intel SDM:
+ *
+ * Event blocking by "STI" or "MOV SS" is cleared after guest executes
+ * one instruction or incurs an exception.
+ */
+ error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
+ KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
+ __func__, error));
+
+ if (restart_instruction)
+ vm_restart_instruction(vm, vcpuid);
+
vcpu->exception_pending = 1;
- vcpu->exception = *exception;
- VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
+ vcpu->exc_vector = vector;
+ vcpu->exc_errcode = errcode;
+ vcpu->exc_errcode_valid = errcode_valid;
+ VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
return (0);
}
@@ -1768,28 +1833,15 @@ void
vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
int errcode)
{
- struct vm_exception exception;
- struct vm_exit *vmexit;
struct vm *vm;
- int error;
+ int error, restart_instruction;
vm = vmarg;
+ restart_instruction = 1;
- exception.vector = vector;
- exception.error_code = errcode;
- exception.error_code_valid = errcode_valid;
- error = vm_inject_exception(vm, vcpuid, &exception);
+ error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
+ errcode, restart_instruction);
KASSERT(error == 0, ("vm_inject_exception error %d", error));
-
- /*
- * A fault-like exception allows the instruction to be restarted
- * after the exception handler returns.
- *
- * By setting the inst_length to 0 we ensure that the instruction
- * pointer remains at the faulting instruction.
- */
- vmexit = vm_exitinfo(vm, vcpuid);
- vmexit->inst_length = 0;
}
void
@@ -2223,6 +2275,13 @@ vm_pmtmr(struct vm *vm)
return (vm->vpmtmr);
}
+struct vrtc *
+vm_rtc(struct vm *vm)
+{
+
+ return (vm->vrtc);
+}
+
enum vm_reg_name
vm_segment_name(int seg)
{
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index a85109e..0293d191 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
+#include "io/vrtc.h"
struct vmmdev_softc {
struct vm *vm; /* vm instance cookie */
@@ -174,6 +175,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_activate_cpu *vac;
struct vm_cpuset *vm_cpuset;
struct vm_intinfo *vmii;
+ struct vm_rtc_time *rtctime;
+ struct vm_rtc_data *rtcdata;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
@@ -202,6 +205,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_ACTIVATE_CPU:
case VM_SET_INTINFO:
case VM_GET_INTINFO:
+ case VM_RESTART_INSTRUCTION:
/*
* XXX fragile, handle with care
* Assumes that the first field of the ioctl data is the vcpu.
@@ -307,7 +311,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
break;
case VM_INJECT_EXCEPTION:
vmexc = (struct vm_exception *)data;
- error = vm_inject_exception(sc->vm, vmexc->cpuid, vmexc);
+ error = vm_inject_exception(sc->vm, vmexc->cpuid,
+ vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
+ vmexc->restart_instruction);
break;
case VM_INJECT_NMI:
vmnmi = (struct vm_nmi *)data;
@@ -482,6 +488,28 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
&vmii->info2);
break;
+ case VM_RTC_WRITE:
+ rtcdata = (struct vm_rtc_data *)data;
+ error = vrtc_nvram_write(sc->vm, rtcdata->offset,
+ rtcdata->value);
+ break;
+ case VM_RTC_READ:
+ rtcdata = (struct vm_rtc_data *)data;
+ error = vrtc_nvram_read(sc->vm, rtcdata->offset,
+ &rtcdata->value);
+ break;
+ case VM_RTC_SETTIME:
+ rtctime = (struct vm_rtc_time *)data;
+ error = vrtc_set_time(sc->vm, rtctime->secs);
+ break;
+ case VM_RTC_GETTIME:
+ error = 0;
+ rtctime = (struct vm_rtc_time *)data;
+ rtctime->secs = vrtc_get_time(sc->vm);
+ break;
+ case VM_RESTART_INSTRUCTION:
+ error = vm_restart_instruction(sc->vm, vcpu);
+ break;
default:
error = ENOTTY;
break;
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index d1d7173..3db890e 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -70,6 +70,7 @@ enum {
VIE_OP_TYPE_PUSH,
VIE_OP_TYPE_CMP,
VIE_OP_TYPE_POP,
+ VIE_OP_TYPE_MOVS,
VIE_OP_TYPE_LAST
};
@@ -78,6 +79,7 @@ enum {
#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */
#define VIE_OP_F_NO_MODRM (1 << 3)
+#define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
static const struct vie_op two_byte_opcodes[256] = {
[0xB6] = {
@@ -133,6 +135,16 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_type = VIE_OP_TYPE_MOV,
.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
},
+ [0xA4] = {
+ .op_byte = 0xA4,
+ .op_type = VIE_OP_TYPE_MOVS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+ },
+ [0xA5] = {
+ .op_byte = 0xA5,
+ .op_type = VIE_OP_TYPE_MOVS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+ },
[0xC6] = {
/* XXX Group 11 extended opcode - not just MOV */
.op_byte = 0xC6,
@@ -559,6 +571,217 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (error);
}
+/*
+ * Helper function to calculate and validate a linear address.
+ *
+ * Returns 0 on success and 1 if an exception was injected into the guest.
+ */
+static int
+get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
+ int opsize, int addrsize, int prot, enum vm_reg_name seg,
+ enum vm_reg_name gpr, uint64_t *gla)
+{
+ struct seg_desc desc;
+ uint64_t cr0, val, rflags;
+ int error;
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
+ KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+ error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
+ KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
+ __func__, error, seg));
+
+ error = vie_read_register(vm, vcpuid, gpr, &val);
+ KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
+ error, gpr));
+
+ if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
+ addrsize, prot, gla)) {
+ if (seg == VM_REG_GUEST_SS)
+ vm_inject_ss(vm, vcpuid, 0);
+ else
+ vm_inject_gp(vm, vcpuid);
+ return (1);
+ }
+
+ if (vie_canonical_check(paging->cpu_mode, *gla)) {
+ if (seg == VM_REG_GUEST_SS)
+ vm_inject_ss(vm, vcpuid, 0);
+ else
+ vm_inject_gp(vm, vcpuid);
+ return (1);
+ }
+
+ if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
+ vm_inject_ac(vm, vcpuid, 0);
+ return (1);
+ }
+
+ return (0);
+}
+
+static int
+emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *arg)
+{
+#ifdef _KERNEL
+ struct vm_copyinfo copyinfo[2];
+#else
+ struct iovec copyinfo[2];
+#endif
+ uint64_t dstaddr, srcaddr, val;
+ uint64_t rcx, rdi, rsi, rflags;
+ int error, opsize, seg, repeat;
+
+ opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
+ val = 0;
+ error = 0;
+
+ /*
+ * XXX although the MOVS instruction is only supposed to be used with
+ * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
+ *
+ * Empirically the "repnz" prefix has identical behavior to "rep"
+ * and the zero flag does not make a difference.
+ */
+ repeat = vie->repz_present | vie->repnz_present;
+
+ if (repeat) {
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
+ KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
+
+ /*
+ * The count register is %rcx, %ecx or %cx depending on the
+ * address size of the instruction.
+ */
+ if ((rcx & vie_size2mask(vie->addrsize)) == 0)
+ return (0);
+ }
+
+ /*
+ * Source Destination Comments
+ * --------------------------------------------
+ * (1) memory memory n/a
+ * (2) memory mmio emulated
+ * (3) mmio memory emulated
+ * (4) mmio mmio not emulated
+ *
+ * At this point we don't have sufficient information to distinguish
+ * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
+ * out because it will succeed only when operating on regular memory.
+ *
+ * XXX the emulation doesn't properly handle the case where 'gpa'
+ * is straddling the boundary between the normal memory and MMIO.
+ */
+
+ seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
+ error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
+ PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr);
+ if (error)
+ goto done;
+
+ error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
+ copyinfo, nitems(copyinfo));
+ if (error == 0) {
+ /*
+ * case (2): read from system memory and write to mmio.
+ */
+ vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+ error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
+ goto done;
+ } else if (error > 0) {
+ /*
+ * Resume guest execution to handle fault.
+ */
+ goto done;
+ } else {
+ /*
+ * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
+ * if 'srcaddr' is in the mmio space.
+ */
+ }
+
+ error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
+ PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr);
+ if (error)
+ goto done;
+
+ error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
+ PROT_WRITE, copyinfo, nitems(copyinfo));
+ if (error == 0) {
+ /*
+ * case (3): read from MMIO and write to system memory.
+ *
+ * A MMIO read can have side-effects so we commit to it
+ * only after vm_copy_setup() is successful. If a page-fault
+ * needs to be injected into the guest then it will happen
+ * before the MMIO read is attempted.
+ */
+ error = memread(vm, vcpuid, gpa, &val, opsize, arg);
+ if (error)
+ goto done;
+
+ vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+ } else if (error > 0) {
+ /*
+ * Resume guest execution to handle fault.
+ */
+ goto done;
+ } else {
+ goto done;
+ }
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
+ KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
+ KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+ if (rflags & PSL_D) {
+ rsi -= opsize;
+ rdi -= opsize;
+ } else {
+ rsi += opsize;
+ rdi += opsize;
+ }
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
+ vie->addrsize);
+ KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
+ vie->addrsize);
+ KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
+
+ if (repeat) {
+ rcx = rcx - 1;
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
+ rcx, vie->addrsize);
+ KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
+
+ /*
+ * Repeat the instruction if the count register is not zero.
+ */
+ if ((rcx & vie_size2mask(vie->addrsize)) != 0)
+ vm_restart_instruction(vm, vcpuid);
+ }
+done:
+ if (error < 0)
+ return (EFAULT);
+ else
+ return (0);
+}
+
static int
emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
@@ -926,9 +1149,7 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
rsp += size;
}
-#ifdef _KERNEL
vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
-#endif
if (error == 0) {
error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
@@ -1012,6 +1233,10 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
error = emulate_movx(vm, vcpuid, gpa, vie,
memread, memwrite, memarg);
break;
+ case VIE_OP_TYPE_MOVS:
+ error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
+ memwrite, memarg);
+ break;
case VIE_OP_TYPE_AND:
error = emulate_and(vm, vcpuid, gpa, vie,
memread, memwrite, memarg);
@@ -1193,6 +1418,7 @@ vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
vie->base_register = VM_REG_LAST;
vie->index_register = VM_REG_LAST;
+ vie->segment_register = VM_REG_LAST;
if (inst_length) {
bcopy(inst_bytes, vie->inst, inst_length);
@@ -1458,6 +1684,35 @@ vie_advance(struct vie *vie)
vie->num_processed++;
}
+static bool
+segment_override(uint8_t x, int *seg)
+{
+
+ switch (x) {
+ case 0x2E:
+ *seg = VM_REG_GUEST_CS;
+ break;
+ case 0x36:
+ *seg = VM_REG_GUEST_SS;
+ break;
+ case 0x3E:
+ *seg = VM_REG_GUEST_DS;
+ break;
+ case 0x26:
+ *seg = VM_REG_GUEST_ES;
+ break;
+ case 0x64:
+ *seg = VM_REG_GUEST_FS;
+ break;
+ case 0x65:
+ *seg = VM_REG_GUEST_GS;
+ break;
+ default:
+ return (false);
+ }
+ return (true);
+}
+
static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
@@ -1471,6 +1726,12 @@ decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
vie->opsize_override = 1;
else if (x == 0x67)
vie->addrsize_override = 1;
+ else if (x == 0xF3)
+ vie->repz_present = 1;
+ else if (x == 0xF2)
+ vie->repnz_present = 1;
+ else if (segment_override(x, &vie->segment_register))
+ vie->segment_override = 1;
else
break;
@@ -1923,8 +2184,10 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
if (verify_inst_length(vie))
return (-1);
- if (verify_gla(vm, cpuid, gla, vie))
- return (-1);
+ if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
+ if (verify_gla(vm, cpuid, gla, vie))
+ return (-1);
+ }
vie->decoded = 1; /* success */
diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c
index e553599..fc68a61 100644
--- a/sys/amd64/vmm/vmm_ioport.c
+++ b/sys/amd64/vmm/vmm_ioport.c
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
#include "vatpic.h"
#include "vatpit.h"
#include "vpmtmr.h"
+#include "vrtc.h"
#include "vmm_ioport.h"
#include "vmm_ktr.h"
@@ -60,6 +61,8 @@ ioport_handler_func_t ioport_handler[MAX_IOPORTS] = {
[IO_ELCR1] = vatpic_elc_handler,
[IO_ELCR2] = vatpic_elc_handler,
[IO_PMTMR] = vpmtmr_handler,
+ [IO_RTC] = vrtc_addr_handler,
+ [IO_RTC + 1] = vrtc_data_handler,
};
#ifdef KTR
@@ -71,7 +74,7 @@ inout_instruction(struct vm_exit *vmexit)
static const char *iodesc[] = {
"outb", "outw", "outl",
"inb", "inw", "inl",
- "outsb", "outsw", "outsd"
+ "outsb", "outsw", "outsd",
"insb", "insw", "insd",
};