summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--lib/libvmmapi/vmmapi.c111
-rw-r--r--lib/libvmmapi/vmmapi.h21
-rwxr-xr-xshare/examples/bhyve/vmrun.sh16
-rw-r--r--sys/amd64/include/vmm.h14
-rw-r--r--sys/amd64/include/vmm_dev.h28
-rw-r--r--sys/amd64/vmm/amd/svm.c32
-rw-r--r--sys/amd64/vmm/amd/svm_softc.h1
-rw-r--r--sys/amd64/vmm/amd/svm_support.S12
-rw-r--r--sys/amd64/vmm/intel/vmcs.c12
-rw-r--r--sys/amd64/vmm/intel/vmx.c68
-rw-r--r--sys/amd64/vmm/intel/vmx.h2
-rw-r--r--sys/amd64/vmm/intel/vmx_msr.c54
-rw-r--r--sys/amd64/vmm/io/vhpet.c70
-rw-r--r--sys/amd64/vmm/io/vrtc.c952
-rw-r--r--sys/amd64/vmm/io/vrtc.h50
-rw-r--r--sys/amd64/vmm/vmm.c179
-rw-r--r--sys/amd64/vmm/vmm_dev.c30
-rw-r--r--sys/amd64/vmm/vmm_instruction_emul.c271
-rw-r--r--sys/amd64/vmm/vmm_ioport.c5
-rw-r--r--sys/modules/vmm/Makefile3
-rw-r--r--usr.sbin/bhyve/bhyve.84
-rw-r--r--usr.sbin/bhyve/bhyverun.c57
-rw-r--r--usr.sbin/bhyve/bhyverun.h5
-rw-r--r--usr.sbin/bhyve/inout.c20
-rw-r--r--usr.sbin/bhyve/pci_ahci.c3
-rw-r--r--usr.sbin/bhyve/rtc.c303
-rw-r--r--usr.sbin/bhyve/rtc.h2
-rw-r--r--usr.sbin/bhyve/task_switch.c16
-rw-r--r--usr.sbin/bhyve/xmsr.c9
-rw-r--r--usr.sbin/bhyvectl/bhyvectl.c89
30 files changed, 1875 insertions, 564 deletions
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 93955c7..9828876 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -368,14 +368,13 @@ vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val)
}
int
-vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit)
+vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit)
{
int error;
struct vm_run vmrun;
bzero(&vmrun, sizeof(vmrun));
vmrun.cpuid = vcpu;
- vmrun.rip = rip;
error = ioctl(ctx->fd, VM_RUN, &vmrun);
bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit));
@@ -399,36 +398,22 @@ vm_reinit(struct vmctx *ctx)
return (ioctl(ctx->fd, VM_REINIT, 0));
}
-static int
-vm_inject_exception_real(struct vmctx *ctx, int vcpu, int vector,
- int error_code, int error_code_valid)
+int
+vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid,
+ uint32_t errcode, int restart_instruction)
{
struct vm_exception exc;
- bzero(&exc, sizeof(exc));
exc.cpuid = vcpu;
exc.vector = vector;
- exc.error_code = error_code;
- exc.error_code_valid = error_code_valid;
+ exc.error_code = errcode;
+ exc.error_code_valid = errcode_valid;
+ exc.restart_instruction = restart_instruction;
return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc));
}
int
-vm_inject_exception(struct vmctx *ctx, int vcpu, int vector)
-{
-
- return (vm_inject_exception_real(ctx, vcpu, vector, 0, 0));
-}
-
-int
-vm_inject_exception2(struct vmctx *ctx, int vcpu, int vector, int errcode)
-{
-
- return (vm_inject_exception_real(ctx, vcpu, vector, errcode, 1));
-}
-
-int
vm_apicid2vcpu(struct vmctx *ctx, int apicid)
{
/*
@@ -1002,6 +987,7 @@ int
vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt)
{
+ void *va;
uint64_t gpa;
int error, fault, i, n, off;
@@ -1021,7 +1007,11 @@ vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
off = gpa & PAGE_MASK;
n = min(len, PAGE_SIZE - off);
- iov->iov_base = (void *)gpa;
+ va = vm_map_gpa(ctx, gpa, n);
+ if (va == NULL)
+ return (-1);
+
+ iov->iov_base = va;
iov->iov_len = n;
iov++;
iovcnt--;
@@ -1033,19 +1023,24 @@ vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
}
void
+vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt)
+{
+
+ return;
+}
+
+void
vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len)
{
const char *src;
char *dst;
- uint64_t gpa;
size_t n;
dst = vp;
while (len) {
assert(iov->iov_len);
- gpa = (uint64_t)iov->iov_base;
n = min(len, iov->iov_len);
- src = vm_map_gpa(ctx, gpa, n);
+ src = iov->iov_base;
bcopy(src, dst, n);
iov++;
@@ -1060,15 +1055,13 @@ vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov,
{
const char *src;
char *dst;
- uint64_t gpa;
size_t n;
src = vp;
while (len) {
assert(iov->iov_len);
- gpa = (uint64_t)iov->iov_base;
n = min(len, iov->iov_len);
- dst = vm_map_gpa(ctx, gpa, n);
+ dst = iov->iov_base;
bcopy(src, dst, n);
iov++;
@@ -1146,3 +1139,63 @@ vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1)
error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii);
return (error);
}
+
+int
+vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value)
+{
+ struct vm_rtc_data rtcdata;
+ int error;
+
+ bzero(&rtcdata, sizeof(struct vm_rtc_data));
+ rtcdata.offset = offset;
+ rtcdata.value = value;
+ error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata);
+ return (error);
+}
+
+int
+vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval)
+{
+ struct vm_rtc_data rtcdata;
+ int error;
+
+ bzero(&rtcdata, sizeof(struct vm_rtc_data));
+ rtcdata.offset = offset;
+ error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata);
+ if (error == 0)
+ *retval = rtcdata.value;
+ return (error);
+}
+
+int
+vm_rtc_settime(struct vmctx *ctx, time_t secs)
+{
+ struct vm_rtc_time rtctime;
+ int error;
+
+ bzero(&rtctime, sizeof(struct vm_rtc_time));
+ rtctime.secs = secs;
+ error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime);
+ return (error);
+}
+
+int
+vm_rtc_gettime(struct vmctx *ctx, time_t *secs)
+{
+ struct vm_rtc_time rtctime;
+ int error;
+
+ bzero(&rtctime, sizeof(struct vm_rtc_time));
+ error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime);
+ if (error == 0)
+ *secs = rtctime.secs;
+ return (error);
+}
+
+int
+vm_restart_instruction(void *arg, int vcpu)
+{
+ struct vmctx *ctx = arg;
+
+ return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu));
+}
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index fbb6ddd..06b2930 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -32,6 +32,12 @@
#include <sys/param.h>
#include <sys/cpuset.h>
+/*
+ * API version for out-of-tree consumers like grub-bhyve for making compile
+ * time decisions.
+ */
+#define VMMAPI_VERSION 0101 /* 2 digit major followed by 2 digit minor */
+
struct iovec;
struct vmctx;
enum x2apic_state;
@@ -70,13 +76,12 @@ int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg,
struct seg_desc *seg_desc);
int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
-int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
- struct vm_exit *ret_vmexit);
+int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit);
int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how);
int vm_reinit(struct vmctx *ctx);
int vm_apicid2vcpu(struct vmctx *ctx, int apicid);
-int vm_inject_exception(struct vmctx *ctx, int vcpu, int vec);
-int vm_inject_exception2(struct vmctx *ctx, int vcpu, int vec, int errcode);
+int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector,
+ int errcode_valid, uint32_t errcode, int restart_instruction);
int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector);
int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg);
@@ -132,6 +137,14 @@ void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
void *host_dst, size_t len);
void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
struct iovec *guest_iov, size_t len);
+void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov,
+ int iovcnt);
+
+/* RTC */
+int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value);
+int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval);
+int vm_rtc_settime(struct vmctx *ctx, time_t secs);
+int vm_rtc_gettime(struct vmctx *ctx, time_t *secs);
/* Reset vcpu register state */
int vcpu_reset(struct vmctx *ctx, int vcpu);
diff --git a/share/examples/bhyve/vmrun.sh b/share/examples/bhyve/vmrun.sh
index e1cbfcf..d3d5cdf 100755
--- a/share/examples/bhyve/vmrun.sh
+++ b/share/examples/bhyve/vmrun.sh
@@ -39,7 +39,13 @@ DEFAULT_CONSOLE=stdio
DEFAULT_VIRTIO_DISK="./diskdev"
DEFAULT_ISOFILE="./release.iso"
+errmsg() {
+ echo "*** $1"
+}
+
usage() {
+ local msg=$1
+
echo "Usage: vmrun.sh [-ahi] [-c <CPUs>] [-C <console>] [-d <disk file>]"
echo " [-e <name=value>] [-g <gdbport> ] [-H <directory>]"
echo " [-I <location of installation iso>] [-m <memsize>]"
@@ -58,18 +64,18 @@ usage() {
echo " -m: memory size (default is ${DEFAULT_MEMSIZE})"
echo " -t: tap device for virtio-net (default is $DEFAULT_TAPDEV)"
echo ""
- echo " This script needs to be executed with superuser privileges"
- echo ""
+ [ -n "$msg" ] && errmsg "$msg"
exit 1
}
if [ `id -u` -ne 0 ]; then
- usage
+ errmsg "This script must be executed with superuser privileges"
+ exit 1
fi
kldstat -n vmm > /dev/null 2>&1
if [ $? -ne 0 ]; then
- echo "vmm.ko is not loaded!"
+ errmsg "vmm.ko is not loaded"
exit 1
fi
@@ -143,7 +149,7 @@ fi
shift $((${OPTIND} - 1))
if [ $# -ne 1 ]; then
- usage
+ usage "virtual machine name not specified"
fi
vmname="$1"
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 8a8c3f4..cf7f5bc 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -286,9 +286,10 @@ int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
struct vatpic *vm_atpic(struct vm *vm);
struct vatpit *vm_atpit(struct vm *vm);
struct vpmtmr *vm_pmtmr(struct vm *vm);
+struct vrtc *vm_rtc(struct vm *vm);
/*
- * Inject exception 'vme' into the guest vcpu. This function returns 0 on
+ * Inject exception 'vector' into the guest vcpu. This function returns 0 on
* success and non-zero on failure.
*
* Wrapper functions like 'vm_inject_gp()' should be preferred to calling
@@ -298,7 +299,8 @@ struct vpmtmr *vm_pmtmr(struct vm *vm);
* This function should only be called in the context of the thread that is
* executing this vcpu.
*/
-int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);
+int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid,
+ uint32_t errcode, int restart_instruction);
/*
* This function is called after a VM-exit that occurred during exception or
@@ -444,8 +446,11 @@ struct vie {
rex_x:1,
rex_b:1,
rex_present:1,
+ repz_present:1, /* REP/REPE/REPZ prefix */
+ repnz_present:1, /* REPNE/REPNZ prefix */
opsize_override:1, /* Operand size override */
- addrsize_override:1; /* Address size override */
+ addrsize_override:1, /* Address size override */
+ segment_override:1; /* Segment override */
uint8_t mod:2, /* ModRM byte */
reg:4,
@@ -461,6 +466,7 @@ struct vie {
uint8_t scale;
int base_register; /* VM_REG_GUEST_xyz */
int index_register; /* VM_REG_GUEST_xyz */
+ int segment_register; /* VM_REG_GUEST_xyz */
int64_t displacement; /* optional addr displacement */
int64_t immediate; /* optional immediate operand */
@@ -627,4 +633,6 @@ vm_inject_ss(void *vm, int vcpuid, int errcode)
void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
+int vm_restart_instruction(void *vm, int vcpuid);
+
#endif /* _VMM_H_ */
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index e4d839e..9d031a9 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -54,7 +54,6 @@ struct vm_seg_desc { /* data or code segment */
struct vm_run {
int cpuid;
- uint64_t rip; /* start running here */
struct vm_exit vm_exit;
};
@@ -63,6 +62,7 @@ struct vm_exception {
int vector;
uint32_t error_code;
int error_code_valid;
+ int restart_instruction;
};
struct vm_lapic_msi {
@@ -195,6 +195,15 @@ struct vm_intinfo {
uint64_t info2;
};
+struct vm_rtc_time {
+ time_t secs;
+};
+
+struct vm_rtc_data {
+ int offset;
+ uint8_t value;
+};
+
enum {
/* general routines */
IOCNUM_ABIVERS = 0,
@@ -228,6 +237,7 @@ enum {
IOCNUM_LAPIC_MSI = 36,
IOCNUM_LAPIC_LOCAL_IRQ = 37,
IOCNUM_IOAPIC_PINCOUNT = 38,
+ IOCNUM_RESTART_INSTRUCTION = 39,
/* PCI pass-thru */
IOCNUM_BIND_PPTDEV = 40,
@@ -254,6 +264,12 @@ enum {
/* vm_cpuset */
IOCNUM_ACTIVATE_CPU = 90,
IOCNUM_GET_CPUSET = 91,
+
+ /* RTC */
+ IOCNUM_RTC_READ = 100,
+ IOCNUM_RTC_WRITE = 101,
+ IOCNUM_RTC_SETTIME = 102,
+ IOCNUM_RTC_GETTIME = 103,
};
#define VM_RUN \
@@ -336,4 +352,14 @@ enum {
_IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo)
#define VM_GET_INTINFO \
_IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo)
+#define VM_RTC_WRITE \
+ _IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data)
+#define VM_RTC_READ \
+ _IOWR('v', IOCNUM_RTC_READ, struct vm_rtc_data)
+#define VM_RTC_SETTIME \
+ _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time)
+#define VM_RTC_GETTIME \
+ _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time)
+#define VM_RESTART_INSTRUCTION \
+ _IOW('v', IOCNUM_RESTART_INSTRUCTION, int)
#endif
diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c
index ab47041..88a846d 100644
--- a/sys/amd64/vmm/amd/svm.c
+++ b/sys/amd64/vmm/amd/svm.c
@@ -80,6 +80,7 @@ SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL);
#define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */
#define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */
#define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */
+#define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */
#define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \
VMCB_CACHE_IOPM | \
@@ -554,6 +555,7 @@ svm_vminit(struct vm *vm, pmap_t pmap)
pml4_pa = svm_sc->nptp;
for (i = 0; i < VM_MAXCPU; i++) {
vcpu = svm_get_vcpu(svm_sc, i);
+ vcpu->nextrip = ~0;
vcpu->lastcpu = NOCPU;
vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
@@ -1200,7 +1202,6 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
struct vmcb_state *state;
struct vmcb_ctrl *ctrl;
struct svm_regctx *ctx;
- struct vm_exception exception;
uint64_t code, info1, info2, val;
uint32_t eax, ecx, edx;
int error, errcode_valid, handled, idtvec, reflect;
@@ -1314,6 +1315,7 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
/* fallthru */
default:
errcode_valid = 0;
+ info1 = 0;
break;
}
KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) "
@@ -1322,14 +1324,10 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
if (reflect) {
/* Reflect the exception back into the guest */
- exception.vector = idtvec;
- exception.error_code_valid = errcode_valid;
- exception.error_code = errcode_valid ? info1 : 0;
VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception "
- "%d/%#x into the guest", exception.vector,
- exception.error_code);
- error = vm_inject_exception(svm_sc->vm, vcpu,
- &exception);
+ "%d/%#x into the guest", idtvec, (int)info1);
+ error = vm_inject_exception(svm_sc->vm, vcpu, idtvec,
+ errcode_valid, info1, 0);
KASSERT(error == 0, ("%s: vm_inject_exception error %d",
__func__, error));
}
@@ -1476,15 +1474,24 @@ svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic)
{
struct vmcb_ctrl *ctrl;
struct vmcb_state *state;
+ struct svm_vcpu *vcpustate;
uint8_t v_tpr;
int vector, need_intr_window, pending_apic_vector;
state = svm_get_vmcb_state(sc, vcpu);
ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+ vcpustate = svm_get_vcpu(sc, vcpu);
need_intr_window = 0;
pending_apic_vector = 0;
+ if (vcpustate->nextrip != state->rip) {
+ ctrl->intr_shadow = 0;
+ VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking "
+ "cleared due to rip change: %#lx/%#lx",
+ vcpustate->nextrip, state->rip);
+ }
+
/*
* Inject pending events or exceptions for this vcpu.
*
@@ -1634,7 +1641,7 @@ done:
* VMRUN.
*/
v_tpr = vlapic_get_cr8(vlapic);
- KASSERT(v_tpr >= 0 && v_tpr <= 15, ("invalid v_tpr %#x", v_tpr));
+ KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr));
if (ctrl->v_tpr != v_tpr) {
VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x",
ctrl->v_tpr, v_tpr);
@@ -1801,14 +1808,14 @@ static __inline void
disable_gintr(void)
{
- __asm __volatile("clgi" : : :);
+ __asm __volatile("clgi");
}
static __inline void
enable_gintr(void)
{
- __asm __volatile("stgi" : : :);
+ __asm __volatile("stgi");
}
/*
@@ -1955,6 +1962,9 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
/* #VMEXIT disables interrupts so re-enable them here. */
enable_gintr();
+ /* Update 'nextrip' */
+ vcpustate->nextrip = state->rip;
+
/* Handle #VMEXIT and if required return to user space. */
handled = svm_vmexit(svm_sc, vcpu, vmexit);
} while (handled);
diff --git a/sys/amd64/vmm/amd/svm_softc.h b/sys/amd64/vmm/amd/svm_softc.h
index a5bb57c..de0c3f7 100644
--- a/sys/amd64/vmm/amd/svm_softc.h
+++ b/sys/amd64/vmm/amd/svm_softc.h
@@ -45,6 +45,7 @@ struct svm_vcpu {
struct vmcb vmcb; /* hardware saved vcpu context */
struct svm_regctx swctx; /* software saved vcpu context */
uint64_t vmcb_pa; /* VMCB physical address */
+ uint64_t nextrip; /* next instruction to be executed by guest */
int lastcpu; /* host cpu that the vcpu last ran on */
uint32_t dirty; /* state cache bits that must be cleared */
long eptgen; /* pmap->pm_eptgen when the vcpu last ran */
diff --git a/sys/amd64/vmm/amd/svm_support.S b/sys/amd64/vmm/amd/svm_support.S
index 72327bd..b363101 100644
--- a/sys/amd64/vmm/amd/svm_support.S
+++ b/sys/amd64/vmm/amd/svm_support.S
@@ -22,6 +22,8 @@
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
*/
#include <machine/asmacros.h>
@@ -35,6 +37,10 @@
#define VENTER push %rbp ; mov %rsp,%rbp
#define VLEAVE pop %rbp
+#define VMLOAD .byte 0x0f, 0x01, 0xda
+#define VMRUN .byte 0x0f, 0x01, 0xd8
+#define VMSAVE .byte 0x0f, 0x01, 0xdb
+
/*
* svm_launch(uint64_t vmcb, struct svm_regctx *gctx)
* %rdi: physical address of VMCB
@@ -79,9 +85,9 @@ ENTRY(svm_launch)
movq SCTX_RDI(%rsi), %rdi
movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */
- vmload %rax
- vmrun %rax
- vmsave %rax
+ VMLOAD
+ VMRUN
+ VMSAVE
pop %rax /* pop guest context pointer from the stack */
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
index ae4d9db..5962526 100644
--- a/sys/amd64/vmm/intel/vmcs.c
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -342,18 +342,6 @@ vmcs_init(struct vmcs *vmcs)
*/
VMPTRLD(vmcs);
- /* Initialize guest IA32_PAT MSR with the default value */
- pat = PAT_VALUE(0, PAT_WRITE_BACK) |
- PAT_VALUE(1, PAT_WRITE_THROUGH) |
- PAT_VALUE(2, PAT_UNCACHED) |
- PAT_VALUE(3, PAT_UNCACHEABLE) |
- PAT_VALUE(4, PAT_WRITE_BACK) |
- PAT_VALUE(5, PAT_WRITE_THROUGH) |
- PAT_VALUE(6, PAT_UNCACHED) |
- PAT_VALUE(7, PAT_UNCACHEABLE);
- if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0)
- goto done;
-
/* Host state */
/* Initialize host IA32_PAT MSR */
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index c3dd04e..b81e48b 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -100,13 +100,11 @@ __FBSDID("$FreeBSD$");
(VM_EXIT_HOST_LMA | \
VM_EXIT_SAVE_EFER | \
VM_EXIT_LOAD_EFER | \
- VM_EXIT_ACKNOWLEDGE_INTERRUPT | \
- VM_EXIT_SAVE_PAT | \
- VM_EXIT_LOAD_PAT)
+ VM_EXIT_ACKNOWLEDGE_INTERRUPT)
#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS
-#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER | VM_ENTRY_LOAD_PAT)
+#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER)
#define VM_ENTRY_CTLS_ZERO_SETTING \
(VM_ENTRY_LOAD_DEBUG_CONTROLS | \
@@ -859,10 +857,6 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
* VM exit and entry respectively. It is also restored from the
* host VMCS area on a VM exit.
*
- * MSR_PAT is saved and restored in the guest VMCS are on a VM exit
- * and entry respectively. It is also restored from the host VMCS
- * area on a VM exit.
- *
* The TSC MSR is exposed read-only. Writes are disallowed as that
* will impact the host TSC.
* XXX Writes would be implemented with a wrmsr trap, and
@@ -874,7 +868,6 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
guest_msr_rw(vmx, MSR_EFER) ||
- guest_msr_rw(vmx, MSR_PAT) ||
guest_msr_ro(vmx, MSR_TSC))
panic("vmx_vminit: error setting guest msr access");
@@ -941,6 +934,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
vmx->cap[i].proc_ctls = procbased_ctls;
vmx->cap[i].proc_ctls2 = procbased_ctls2;
+ vmx->state[i].nextrip = ~0;
vmx->state[i].lastcpu = NOCPU;
vmx->state[i].vpid = vpid[i];
@@ -1169,12 +1163,24 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu)
}
static void
-vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
+vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
+ uint64_t guestrip)
{
int vector, need_nmi_exiting, extint_pending;
uint64_t rflags, entryinfo;
uint32_t gi, info;
+ if (vmx->state[vcpu].nextrip != guestrip) {
+ gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
+ if (gi & HWINTR_BLOCKING) {
+ VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
+ "cleared due to rip change: %#lx/%#lx",
+ vmx->state[vcpu].nextrip, guestrip);
+ gi &= ~HWINTR_BLOCKING;
+ vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
+ }
+ }
+
if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
"intinfo is not valid: %#lx", __func__, entryinfo));
@@ -1771,7 +1777,7 @@ vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
{
struct vm_guest_paging *paging;
uint32_t csar;
-
+
paging = &vmexit->u.inst_emul.paging;
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
@@ -2060,12 +2066,11 @@ emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu)
static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
- int error, handled, in;
+ int error, errcode, errcode_valid, handled, in;
struct vmxctx *vmxctx;
struct vlapic *vlapic;
struct vm_inout_str *vis;
struct vm_task_switch *ts;
- struct vm_exception vmexc;
uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
uint32_t intr_type, intr_vec, reason;
uint64_t exitintinfo, qual, gpa;
@@ -2250,6 +2255,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
case EXIT_REASON_MTF:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
vmexit->exitcode = VM_EXITCODE_MTRAP;
+ vmexit->inst_length = 0;
break;
case EXIT_REASON_PAUSE:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
@@ -2376,15 +2382,15 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
/* Reflect all other exceptions back into the guest */
- bzero(&vmexc, sizeof(struct vm_exception));
- vmexc.vector = intr_vec;
+ errcode_valid = errcode = 0;
if (intr_info & VMCS_INTR_DEL_ERRCODE) {
- vmexc.error_code_valid = 1;
- vmexc.error_code = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
+ errcode_valid = 1;
+ errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
}
VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into "
- "the guest", vmexc.vector, vmexc.error_code);
- error = vm_inject_exception(vmx->vm, vcpu, &vmexc);
+ "the guest", intr_vec, errcode);
+ error = vm_inject_exception(vmx->vm, vcpu, intr_vec,
+ errcode_valid, errcode, 0);
KASSERT(error == 0, ("%s: vm_inject_exception error %d",
__func__, error));
return (1);
@@ -2399,6 +2405,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
if (vm_mem_allocated(vmx->vm, gpa) ||
apic_access_fault(vmx, vcpu, gpa)) {
vmexit->exitcode = VM_EXITCODE_PAGING;
+ vmexit->inst_length = 0;
vmexit->u.paging.gpa = gpa;
vmexit->u.paging.fault_type = ept_fault_type(qual);
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
@@ -2540,7 +2547,7 @@ vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
}
static int
-vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
+vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
void *rendezvous_cookie, void *suspend_cookie)
{
int rc, handled, launched;
@@ -2550,7 +2557,6 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
struct vmcs *vmcs;
struct vm_exit *vmexit;
struct vlapic *vlapic;
- uint64_t rip;
uint32_t exit_reason;
vmx = arg;
@@ -2578,11 +2584,13 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
*/
vmcs_write(VMCS_HOST_CR3, rcr3());
- vmcs_write(VMCS_GUEST_RIP, startrip);
+ vmcs_write(VMCS_GUEST_RIP, rip);
vmx_set_pcpu_defaults(vmx, vcpu, pmap);
do {
- handled = UNHANDLED;
+ KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch "
+ "%#lx/%#lx", __func__, vmcs_guest_rip(), rip));
+ handled = UNHANDLED;
/*
* Interrupts are disabled from this point on until the
* guest starts executing. This is done for the following
@@ -2602,7 +2610,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
* pmap_invalidate_ept().
*/
disable_intr();
- vmx_inject_interrupts(vmx, vcpu, vlapic);
+ vmx_inject_interrupts(vmx, vcpu, vlapic, rip);
/*
* Check for vcpu suspension after injecting events because
@@ -2611,20 +2619,20 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
*/
if (vcpu_suspended(suspend_cookie)) {
enable_intr();
- vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip());
+ vm_exit_suspended(vmx->vm, vcpu, rip);
break;
}
if (vcpu_rendezvous_pending(rendezvous_cookie)) {
enable_intr();
- vm_exit_rendezvous(vmx->vm, vcpu, vmcs_guest_rip());
+ vm_exit_rendezvous(vmx->vm, vcpu, rip);
break;
}
if (vcpu_should_yield(vm, vcpu)) {
enable_intr();
- vm_exit_astpending(vmx->vm, vcpu, vmcs_guest_rip());
- vmx_astpending_trace(vmx, vcpu, vmexit->rip);
+ vm_exit_astpending(vmx->vm, vcpu, rip);
+ vmx_astpending_trace(vmx, vcpu, rip);
handled = HANDLED;
break;
}
@@ -2638,6 +2646,9 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
+ /* Update 'nextrip' */
+ vmx->state[vcpu].nextrip = rip;
+
if (rc == VMX_GUEST_VMEXIT) {
vmx_exit_handle_nmi(vmx, vcpu, vmexit);
enable_intr();
@@ -2648,6 +2659,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
}
launched = 1;
vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
+ rip = vmexit->rip;
} while (handled);
/*
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
index 2124554..bc48861 100644
--- a/sys/amd64/vmm/intel/vmx.h
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -78,6 +78,7 @@ struct vmxcap {
};
struct vmxstate {
+ uint64_t nextrip; /* next instruction to be executed by guest */
int lastcpu; /* host cpu that this 'vcpu' last ran on */
uint16_t vpid;
};
@@ -102,6 +103,7 @@ enum {
IDX_MSR_STAR,
IDX_MSR_SF_MASK,
IDX_MSR_KGSBASE,
+ IDX_MSR_PAT,
GUEST_MSR_NUM /* must be the last enumeration */
};
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
index f6bbf2a..e517778 100644
--- a/sys/amd64/vmm/intel/vmx_msr.c
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -230,6 +230,25 @@ westmere_cpu(void)
return (false);
}
+static bool
+pat_valid(uint64_t val)
+{
+ int i, pa;
+
+ /*
+ * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT"
+ *
+ * Extract PA0 through PA7 and validate that each one encodes a
+ * valid memory type.
+ */
+ for (i = 0; i < 8; i++) {
+ pa = (val >> (i * 8)) & 0xff;
+ if (pa == 2 || pa == 3 || pa >= 8)
+ return (false);
+ }
+ return (true);
+}
+
void
vmx_msr_init(void)
{
@@ -302,6 +321,10 @@ vmx_msr_init(void)
void
vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
{
+ uint64_t *guest_msrs;
+
+ guest_msrs = vmx->guest_msrs[vcpuid];
+
/*
* The permissions bitmap is shared between all vcpus so initialize it
* once when initializing the vBSP.
@@ -313,6 +336,19 @@ vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
guest_msr_rw(vmx, MSR_SF_MASK);
guest_msr_rw(vmx, MSR_KGSBASE);
}
+
+ /*
+ * Initialize guest IA32_PAT MSR with default value after reset.
+ */
+ guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+
return;
}
@@ -353,7 +389,11 @@ vmx_msr_guest_exit(struct vmx *vmx, int vcpuid)
int
vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
{
- int error = 0;
+ const uint64_t *guest_msrs;
+ int error;
+
+ guest_msrs = vmx->guest_msrs[vcpuid];
+ error = 0;
switch (num) {
case MSR_IA32_MISC_ENABLE:
@@ -366,6 +406,9 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
case MSR_TURBO_RATIO_LIMIT1:
*val = turbo_ratio_limit;
break;
+ case MSR_PAT:
+ *val = guest_msrs[IDX_MSR_PAT];
+ break;
default:
error = EINVAL;
break;
@@ -376,10 +419,13 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
int
vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
{
+ uint64_t *guest_msrs;
uint64_t changed;
int error;
+ guest_msrs = vmx->guest_msrs[vcpuid];
error = 0;
+
switch (num) {
case MSR_IA32_MISC_ENABLE:
changed = val ^ misc_enable;
@@ -401,6 +447,12 @@ vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
error = EINVAL;
break;
+ case MSR_PAT:
+ if (pat_valid(val))
+ guest_msrs[IDX_MSR_PAT] = val;
+ else
+ vm_inject_gp(vmx->vm, vcpuid);
+ break;
default:
error = EINVAL;
break;
diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c
index 46e5ca7..a4c96cd 100644
--- a/sys/amd64/vmm/io/vhpet.c
+++ b/sys/amd64/vmm/io/vhpet.c
@@ -104,7 +104,6 @@ vhpet_capabilities(void)
uint64_t cap = 0;
cap |= 0x8086 << 16; /* vendor id */
- cap |= HPET_CAP_LEG_RT; /* legacy routing capable */
cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */
cap |= 1; /* revision */
cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */
@@ -127,15 +126,6 @@ vhpet_timer_msi_enabled(struct vhpet *vhpet, int n)
{
const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN;
- /*
- * LegacyReplacement Route configuration takes precedence over MSI
- * for timers 0 and 1.
- */
- if (n == 0 || n == 1) {
- if (vhpet->config & HPET_CNF_LEG_RT)
- return (false);
- }
-
if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable)
return (true);
else
@@ -152,41 +142,9 @@ vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n)
if (vhpet_timer_msi_enabled(vhpet, n))
return (0);
- if (vhpet->config & HPET_CNF_LEG_RT) {
- /*
- * In "legacy routing" timers 0 and 1 are connected to
- * ioapic pins 2 and 8 respectively.
- */
- switch (n) {
- case 0:
- return (2);
- case 1:
- return (8);
- }
- }
-
return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9);
}
-static __inline int
-vhpet_timer_atpic_pin(struct vhpet *vhpet, int n)
-{
- if (vhpet->config & HPET_CNF_LEG_RT) {
- /*
- * In "legacy routing" timers 0 and 1 are connected to
- * 8259 master pin 0 and slave pin 0 respectively.
- */
- switch (n) {
- case 0:
- return (0);
- case 1:
- return (8);
- }
- }
-
- return (-1);
-}
-
static uint32_t
vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr)
{
@@ -216,17 +174,12 @@ vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr)
static void
vhpet_timer_clear_isr(struct vhpet *vhpet, int n)
{
- int pin, legacy_pin;
+ int pin;
if (vhpet->isr & (1 << n)) {
pin = vhpet_timer_ioapic_pin(vhpet, n);
KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n));
vioapic_deassert_irq(vhpet->vm, pin);
-
- legacy_pin = vhpet_timer_atpic_pin(vhpet, n);
- if (legacy_pin != -1)
- vatpic_deassert_irq(vhpet->vm, legacy_pin);
-
vhpet->isr &= ~(1 << n);
}
}
@@ -252,12 +205,6 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n)
KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: "
"timer %d is using MSI", n));
- /* The legacy replacement interrupts are always edge triggered */
- if (vhpet->config & HPET_CNF_LEG_RT) {
- if (n == 0 || n == 1)
- return (true);
- }
-
if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0)
return (true);
else
@@ -267,7 +214,7 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n)
static void
vhpet_timer_interrupt(struct vhpet *vhpet, int n)
{
- int pin, legacy_pin;
+ int pin;
/* If interrupts are not enabled for this timer then just return. */
if (!vhpet_timer_interrupt_enabled(vhpet, n))
@@ -293,17 +240,11 @@ vhpet_timer_interrupt(struct vhpet *vhpet, int n)
return;
}
- legacy_pin = vhpet_timer_atpic_pin(vhpet, n);
-
if (vhpet_timer_edge_trig(vhpet, n)) {
vioapic_pulse_irq(vhpet->vm, pin);
- if (legacy_pin != -1)
- vatpic_pulse_irq(vhpet->vm, legacy_pin);
} else {
vhpet->isr |= 1 << n;
vioapic_assert_irq(vhpet->vm, pin);
- if (legacy_pin != -1)
- vatpic_assert_irq(vhpet->vm, legacy_pin);
}
}
@@ -579,6 +520,13 @@ vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size,
counter = vhpet_counter(vhpet, nowptr);
oldval = vhpet->config;
update_register(&vhpet->config, data, mask);
+
+ /*
+ * LegacyReplacement Routing is not supported so clear the
+ * bit explicitly.
+ */
+ vhpet->config &= ~HPET_CNF_LEG_RT;
+
if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) {
if (vhpet_counter_enabled(vhpet)) {
vhpet_start_counting(vhpet);
diff --git a/sys/amd64/vmm/io/vrtc.c b/sys/amd64/vmm/io/vrtc.c
new file mode 100644
index 0000000..d5e93dc
--- /dev/null
+++ b/sys/amd64/vmm/io/vrtc.c
@@ -0,0 +1,952 @@
+/*-
+ * Copyright (c) 2014, Neel Natu (neel@freebsd.org)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/queue.h>
+#include <sys/cpuset.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/clock.h>
+#include <sys/sysctl.h>
+
+#include <machine/vmm.h>
+
+#include <isa/rtc.h>
+
+#include "vmm_ktr.h"
+#include "vatpic.h"
+#include "vioapic.h"
+#include "vrtc.h"
+
+/* Register layout of the RTC */
+struct rtcdev {
+ uint8_t sec;
+ uint8_t alarm_sec;
+ uint8_t min;
+ uint8_t alarm_min;
+ uint8_t hour;
+ uint8_t alarm_hour;
+ uint8_t day_of_week;
+ uint8_t day_of_month;
+ uint8_t month;
+ uint8_t year;
+ uint8_t reg_a;
+ uint8_t reg_b;
+ uint8_t reg_c;
+ uint8_t reg_d;
+ uint8_t nvram[128 - 14];
+} __packed;
+CTASSERT(sizeof(struct rtcdev) == 128);
+
+struct vrtc {
+ struct vm *vm;
+ struct mtx mtx;
+ struct callout callout;
+ u_int addr; /* RTC register to read or write */
+ sbintime_t base_uptime;
+ time_t base_rtctime;
+ struct rtcdev rtcdev;
+};
+
+#define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx))
+#define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx))
+#define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx))
+
+/*
+ * RTC time is considered "broken" if:
+ * - RTC updates are halted by the guest
+ * - RTC date/time fields have invalid values
+ */
+#define VRTC_BROKEN_TIME ((time_t)-1)
+
+#define RTC_IRQ 8
+#define RTCSB_BIN 0x04
+#define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR)
+#define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0)
+#define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0)
+#define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0)
+#define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0)
+
+static void vrtc_callout_handler(void *arg);
+static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval);
+
+static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc");
+
+SYSCTL_DECL(_hw_vmm);
+SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW, NULL, NULL);
+
+static int rtc_flag_broken_time = 1;
+SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN,
+ &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected");
+
+static __inline bool
+divider_enabled(int reg_a)
+{
+ /*
+ * The RTC is counting only when dividers are not held in reset.
+ */
+ return ((reg_a & 0x70) == 0x20);
+}
+
+static __inline bool
+update_enabled(struct vrtc *vrtc)
+{
+ /*
+ * RTC date/time can be updated only if:
+ * - divider is not held in reset
+ * - guest has not disabled updates
+ * - the date/time fields have valid contents
+ */
+ if (!divider_enabled(vrtc->rtcdev.reg_a))
+ return (false);
+
+ if (rtc_halted(vrtc))
+ return (false);
+
+ if (vrtc->base_rtctime == VRTC_BROKEN_TIME)
+ return (false);
+
+ return (true);
+}
+
+static time_t
+vrtc_curtime(struct vrtc *vrtc)
+{
+ sbintime_t now, delta;
+ time_t t;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ t = vrtc->base_rtctime;
+ if (update_enabled(vrtc)) {
+ now = sbinuptime();
+ delta = now - vrtc->base_uptime;
+ KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: "
+ "%#lx to %#lx", vrtc->base_uptime, now));
+ t += delta / SBT_1S;
+ }
+ return (t);
+}
+
+static __inline uint8_t
+rtcset(struct rtcdev *rtc, int val)
+{
+
+ KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d",
+ __func__, val));
+
+ return ((rtc->reg_b & RTCSB_BIN) ? val : bin2bcd_data[val]);
+}
+
+static void
+secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update)
+{
+ struct clocktime ct;
+ struct timespec ts;
+ struct rtcdev *rtc;
+ int hour;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ if (rtctime < 0) {
+ KASSERT(rtctime == VRTC_BROKEN_TIME,
+ ("%s: invalid vrtc time %#lx", __func__, rtctime));
+ return;
+ }
+
+ /*
+ * If the RTC is halted then the guest has "ownership" of the
+ * date/time fields. Don't update the RTC date/time fields in
+ * this case (unless forced).
+ */
+ if (rtc_halted(vrtc) && !force_update)
+ return;
+
+ ts.tv_sec = rtctime;
+ ts.tv_nsec = 0;
+ clock_ts_to_ct(&ts, &ct);
+
+ KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d",
+ ct.sec));
+ KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d",
+ ct.min));
+ KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d",
+ ct.hour));
+ KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d",
+ ct.dow));
+ KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d",
+ ct.day));
+ KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d",
+ ct.mon));
+ KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d",
+ ct.year));
+
+ rtc = &vrtc->rtcdev;
+ rtc->sec = rtcset(rtc, ct.sec);
+ rtc->min = rtcset(rtc, ct.min);
+
+ hour = ct.hour;
+ if ((rtc->reg_b & RTCSB_24HR) == 0)
+ hour = (hour % 12) + 1; /* convert to a 12-hour format */
+
+ rtc->hour = rtcset(rtc, hour);
+
+ if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12)
+ rtc->hour |= 0x80; /* set MSB to indicate PM */
+
+ rtc->day_of_week = rtcset(rtc, ct.dow + 1);
+ rtc->day_of_month = rtcset(rtc, ct.day);
+ rtc->month = rtcset(rtc, ct.mon);
+ rtc->year = rtcset(rtc, ct.year % 100);
+}
+
+static int
+rtcget(struct rtcdev *rtc, int val, int *retval)
+{
+ uint8_t upper, lower;
+
+ if (rtc->reg_b & RTCSB_BIN) {
+ *retval = val;
+ return (0);
+ }
+
+ lower = val & 0xf;
+ upper = (val >> 4) & 0xf;
+
+ if (lower > 9 || upper > 9)
+ return (-1);
+
+ *retval = upper * 10 + lower;
+ return (0);
+}
+
+static time_t
+rtc_to_secs(struct vrtc *vrtc)
+{
+ struct clocktime ct;
+ struct timespec ts;
+ struct rtcdev *rtc;
+ struct vm *vm;
+ int error, hour, pm, year;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ vm = vrtc->vm;
+ rtc = &vrtc->rtcdev;
+
+ bzero(&ct, sizeof(struct clocktime));
+
+ error = rtcget(rtc, rtc->sec, &ct.sec);
+ if (error || ct.sec < 0 || ct.sec > 59) {
+ VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec);
+ goto fail;
+ }
+
+ error = rtcget(rtc, rtc->min, &ct.min);
+ if (error || ct.min < 0 || ct.min > 59) {
+ VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min);
+ goto fail;
+ }
+
+ pm = 0;
+ hour = rtc->hour;
+ if ((rtc->reg_b & RTCSB_24HR) == 0) {
+ if (hour & 0x80) {
+ hour &= ~0x80;
+ pm = 1;
+ }
+ }
+ error = rtcget(rtc, hour, &ct.hour);
+ if ((rtc->reg_b & RTCSB_24HR) == 0) {
+ ct.hour -= 1;
+ if (pm)
+ ct.hour += 12;
+ }
+
+ if (error || ct.hour < 0 || ct.hour > 23) {
+ VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour);
+ goto fail;
+ }
+
+ /*
+	 * Ignore 'rtc->day_of_week' since guests like Linux don't bother
+	 * setting it at all while others like OpenBSD/i386 set it incorrectly.
+	 *
+	 * clock_ct_to_ts() does not depend on 'ct.dow' anyway so ignore it.
+ */
+ ct.dow = -1;
+
+ error = rtcget(rtc, rtc->day_of_month, &ct.day);
+ if (error || ct.day < 1 || ct.day > 31) {
+ VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month,
+ ct.day);
+ goto fail;
+ }
+
+ error = rtcget(rtc, rtc->month, &ct.mon);
+ if (error || ct.mon < 1 || ct.mon > 12) {
+ VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon);
+ goto fail;
+ }
+
+ error = rtcget(rtc, rtc->year, &year);
+ if (error || year < 0 || year > 99) {
+ VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year);
+ goto fail;
+ }
+ if (year >= 70)
+ ct.year = 1900 + year;
+ else
+ ct.year = 2000 + year;
+
+ error = clock_ct_to_ts(&ct, &ts);
+ if (error || ts.tv_sec < 0) {
+ VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d",
+ ct.year, ct.mon, ct.day);
+ VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d",
+ ct.hour, ct.min, ct.sec);
+ goto fail;
+ }
+ return (ts.tv_sec); /* success */
+fail:
+ return (VRTC_BROKEN_TIME); /* failure */
+}
+
+static int
+vrtc_time_update(struct vrtc *vrtc, time_t newtime)
+{
+ struct rtcdev *rtc;
+ time_t oldtime;
+ uint8_t alarm_sec, alarm_min, alarm_hour;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ rtc = &vrtc->rtcdev;
+ alarm_sec = rtc->alarm_sec;
+ alarm_min = rtc->alarm_min;
+ alarm_hour = rtc->alarm_hour;
+
+ oldtime = vrtc->base_rtctime;
+ VM_CTR2(vrtc->vm, "Updating RTC time from %#lx to %#lx",
+ oldtime, newtime);
+
+ if (newtime == oldtime)
+ return (0);
+
+ /*
+ * If 'newtime' indicates that RTC updates are disabled then just
+ * record that and return. There is no need to do alarm interrupt
+ * processing or update 'base_uptime' in this case.
+ */
+ if (newtime == VRTC_BROKEN_TIME) {
+ vrtc->base_rtctime = VRTC_BROKEN_TIME;
+ return (0);
+ }
+
+ /*
+ * Return an error if RTC updates are halted by the guest.
+ */
+ if (rtc_halted(vrtc)) {
+ VM_CTR0(vrtc->vm, "RTC update halted by guest");
+ return (EBUSY);
+ }
+
+ do {
+ /*
+ * If the alarm interrupt is enabled and 'oldtime' is valid
+ * then visit all the seconds between 'oldtime' and 'newtime'
+ * to check for the alarm condition.
+ *
+ * Otherwise move the RTC time forward directly to 'newtime'.
+ */
+ if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME)
+ vrtc->base_rtctime++;
+ else
+ vrtc->base_rtctime = newtime;
+
+ if (aintr_enabled(vrtc)) {
+ /*
+ * Update the RTC date/time fields before checking
+ * if the alarm conditions are satisfied.
+ */
+ secs_to_rtc(vrtc->base_rtctime, vrtc, 0);
+
+ if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) &&
+ (alarm_min >= 0xC0 || alarm_min == rtc->min) &&
+ (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) {
+ vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM);
+ }
+ }
+ } while (vrtc->base_rtctime != newtime);
+
+ if (uintr_enabled(vrtc))
+ vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE);
+
+ vrtc->base_uptime = sbinuptime();
+
+ return (0);
+}
+
+static sbintime_t
+vrtc_freq(struct vrtc *vrtc)
+{
+ int ratesel;
+
+ static sbintime_t pf[16] = {
+ 0,
+ SBT_1S / 256,
+ SBT_1S / 128,
+ SBT_1S / 8192,
+ SBT_1S / 4096,
+ SBT_1S / 2048,
+ SBT_1S / 1024,
+ SBT_1S / 512,
+ SBT_1S / 256,
+ SBT_1S / 128,
+ SBT_1S / 64,
+ SBT_1S / 32,
+ SBT_1S / 16,
+ SBT_1S / 8,
+ SBT_1S / 4,
+ SBT_1S / 2,
+ };
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ /*
+ * If both periodic and alarm interrupts are enabled then use the
+ * periodic frequency to drive the callout. The minimum periodic
+ * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so
+ * piggyback the alarm on top of it. The same argument applies to
+ * the update interrupt.
+ */
+ if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) {
+ ratesel = vrtc->rtcdev.reg_a & 0xf;
+ return (pf[ratesel]);
+ } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) {
+ return (SBT_1S);
+ } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) {
+ return (SBT_1S);
+ } else {
+ return (0);
+ }
+}
+
+static void
+vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt)
+{
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ if (freqsbt == 0) {
+ if (callout_active(&vrtc->callout)) {
+ VM_CTR0(vrtc->vm, "RTC callout stopped");
+ callout_stop(&vrtc->callout);
+ }
+ return;
+ }
+ VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt);
+ callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler,
+ vrtc, 0);
+}
+
+static void
+vrtc_callout_handler(void *arg)
+{
+ struct vrtc *vrtc = arg;
+ sbintime_t freqsbt;
+ time_t rtctime;
+ int error;
+
+ VM_CTR0(vrtc->vm, "vrtc callout fired");
+
+ VRTC_LOCK(vrtc);
+ if (callout_pending(&vrtc->callout)) /* callout was reset */
+ goto done;
+
+ if (!callout_active(&vrtc->callout)) /* callout was stopped */
+ goto done;
+
+ callout_deactivate(&vrtc->callout);
+
+ KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0,
+ ("gratuitous vrtc callout"));
+
+ if (pintr_enabled(vrtc))
+ vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD);
+
+ if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) {
+ rtctime = vrtc_curtime(vrtc);
+ error = vrtc_time_update(vrtc, rtctime);
+ KASSERT(error == 0, ("%s: vrtc_time_update error %d",
+ __func__, error));
+ }
+
+ freqsbt = vrtc_freq(vrtc);
+ KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__));
+ vrtc_callout_reset(vrtc, freqsbt);
+done:
+ VRTC_UNLOCK(vrtc);
+}
+
+static __inline void
+vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq)
+{
+ int active;
+
+ active = callout_active(&vrtc->callout) ? 1 : 0;
+ KASSERT((freq == 0 && !active) || (freq != 0 && active),
+ ("vrtc callout %s with frequency %#lx",
+ active ? "active" : "inactive", freq));
+}
+
+static void
+vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval)
+{
+ struct rtcdev *rtc;
+ int oldirqf, newirqf;
+ uint8_t oldval, changed;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ rtc = &vrtc->rtcdev;
+ newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE;
+
+ oldirqf = rtc->reg_c & RTCIR_INT;
+ if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) ||
+ (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) ||
+ (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) {
+ newirqf = RTCIR_INT;
+ } else {
+ newirqf = 0;
+ }
+
+ oldval = rtc->reg_c;
+ rtc->reg_c = newirqf | newval;
+ changed = oldval ^ rtc->reg_c;
+ if (changed) {
+ VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x",
+ oldval, rtc->reg_c);
+ }
+
+ if (!oldirqf && newirqf) {
+ VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ);
+ vatpic_pulse_irq(vrtc->vm, RTC_IRQ);
+ vioapic_pulse_irq(vrtc->vm, RTC_IRQ);
+ } else if (oldirqf && !newirqf) {
+ VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ);
+ }
+}
+
+static int
+vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval)
+{
+ struct rtcdev *rtc;
+ sbintime_t oldfreq, newfreq;
+ time_t curtime, rtctime;
+ int error;
+ uint8_t oldval, changed;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ rtc = &vrtc->rtcdev;
+ oldval = rtc->reg_b;
+ oldfreq = vrtc_freq(vrtc);
+
+ rtc->reg_b = newval;
+ changed = oldval ^ newval;
+ if (changed) {
+ VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x",
+ oldval, newval);
+ }
+
+ if (changed & RTCSB_HALT) {
+ if ((newval & RTCSB_HALT) == 0) {
+ rtctime = rtc_to_secs(vrtc);
+ if (rtctime == VRTC_BROKEN_TIME) {
+ /*
+ * Stop updating the RTC if the date/time
+ * programmed by the guest is not correct.
+ */
+ VM_CTR0(vrtc->vm, "Invalid RTC date/time "
+ "programming detected");
+
+ if (rtc_flag_broken_time)
+ return (-1);
+ }
+ } else {
+ curtime = vrtc_curtime(vrtc);
+ KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch "
+ "between vrtc basetime (%#lx) and curtime (%#lx)",
+ __func__, vrtc->base_rtctime, curtime));
+
+ /*
+ * Force a refresh of the RTC date/time fields so
+ * they reflect the time right before the guest set
+ * the HALT bit.
+ */
+ secs_to_rtc(curtime, vrtc, 1);
+
+ /*
+ * Updates are halted so mark 'base_rtctime' to denote
+ * that the RTC date/time is in flux.
+ */
+ rtctime = VRTC_BROKEN_TIME;
+ rtc->reg_b &= ~RTCSB_UINTR;
+ }
+ error = vrtc_time_update(vrtc, rtctime);
+ KASSERT(error == 0, ("vrtc_time_update error %d", error));
+ }
+
+ /*
+ * Side effect of changes to the interrupt enable bits.
+ */
+ if (changed & RTCSB_ALL_INTRS)
+ vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c);
+
+ /*
+	 * Reset the callout if its frequency has changed.
+ */
+ newfreq = vrtc_freq(vrtc);
+ if (newfreq != oldfreq)
+ vrtc_callout_reset(vrtc, newfreq);
+ else
+ vrtc_callout_check(vrtc, newfreq);
+
+ /*
+ * The side effect of bits that control the RTC date/time format
+ * is handled lazily when those fields are actually read.
+ */
+ return (0);
+}
+
+static void
+vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval)
+{
+ sbintime_t oldfreq, newfreq;
+ uint8_t oldval, changed;
+
+ KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
+
+ newval &= ~RTCSA_TUP;
+ oldval = vrtc->rtcdev.reg_a;
+ oldfreq = vrtc_freq(vrtc);
+
+ if (divider_enabled(oldval) && !divider_enabled(newval)) {
+ VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx",
+ vrtc->base_rtctime, vrtc->base_uptime);
+ } else if (!divider_enabled(oldval) && divider_enabled(newval)) {
+ /*
+ * If the dividers are coming out of reset then update
+ * 'base_uptime' before this happens. This is done to
+ * maintain the illusion that the RTC date/time was frozen
+ * while the dividers were disabled.
+ */
+ vrtc->base_uptime = sbinuptime();
+ VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx",
+ vrtc->base_rtctime, vrtc->base_uptime);
+ } else {
+ /* NOTHING */
+ }
+
+ vrtc->rtcdev.reg_a = newval;
+ changed = oldval ^ newval;
+ if (changed) {
+ VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x",
+ oldval, newval);
+ }
+
+ /*
+ * Side effect of changes to rate select and divider enable bits.
+ */
+ newfreq = vrtc_freq(vrtc);
+ if (newfreq != oldfreq)
+ vrtc_callout_reset(vrtc, newfreq);
+ else
+ vrtc_callout_check(vrtc, newfreq);
+}
+
+int
+vrtc_set_time(struct vm *vm, time_t secs)
+{
+ struct vrtc *vrtc;
+ int error;
+
+ vrtc = vm_rtc(vm);
+ VRTC_LOCK(vrtc);
+ error = vrtc_time_update(vrtc, secs);
+ VRTC_UNLOCK(vrtc);
+
+ if (error) {
+ VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error,
+ secs);
+ } else {
+ VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs);
+ }
+
+ return (error);
+}
+
+time_t
+vrtc_get_time(struct vm *vm)
+{
+ struct vrtc *vrtc;
+ time_t t;
+
+ vrtc = vm_rtc(vm);
+ VRTC_LOCK(vrtc);
+ t = vrtc_curtime(vrtc);
+ VRTC_UNLOCK(vrtc);
+
+ return (t);
+}
+
+int
+vrtc_nvram_write(struct vm *vm, int offset, uint8_t value)
+{
+ struct vrtc *vrtc;
+ uint8_t *ptr;
+
+ vrtc = vm_rtc(vm);
+
+ /*
+ * Don't allow writes to RTC control registers or the date/time fields.
+ */
+ if (offset < offsetof(struct rtcdev, nvram[0]) ||
+ offset >= sizeof(struct rtcdev)) {
+ VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d",
+ offset);
+ return (EINVAL);
+ }
+
+ VRTC_LOCK(vrtc);
+ ptr = (uint8_t *)(&vrtc->rtcdev);
+ ptr[offset] = value;
+ VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset);
+ VRTC_UNLOCK(vrtc);
+
+ return (0);
+}
+
+int
+vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval)
+{
+ struct vrtc *vrtc;
+ time_t curtime;
+ uint8_t *ptr;
+
+ /*
+ * Allow all offsets in the RTC to be read.
+ */
+ if (offset < 0 || offset >= sizeof(struct rtcdev))
+ return (EINVAL);
+
+ vrtc = vm_rtc(vm);
+ VRTC_LOCK(vrtc);
+
+ /*
+ * Update RTC date/time fields if necessary.
+ */
+ if (offset < 10) {
+ curtime = vrtc_curtime(vrtc);
+ secs_to_rtc(curtime, vrtc, 0);
+ }
+
+ ptr = (uint8_t *)(&vrtc->rtcdev);
+ *retval = ptr[offset];
+
+ VRTC_UNLOCK(vrtc);
+ return (0);
+}
+
+int
+vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *val)
+{
+ struct vrtc *vrtc;
+
+ vrtc = vm_rtc(vm);
+
+ if (bytes != 1)
+ return (-1);
+
+ if (in) {
+ *val = 0xff;
+ return (0);
+ }
+
+ VRTC_LOCK(vrtc);
+ vrtc->addr = *val & 0x7f;
+ VRTC_UNLOCK(vrtc);
+
+ return (0);
+}
+
+int
+vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *val)
+{
+ struct vrtc *vrtc;
+ struct rtcdev *rtc;
+ time_t curtime;
+ int error, offset;
+
+ vrtc = vm_rtc(vm);
+ rtc = &vrtc->rtcdev;
+
+ if (bytes != 1)
+ return (-1);
+
+ VRTC_LOCK(vrtc);
+ offset = vrtc->addr;
+ if (offset >= sizeof(struct rtcdev)) {
+ VRTC_UNLOCK(vrtc);
+ return (-1);
+ }
+
+ error = 0;
+ curtime = vrtc_curtime(vrtc);
+ vrtc_time_update(vrtc, curtime);
+
+ if (in) {
+ /*
+ * Update RTC date/time fields if necessary.
+ */
+ if (offset < 10)
+ secs_to_rtc(curtime, vrtc, 0);
+
+ if (offset == 12) {
+ /*
+ * XXX
+ * reg_c interrupt flags are updated only if the
+ * corresponding interrupt enable bit in reg_b is set.
+ */
+ *val = vrtc->rtcdev.reg_c;
+ vrtc_set_reg_c(vrtc, 0);
+ } else {
+ *val = *((uint8_t *)rtc + offset);
+ }
+ VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x",
+ *val, offset);
+ } else {
+ switch (offset) {
+ case 10:
+ VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val);
+ vrtc_set_reg_a(vrtc, *val);
+ break;
+ case 11:
+ VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val);
+ error = vrtc_set_reg_b(vrtc, *val);
+ break;
+ case 12:
+ VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)",
+ *val);
+ break;
+ case 13:
+ VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)",
+ *val);
+ break;
+ case 0:
+ /*
+ * High order bit of 'seconds' is readonly.
+ */
+ *val &= 0x7f;
+ /* FALLTHRU */
+ default:
+ VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x",
+ offset, *val);
+ *((uint8_t *)rtc + offset) = *val;
+ break;
+ }
+ }
+ VRTC_UNLOCK(vrtc);
+ return (error);
+}
+
+void
+vrtc_reset(struct vrtc *vrtc)
+{
+ struct rtcdev *rtc;
+
+ VRTC_LOCK(vrtc);
+
+ rtc = &vrtc->rtcdev;
+ vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE));
+ vrtc_set_reg_c(vrtc, 0);
+ KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active"));
+
+ VRTC_UNLOCK(vrtc);
+}
+
+struct vrtc *
+vrtc_init(struct vm *vm)
+{
+ struct vrtc *vrtc;
+ struct rtcdev *rtc;
+ time_t curtime;
+
+ vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO);
+ vrtc->vm = vm;
+ mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF);
+ callout_init(&vrtc->callout, 1);
+
+ /* Allow dividers to keep time but disable everything else */
+ rtc = &vrtc->rtcdev;
+ rtc->reg_a = 0x20;
+ rtc->reg_b = RTCSB_24HR;
+ rtc->reg_c = 0;
+ rtc->reg_d = RTCSD_PWR;
+
+ /* Reset the index register to a safe value. */
+ vrtc->addr = RTC_STATUSD;
+
+ /*
+ * Initialize RTC time to 00:00:00 Jan 1, 1970.
+ */
+ curtime = 0;
+
+ VRTC_LOCK(vrtc);
+ vrtc->base_rtctime = VRTC_BROKEN_TIME;
+ vrtc_time_update(vrtc, curtime);
+ secs_to_rtc(curtime, vrtc, 0);
+ VRTC_UNLOCK(vrtc);
+
+ return (vrtc);
+}
+
+void
+vrtc_cleanup(struct vrtc *vrtc)
+{
+
+ callout_drain(&vrtc->callout);
+ free(vrtc, M_VRTC);
+}
diff --git a/sys/amd64/vmm/io/vrtc.h b/sys/amd64/vmm/io/vrtc.h
new file mode 100644
index 0000000..6fbbc9c
--- /dev/null
+++ b/sys/amd64/vmm/io/vrtc.h
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2014 Neel Natu (neel@freebsd.org)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VRTC_H_
+#define _VRTC_H_
+
+#include <isa/isareg.h>
+
+struct vrtc;
+
+struct vrtc *vrtc_init(struct vm *vm);
+void vrtc_cleanup(struct vrtc *vrtc);
+void vrtc_reset(struct vrtc *vrtc);
+
+time_t vrtc_get_time(struct vm *vm);
+int vrtc_set_time(struct vm *vm, time_t secs);
+int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value);
+int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval);
+
+int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *val);
+int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *val);
+
+#endif
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 4739a86..7f90c61 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -75,6 +75,7 @@ __FBSDID("$FreeBSD$");
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
+#include "vrtc.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"
@@ -100,12 +101,15 @@ struct vcpu {
uint64_t exitintinfo; /* (i) events pending at VM exit */
int nmi_pending; /* (i) NMI pending */
int extint_pending; /* (i) INTR pending */
- struct vm_exception exception; /* (x) exception collateral */
int exception_pending; /* (i) exception pending */
+ int exc_vector; /* (x) exception collateral */
+ int exc_errcode_valid;
+ uint32_t exc_errcode;
struct savefpu *guestfpu; /* (a,i) guest fpu state */
uint64_t guest_xcr0; /* (i) guest %xcr0 register */
void *stats; /* (a,i) statistics */
struct vm_exit exitinfo; /* (x) exit reason and collateral */
+ uint64_t nextrip; /* (x) next instruction to execute */
};
#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
@@ -136,6 +140,7 @@ struct vm {
struct vatpic *vatpic; /* (i) virtual atpic */
struct vatpit *vatpit; /* (i) virtual atpit */
struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
+ struct vrtc *vrtc; /* (o) virtual RTC */
volatile cpuset_t active_cpus; /* (i) active vcpus */
int suspend; /* (i) stop VM execution */
volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
@@ -376,6 +381,8 @@ vm_init(struct vm *vm, bool create)
vm->vatpic = vatpic_init(vm);
vm->vatpit = vatpit_init(vm);
vm->vpmtmr = vpmtmr_init(vm);
+ if (create)
+ vm->vrtc = vrtc_init(vm);
CPU_ZERO(&vm->active_cpus);
@@ -438,6 +445,10 @@ vm_cleanup(struct vm *vm, bool destroy)
if (vm->iommu != NULL)
iommu_destroy_domain(vm->iommu);
+ if (destroy)
+ vrtc_cleanup(vm->vrtc);
+ else
+ vrtc_reset(vm->vrtc);
vpmtmr_cleanup(vm->vpmtmr);
vatpit_cleanup(vm->vatpit);
vhpet_cleanup(vm->vhpet);
@@ -841,16 +852,26 @@ vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
}
int
-vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
+vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
+ struct vcpu *vcpu;
+ int error;
- if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
if (reg >= VM_REG_LAST)
return (EINVAL);
- return (VMSETREG(vm->cookie, vcpu, reg, val));
+ error = VMSETREG(vm->cookie, vcpuid, reg, val);
+ if (error || reg != VM_REG_GUEST_RIP)
+ return (error);
+
+ /* Set 'nextrip' to match the value of %rip */
+ VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
+ vcpu = &vm->vcpu[vcpuid];
+ vcpu->nextrip = val;
+ return (0);
}
static boolean_t
@@ -1102,7 +1123,7 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
struct vcpu *vcpu;
const char *wmesg;
- int error, t, vcpu_halted, vm_halted;
+ int t, vcpu_halted, vm_halted;
KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
@@ -1110,22 +1131,6 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
vcpu_halted = 0;
vm_halted = 0;
- /*
- * The typical way to halt a cpu is to execute: "sti; hlt"
- *
- * STI sets RFLAGS.IF to enable interrupts. However, the processor
- * remains in an "interrupt shadow" for an additional instruction
- * following the STI. This guarantees that "sti; hlt" sequence is
- * atomic and a pending interrupt will be recognized after the HLT.
- *
- * After the HLT emulation is done the vcpu is no longer in an
- * interrupt shadow and a pending interrupt can be injected on
- * the next entry into the guest.
- */
- error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
- KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
- __func__, error));
-
vcpu_lock(vcpu);
while (1) {
/*
@@ -1206,6 +1211,9 @@ vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
+ KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
+ __func__, vme->inst_length));
+
ftype = vme->u.paging.fault_type;
KASSERT(ftype == VM_PROT_READ ||
ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
@@ -1231,9 +1239,6 @@ vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
if (rv != KERN_SUCCESS)
return (EFAULT);
done:
- /* restart execution at the faulting instruction */
- vme->inst_length = 0;
-
return (0);
}
@@ -1288,10 +1293,13 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
return (EFAULT);
/*
- * If the instruction length is not specified the update it now.
+ * If the instruction length was not specified then update it now
+ * along with 'nextrip'.
*/
- if (vme->inst_length == 0)
+ if (vme->inst_length == 0) {
vme->inst_length = vie->num_processed;
+ vcpu->nextrip += vie->num_processed;
+ }
/* return to userland unless this is an in-kernel emulated device */
if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
@@ -1440,7 +1448,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
int error, vcpuid;
struct vcpu *vcpu;
struct pcb *pcb;
- uint64_t tscval, rip;
+ uint64_t tscval;
struct vm_exit *vme;
bool retu, intr_disabled;
pmap_t pmap;
@@ -1462,7 +1470,6 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
- rip = vmrun->rip;
restart:
critical_enter();
@@ -1477,7 +1484,7 @@ restart:
restore_guest_fpustate(vcpu);
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
- error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
+ error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, rptr, sptr);
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
@@ -1488,6 +1495,7 @@ restart:
if (error == 0) {
retu = false;
+ vcpu->nextrip = vme->rip + vme->inst_length;
switch (vme->exitcode) {
case VM_EXITCODE_SUSPENDED:
error = vm_handle_suspend(vm, vcpuid, &retu);
@@ -1524,10 +1532,8 @@ restart:
}
}
- if (error == 0 && retu == false) {
- rip = vme->rip + vme->inst_length;
+ if (error == 0 && retu == false)
goto restart;
- }
/* copy the exit information */
bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
@@ -1535,6 +1541,49 @@ restart:
}
int
+vm_restart_instruction(void *arg, int vcpuid)
+{
+ struct vm *vm;
+ struct vcpu *vcpu;
+ enum vcpu_state state;
+ uint64_t rip;
+ int error;
+
+ vm = arg;
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ state = vcpu_get_state(vm, vcpuid, NULL);
+ if (state == VCPU_RUNNING) {
+ /*
+ * When a vcpu is "running" the next instruction is determined
+ * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
+ * Thus setting 'inst_length' to zero will cause the current
+ * instruction to be restarted.
+ */
+ vcpu->exitinfo.inst_length = 0;
+ VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
+ "setting inst_length to zero", vcpu->exitinfo.rip);
+ } else if (state == VCPU_FROZEN) {
+ /*
+ * When a vcpu is "frozen" it is outside the critical section
+ * around VMRUN() and 'nextrip' points to the next instruction.
+ * Thus instruction restart is achieved by setting 'nextrip'
+ * to the vcpu's %rip.
+ */
+ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
+ KASSERT(!error, ("%s: error %d getting rip", __func__, error));
+ VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
+ "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
+ vcpu->nextrip = rip;
+ } else {
+ panic("%s: invalid state %d", __func__, state);
+ }
+ return (0);
+}
+
+int
vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
{
struct vcpu *vcpu;
@@ -1664,11 +1713,11 @@ vcpu_exception_intinfo(struct vcpu *vcpu)
uint64_t info = 0;
if (vcpu->exception_pending) {
- info = vcpu->exception.vector & 0xff;
+ info = vcpu->exc_vector & 0xff;
info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
- if (vcpu->exception.error_code_valid) {
+ if (vcpu->exc_errcode_valid) {
info |= VM_INTINFO_DEL_ERRCODE;
- info |= (uint64_t)vcpu->exception.error_code << 32;
+ info |= (uint64_t)vcpu->exc_errcode << 32;
}
}
return (info);
@@ -1693,7 +1742,7 @@ vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
info2 = vcpu_exception_intinfo(vcpu);
vcpu->exception_pending = 0;
VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
- vcpu->exception.vector, info2);
+ vcpu->exc_vector, info2);
}
if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
@@ -1731,14 +1780,16 @@ vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
}
int
-vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
+vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
+ uint32_t errcode, int restart_instruction)
{
struct vcpu *vcpu;
+ int error;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
- if (exception->vector < 0 || exception->vector >= 32)
+ if (vector < 0 || vector >= 32)
return (EINVAL);
/*
@@ -1746,21 +1797,35 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
* the guest. It is a derived exception that results from specific
* combinations of nested faults.
*/
- if (exception->vector == IDT_DF)
+ if (vector == IDT_DF)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
if (vcpu->exception_pending) {
VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
- "pending exception %d", exception->vector,
- vcpu->exception.vector);
+ "pending exception %d", vector, vcpu->exc_vector);
return (EBUSY);
}
+ /*
+ * From section 26.6.1 "Interruptibility State" in Intel SDM:
+ *
+ * Event blocking by "STI" or "MOV SS" is cleared after guest executes
+ * one instruction or incurs an exception.
+ */
+ error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
+ KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
+ __func__, error));
+
+ if (restart_instruction)
+ vm_restart_instruction(vm, vcpuid);
+
vcpu->exception_pending = 1;
- vcpu->exception = *exception;
- VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
+ vcpu->exc_vector = vector;
+ vcpu->exc_errcode = errcode;
+ vcpu->exc_errcode_valid = errcode_valid;
+ VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
return (0);
}
@@ -1768,28 +1833,15 @@ void
vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
int errcode)
{
- struct vm_exception exception;
- struct vm_exit *vmexit;
struct vm *vm;
- int error;
+ int error, restart_instruction;
vm = vmarg;
+ restart_instruction = 1;
- exception.vector = vector;
- exception.error_code = errcode;
- exception.error_code_valid = errcode_valid;
- error = vm_inject_exception(vm, vcpuid, &exception);
+ error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
+ errcode, restart_instruction);
KASSERT(error == 0, ("vm_inject_exception error %d", error));
-
- /*
- * A fault-like exception allows the instruction to be restarted
- * after the exception handler returns.
- *
- * By setting the inst_length to 0 we ensure that the instruction
- * pointer remains at the faulting instruction.
- */
- vmexit = vm_exitinfo(vm, vcpuid);
- vmexit->inst_length = 0;
}
void
@@ -2223,6 +2275,13 @@ vm_pmtmr(struct vm *vm)
return (vm->vpmtmr);
}
+struct vrtc *
+vm_rtc(struct vm *vm)
+{
+
+ return (vm->vrtc);
+}
+
enum vm_reg_name
vm_segment_name(int seg)
{
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index a85109e..0293d191 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
+#include "io/vrtc.h"
struct vmmdev_softc {
struct vm *vm; /* vm instance cookie */
@@ -174,6 +175,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_activate_cpu *vac;
struct vm_cpuset *vm_cpuset;
struct vm_intinfo *vmii;
+ struct vm_rtc_time *rtctime;
+ struct vm_rtc_data *rtcdata;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
@@ -202,6 +205,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_ACTIVATE_CPU:
case VM_SET_INTINFO:
case VM_GET_INTINFO:
+ case VM_RESTART_INSTRUCTION:
/*
* XXX fragile, handle with care
* Assumes that the first field of the ioctl data is the vcpu.
@@ -307,7 +311,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
break;
case VM_INJECT_EXCEPTION:
vmexc = (struct vm_exception *)data;
- error = vm_inject_exception(sc->vm, vmexc->cpuid, vmexc);
+ error = vm_inject_exception(sc->vm, vmexc->cpuid,
+ vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
+ vmexc->restart_instruction);
break;
case VM_INJECT_NMI:
vmnmi = (struct vm_nmi *)data;
@@ -482,6 +488,28 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
&vmii->info2);
break;
+ case VM_RTC_WRITE:
+ rtcdata = (struct vm_rtc_data *)data;
+ error = vrtc_nvram_write(sc->vm, rtcdata->offset,
+ rtcdata->value);
+ break;
+ case VM_RTC_READ:
+ rtcdata = (struct vm_rtc_data *)data;
+ error = vrtc_nvram_read(sc->vm, rtcdata->offset,
+ &rtcdata->value);
+ break;
+ case VM_RTC_SETTIME:
+ rtctime = (struct vm_rtc_time *)data;
+ error = vrtc_set_time(sc->vm, rtctime->secs);
+ break;
+ case VM_RTC_GETTIME:
+ error = 0;
+ rtctime = (struct vm_rtc_time *)data;
+ rtctime->secs = vrtc_get_time(sc->vm);
+ break;
+ case VM_RESTART_INSTRUCTION:
+ error = vm_restart_instruction(sc->vm, vcpu);
+ break;
default:
error = ENOTTY;
break;
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index d1d7173..3db890e 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -70,6 +70,7 @@ enum {
VIE_OP_TYPE_PUSH,
VIE_OP_TYPE_CMP,
VIE_OP_TYPE_POP,
+ VIE_OP_TYPE_MOVS,
VIE_OP_TYPE_LAST
};
@@ -78,6 +79,7 @@ enum {
#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */
#define VIE_OP_F_NO_MODRM (1 << 3)
+#define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
static const struct vie_op two_byte_opcodes[256] = {
[0xB6] = {
@@ -133,6 +135,16 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_type = VIE_OP_TYPE_MOV,
.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
},
+ [0xA4] = {
+ .op_byte = 0xA4,
+ .op_type = VIE_OP_TYPE_MOVS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+ },
+ [0xA5] = {
+ .op_byte = 0xA5,
+ .op_type = VIE_OP_TYPE_MOVS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+ },
[0xC6] = {
/* XXX Group 11 extended opcode - not just MOV */
.op_byte = 0xC6,
@@ -559,6 +571,217 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (error);
}
+/*
+ * Helper function to calculate and validate a linear address.
+ *
+ * Returns 0 on success and 1 if an exception was injected into the guest.
+ */
+static int
+get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
+ int opsize, int addrsize, int prot, enum vm_reg_name seg,
+ enum vm_reg_name gpr, uint64_t *gla)
+{
+ struct seg_desc desc;
+ uint64_t cr0, val, rflags;
+ int error;
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
+ KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+ error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
+ KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
+ __func__, error, seg));
+
+ error = vie_read_register(vm, vcpuid, gpr, &val);
+ KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
+ error, gpr));
+
+ if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
+ addrsize, prot, gla)) {
+ if (seg == VM_REG_GUEST_SS)
+ vm_inject_ss(vm, vcpuid, 0);
+ else
+ vm_inject_gp(vm, vcpuid);
+ return (1);
+ }
+
+ if (vie_canonical_check(paging->cpu_mode, *gla)) {
+ if (seg == VM_REG_GUEST_SS)
+ vm_inject_ss(vm, vcpuid, 0);
+ else
+ vm_inject_gp(vm, vcpuid);
+ return (1);
+ }
+
+ if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
+ vm_inject_ac(vm, vcpuid, 0);
+ return (1);
+ }
+
+ return (0);
+}
+
+static int
+emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *arg)
+{
+#ifdef _KERNEL
+ struct vm_copyinfo copyinfo[2];
+#else
+ struct iovec copyinfo[2];
+#endif
+ uint64_t dstaddr, srcaddr, val;
+ uint64_t rcx, rdi, rsi, rflags;
+ int error, opsize, seg, repeat;
+
+ opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
+ val = 0;
+ error = 0;
+
+ /*
+ * XXX although the MOVS instruction is only supposed to be used with
+	 * the "rep" prefix, some guests like FreeBSD will use "repnz" instead.
+ *
+ * Empirically the "repnz" prefix has identical behavior to "rep"
+ * and the zero flag does not make a difference.
+ */
+ repeat = vie->repz_present | vie->repnz_present;
+
+ if (repeat) {
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
+ KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
+
+ /*
+ * The count register is %rcx, %ecx or %cx depending on the
+ * address size of the instruction.
+ */
+ if ((rcx & vie_size2mask(vie->addrsize)) == 0)
+ return (0);
+ }
+
+ /*
+ * Source Destination Comments
+ * --------------------------------------------
+ * (1) memory memory n/a
+ * (2) memory mmio emulated
+ * (3) mmio memory emulated
+ * (4) mmio mmio not emulated
+ *
+ * At this point we don't have sufficient information to distinguish
+ * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
+ * out because it will succeed only when operating on regular memory.
+ *
+ * XXX the emulation doesn't properly handle the case where 'gpa'
+ * is straddling the boundary between the normal memory and MMIO.
+ */
+
+ seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
+ error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
+ PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr);
+ if (error)
+ goto done;
+
+ error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
+ copyinfo, nitems(copyinfo));
+ if (error == 0) {
+ /*
+ * case (2): read from system memory and write to mmio.
+ */
+ vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+ error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
+ goto done;
+ } else if (error > 0) {
+ /*
+ * Resume guest execution to handle fault.
+ */
+ goto done;
+ } else {
+ /*
+ * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
+ * if 'srcaddr' is in the mmio space.
+ */
+ }
+
+ error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
+ PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr);
+ if (error)
+ goto done;
+
+ error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
+ PROT_WRITE, copyinfo, nitems(copyinfo));
+ if (error == 0) {
+ /*
+ * case (3): read from MMIO and write to system memory.
+ *
+ * A MMIO read can have side-effects so we commit to it
+ * only after vm_copy_setup() is successful. If a page-fault
+ * needs to be injected into the guest then it will happen
+ * before the MMIO read is attempted.
+ */
+ error = memread(vm, vcpuid, gpa, &val, opsize, arg);
+ if (error)
+ goto done;
+
+ vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+ } else if (error > 0) {
+ /*
+ * Resume guest execution to handle fault.
+ */
+ goto done;
+ } else {
+ goto done;
+ }
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
+ KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
+ KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+ if (rflags & PSL_D) {
+ rsi -= opsize;
+ rdi -= opsize;
+ } else {
+ rsi += opsize;
+ rdi += opsize;
+ }
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
+ vie->addrsize);
+ KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
+ vie->addrsize);
+ KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
+
+ if (repeat) {
+ rcx = rcx - 1;
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
+ rcx, vie->addrsize);
+ KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
+
+ /*
+ * Repeat the instruction if the count register is not zero.
+ */
+ if ((rcx & vie_size2mask(vie->addrsize)) != 0)
+ vm_restart_instruction(vm, vcpuid);
+ }
+done:
+ if (error < 0)
+ return (EFAULT);
+ else
+ return (0);
+}
+
static int
emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
@@ -926,9 +1149,7 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
rsp += size;
}
-#ifdef _KERNEL
vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
-#endif
if (error == 0) {
error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
@@ -1012,6 +1233,10 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
error = emulate_movx(vm, vcpuid, gpa, vie,
memread, memwrite, memarg);
break;
+ case VIE_OP_TYPE_MOVS:
+ error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
+ memwrite, memarg);
+ break;
case VIE_OP_TYPE_AND:
error = emulate_and(vm, vcpuid, gpa, vie,
memread, memwrite, memarg);
@@ -1193,6 +1418,7 @@ vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
vie->base_register = VM_REG_LAST;
vie->index_register = VM_REG_LAST;
+ vie->segment_register = VM_REG_LAST;
if (inst_length) {
bcopy(inst_bytes, vie->inst, inst_length);
@@ -1458,6 +1684,35 @@ vie_advance(struct vie *vie)
vie->num_processed++;
}
+static bool
+segment_override(uint8_t x, int *seg)
+{
+
+ switch (x) {
+ case 0x2E:
+ *seg = VM_REG_GUEST_CS;
+ break;
+ case 0x36:
+ *seg = VM_REG_GUEST_SS;
+ break;
+ case 0x3E:
+ *seg = VM_REG_GUEST_DS;
+ break;
+ case 0x26:
+ *seg = VM_REG_GUEST_ES;
+ break;
+ case 0x64:
+ *seg = VM_REG_GUEST_FS;
+ break;
+ case 0x65:
+ *seg = VM_REG_GUEST_GS;
+ break;
+ default:
+ return (false);
+ }
+ return (true);
+}
+
static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
@@ -1471,6 +1726,12 @@ decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
vie->opsize_override = 1;
else if (x == 0x67)
vie->addrsize_override = 1;
+ else if (x == 0xF3)
+ vie->repz_present = 1;
+ else if (x == 0xF2)
+ vie->repnz_present = 1;
+ else if (segment_override(x, &vie->segment_register))
+ vie->segment_override = 1;
else
break;
@@ -1923,8 +2184,10 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
if (verify_inst_length(vie))
return (-1);
- if (verify_gla(vm, cpuid, gla, vie))
- return (-1);
+ if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
+ if (verify_gla(vm, cpuid, gla, vie))
+ return (-1);
+ }
vie->decoded = 1; /* success */
diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c
index e553599..fc68a61 100644
--- a/sys/amd64/vmm/vmm_ioport.c
+++ b/sys/amd64/vmm/vmm_ioport.c
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
#include "vatpic.h"
#include "vatpit.h"
#include "vpmtmr.h"
+#include "vrtc.h"
#include "vmm_ioport.h"
#include "vmm_ktr.h"
@@ -60,6 +61,8 @@ ioport_handler_func_t ioport_handler[MAX_IOPORTS] = {
[IO_ELCR1] = vatpic_elc_handler,
[IO_ELCR2] = vatpic_elc_handler,
[IO_PMTMR] = vpmtmr_handler,
+ [IO_RTC] = vrtc_addr_handler,
+ [IO_RTC + 1] = vrtc_data_handler,
};
#ifdef KTR
@@ -71,7 +74,7 @@ inout_instruction(struct vm_exit *vmexit)
static const char *iodesc[] = {
"outb", "outw", "outl",
"inb", "inw", "inl",
- "outsb", "outsw", "outsd"
+ "outsb", "outsw", "outsd",
"insb", "insw", "insd",
};
diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile
index 6aeaf80..6e1cf7f 100644
--- a/sys/modules/vmm/Makefile
+++ b/sys/modules/vmm/Makefile
@@ -35,7 +35,8 @@ SRCS+= iommu.c \
vhpet.c \
vioapic.c \
vlapic.c \
- vpmtmr.c
+ vpmtmr.c \
+ vrtc.c
# intel-specific files
.PATH: ${.CURDIR}/../../amd64/vmm/intel
diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8
index 8de0989..e15f9ac 100644
--- a/usr.sbin/bhyve/bhyve.8
+++ b/usr.sbin/bhyve/bhyve.8
@@ -32,7 +32,7 @@
.Nd "run a guest operating system inside a virtual machine"
.Sh SYNOPSIS
.Nm
-.Op Fl abehwxACHPWY
+.Op Fl abehuwxACHPWY
.Op Fl c Ar numcpus
.Op Fl g Ar gdbport
.Op Fl l Ar lpcdev Ns Op , Ns Ar conf
@@ -239,6 +239,8 @@ The host device must have been reserved at boot-time using the
loader variable as described in
.Xr vmm 4 .
.El
+.It Fl u
+RTC keeps UTC time.
.It Fl U Ar uuid
Set the universally unique identifier
.Pq UUID
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index 5971993..97ed046 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -122,7 +122,7 @@ usage(int code)
{
fprintf(stderr,
- "Usage: %s [-abehwxACHPWY] [-c vcpus] [-g <gdb port>] [-l <lpc>]\n"
+ "Usage: %s [-abehuwxACHPWY] [-c vcpus] [-g <gdb port>] [-l <lpc>]\n"
" %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n"
" -a: local apic is in xAPIC mode (deprecated)\n"
" -A: create ACPI tables\n"
@@ -137,6 +137,7 @@ usage(int code)
" -p: pin 'vcpu' to 'hostcpu'\n"
" -P: vmexit from the guest on pause\n"
" -s: <slot,driver,configinfo> PCI slot config\n"
+ " -u: RTC keeps UTC time\n"
" -U: uuid\n"
" -w: ignore unimplemented MSRs\n"
" -W: force virtio to use single-vector MSI\n"
@@ -185,20 +186,14 @@ vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
int errcode)
{
struct vmctx *ctx;
- int error;
+ int error, restart_instruction;
ctx = arg;
- if (errcode_valid)
- error = vm_inject_exception2(ctx, vcpu, vector, errcode);
- else
- error = vm_inject_exception(ctx, vcpu, vector);
- assert(error == 0);
+ restart_instruction = 1;
- /*
- * Set the instruction length to 0 to ensure that the instruction is
- * restarted when the fault handler returns.
- */
- vmexit[vcpu].inst_length = 0;
+ error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
+ restart_instruction);
+ assert(error == 0);
}
void *
@@ -329,12 +324,6 @@ vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
}
error = emulate_inout(ctx, vcpu, vme, strictio);
- if (!error && in && !string) {
- error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
- vme->u.inout.eax);
- assert(error == 0);
- }
-
if (error) {
fprintf(stderr, "Unhandled %s%c 0x%04x\n", in ? "in" : "out",
bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
@@ -358,7 +347,7 @@ vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
vme->u.msr.code, *pvcpu);
if (strictmsr) {
vm_inject_gp(ctx, *pvcpu);
- return (VMEXIT_RESTART);
+ return (VMEXIT_CONTINUE);
}
}
@@ -384,7 +373,7 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
vme->u.msr.code, vme->u.msr.wval, *pvcpu);
if (strictmsr) {
vm_inject_gp(ctx, *pvcpu);
- return (VMEXIT_RESTART);
+ return (VMEXIT_CONTINUE);
}
}
return (VMEXIT_CONTINUE);
@@ -462,9 +451,11 @@ static int
vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
+ assert(vmexit->inst_length == 0);
+
stats.vmexit_bogus++;
- return (VMEXIT_RESTART);
+ return (VMEXIT_CONTINUE);
}
static int
@@ -494,9 +485,11 @@ static int
vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
+ assert(vmexit->inst_length == 0);
+
stats.vmexit_mtrap++;
- return (VMEXIT_RESTART);
+ return (VMEXIT_CONTINUE);
}
static int
@@ -581,7 +574,7 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
};
static void
-vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
+vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
{
int error, rc, prevcpu;
enum vm_exitcode exitcode;
@@ -596,8 +589,11 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
error = vm_active_cpus(ctx, &active_cpus);
assert(CPU_ISSET(vcpu, &active_cpus));
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
+ assert(error == 0);
+
while (1) {
- error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
+ error = vm_run(ctx, vcpu, &vmexit[vcpu]);
if (error != 0)
break;
@@ -614,10 +610,6 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
switch (rc) {
case VMEXIT_CONTINUE:
- rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
- break;
- case VMEXIT_RESTART:
- rip = vmexit[vcpu].rip;
break;
case VMEXIT_ABORT:
abort();
@@ -694,6 +686,7 @@ main(int argc, char *argv[])
{
int c, error, gdb_port, err, bvmcons;
int dump_guest_memory, max_vcpus, mptgen;
+ int rtc_localtime;
struct vmctx *ctx;
uint64_t rip;
size_t memsize;
@@ -705,8 +698,9 @@ main(int argc, char *argv[])
guest_ncpus = 1;
memsize = 256 * MB;
mptgen = 1;
+ rtc_localtime = 1;
- while ((c = getopt(argc, argv, "abehwxACHIPWYp:g:c:s:m:l:U:")) != -1) {
+ while ((c = getopt(argc, argv, "abehuwxACHIPWYp:g:c:s:m:l:U:")) != -1) {
switch (c) {
case 'a':
x2apic_mode = 0;
@@ -766,6 +760,9 @@ main(int argc, char *argv[])
case 'e':
strictio = 1;
break;
+ case 'u':
+ rtc_localtime = 0;
+ break;
case 'U':
guest_uuid_str = optarg;
break;
@@ -829,7 +826,7 @@ main(int argc, char *argv[])
pci_irq_init(ctx);
ioapic_init(ctx);
- rtc_init(ctx);
+ rtc_init(ctx, rtc_localtime);
sci_init(ctx);
/*
diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h
index 87824ef..c51bf48 100644
--- a/usr.sbin/bhyve/bhyverun.h
+++ b/usr.sbin/bhyve/bhyverun.h
@@ -35,9 +35,8 @@
#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
#endif
-#define VMEXIT_CONTINUE 1 /* continue from next instruction */
-#define VMEXIT_RESTART 2 /* restart current instruction */
-#define VMEXIT_ABORT 3 /* abort the vm run loop */
+#define VMEXIT_CONTINUE (0)
+#define VMEXIT_ABORT (-1)
struct vmctx;
extern int guest_ncpus;
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
index 1041a59..402b953 100644
--- a/usr.sbin/bhyve/inout.c
+++ b/usr.sbin/bhyve/inout.c
@@ -104,7 +104,7 @@ int
emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
{
int addrsize, bytes, flags, in, port, prot, rep;
- uint32_t val;
+ uint32_t eax, val;
inout_func_t handler;
void *arg;
int error, retval;
@@ -214,16 +214,20 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
}
/* Restart the instruction if more iterations remain */
- if (retval == 0 && count != 0)
- vmexit->inst_length = 0;
- } else {
- if (!in) {
- val = vmexit->u.inout.eax & vie_size2mask(bytes);
+ if (retval == 0 && count != 0) {
+ error = vm_restart_instruction(ctx, vcpu);
+ assert(error == 0);
}
+ } else {
+ eax = vmexit->u.inout.eax;
+ val = eax & vie_size2mask(bytes);
retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
if (retval == 0 && in) {
- vmexit->u.inout.eax &= ~vie_size2mask(bytes);
- vmexit->u.inout.eax |= val & vie_size2mask(bytes);
+ eax &= ~vie_size2mask(bytes);
+ eax |= val & vie_size2mask(bytes);
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
+ eax);
+ assert(error == 0);
}
}
return (retval);
diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c
index 9dbabcd..31e02f8 100644
--- a/usr.sbin/bhyve/pci_ahci.c
+++ b/usr.sbin/bhyve/pci_ahci.c
@@ -2299,7 +2299,8 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
open_fail:
if (ret) {
- blockif_close(sc->port[0].bctx);
+ if (sc->port[0].bctx != NULL)
+ blockif_close(sc->port[0].bctx);
free(sc);
}
diff --git a/usr.sbin/bhyve/rtc.c b/usr.sbin/bhyve/rtc.c
index 459c900..5c70154 100644
--- a/usr.sbin/bhyve/rtc.c
+++ b/usr.sbin/bhyve/rtc.c
@@ -30,10 +30,7 @@
__FBSDID("$FreeBSD$");
#include <sys/types.h>
-#include <sys/time.h>
-#include <stdio.h>
-#include <string.h>
#include <time.h>
#include <assert.h>
@@ -41,47 +38,11 @@ __FBSDID("$FreeBSD$");
#include <vmmapi.h>
#include "acpi.h"
-#include "inout.h"
#include "pci_lpc.h"
#include "rtc.h"
-#define IO_RTC 0x70
+#define IO_RTC 0x70
-#define RTC_SEC 0x00 /* seconds */
-#define RTC_SEC_ALARM 0x01
-#define RTC_MIN 0x02
-#define RTC_MIN_ALARM 0x03
-#define RTC_HRS 0x04
-#define RTC_HRS_ALARM 0x05
-#define RTC_WDAY 0x06
-#define RTC_DAY 0x07
-#define RTC_MONTH 0x08
-#define RTC_YEAR 0x09
-#define RTC_CENTURY 0x32 /* current century */
-
-#define RTC_STATUSA 0xA
-#define RTCSA_TUP 0x80 /* time update, don't look now */
-
-#define RTC_STATUSB 0xB
-#define RTCSB_DST 0x01
-#define RTCSB_24HR 0x02
-#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */
-#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */
-#define RTCSB_HALT 0x80 /* stop clock updates */
-
-#define RTC_INTR 0x0c /* status register C (R) interrupt source */
-
-#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */
-#define RTCSD_PWR 0x80 /* clock power OK */
-
-#define RTC_NVRAM_START 0x0e
-#define RTC_NVRAM_END 0x7f
-#define RTC_NVRAM_SZ (128 - RTC_NVRAM_START)
-#define nvoff(x) ((x) - RTC_NVRAM_START)
-
-#define RTC_DIAG 0x0e
-#define RTC_RSTCODE 0x0f
-#define RTC_EQUIPMENT 0x14
#define RTC_LMEM_LSB 0x34
#define RTC_LMEM_MSB 0x35
#define RTC_HMEM_LSB 0x5b
@@ -92,249 +53,30 @@ __FBSDID("$FreeBSD$");
#define m_16MB (16*1024*1024)
#define m_4GB (4ULL*1024*1024*1024)
-static int addr;
-
-static uint8_t rtc_nvram[RTC_NVRAM_SZ];
-
-/* XXX initialize these to default values as they would be from BIOS */
-static uint8_t status_a, status_b;
-
-static struct {
- uint8_t hours;
- uint8_t mins;
- uint8_t secs;
-} rtc_alarm;
-
-static u_char const bin2bcd_data[] = {
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
- 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
- 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
- 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
- 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
- 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
- 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
- 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
- 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
- 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99
-};
-#define bin2bcd(bin) (bin2bcd_data[bin])
-
-#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val)))
-
-static void
-timevalfix(struct timeval *t1)
-{
-
- if (t1->tv_usec < 0) {
- t1->tv_sec--;
- t1->tv_usec += 1000000;
- }
- if (t1->tv_usec >= 1000000) {
- t1->tv_sec++;
- t1->tv_usec -= 1000000;
- }
-}
-
-static void
-timevalsub(struct timeval *t1, const struct timeval *t2)
-{
-
- t1->tv_sec -= t2->tv_sec;
- t1->tv_usec -= t2->tv_usec;
- timevalfix(t1);
-}
-
-static int
-rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
- uint32_t *eax, void *arg)
-{
- if (bytes != 1)
- return (-1);
-
- if (in) {
- /* straight read of this register will return 0xFF */
- *eax = 0xff;
- return (0);
- }
-
- switch (*eax & 0x7f) {
- case RTC_SEC:
- case RTC_SEC_ALARM:
- case RTC_MIN:
- case RTC_MIN_ALARM:
- case RTC_HRS:
- case RTC_HRS_ALARM:
- case RTC_WDAY:
- case RTC_DAY:
- case RTC_MONTH:
- case RTC_YEAR:
- case RTC_STATUSA:
- case RTC_STATUSB:
- case RTC_INTR:
- case RTC_STATUSD:
- case RTC_NVRAM_START ... RTC_NVRAM_END:
- break;
- default:
- return (-1);
- }
-
- addr = *eax & 0x7f;
- return (0);
-}
-
-static int
-rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
- uint32_t *eax, void *arg)
+/*
+ * Returns the current RTC time as number of seconds since 00:00:00 Jan 1, 1970
+ */
+static time_t
+rtc_time(struct vmctx *ctx, int use_localtime)
{
- int hour;
+ struct tm tm;
time_t t;
- struct timeval cur, delta;
-
- static struct timeval last;
- static struct tm tm;
-
- if (bytes != 1)
- return (-1);
-
- gettimeofday(&cur, NULL);
- /*
- * Increment the cached time only once per second so we can guarantee
- * that the guest has at least one second to read the hour:min:sec
- * separately and still get a coherent view of the time.
- */
- delta = cur;
- timevalsub(&delta, &last);
- if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) {
- t = cur.tv_sec;
+ time(&t);
+ if (use_localtime) {
localtime_r(&t, &tm);
- last = cur;
+ t = timegm(&tm);
}
-
- if (in) {
- switch (addr) {
- case RTC_SEC_ALARM:
- *eax = rtc_alarm.secs;
- break;
- case RTC_MIN_ALARM:
- *eax = rtc_alarm.mins;
- break;
- case RTC_HRS_ALARM:
- *eax = rtc_alarm.hours;
- break;
- case RTC_SEC:
- *eax = rtcout(tm.tm_sec);
- return (0);
- case RTC_MIN:
- *eax = rtcout(tm.tm_min);
- return (0);
- case RTC_HRS:
- if (status_b & RTCSB_24HR)
- hour = tm.tm_hour;
- else
- hour = (tm.tm_hour % 12) + 1;
-
- *eax = rtcout(hour);
-
- /*
- * If we are representing time in the 12-hour format
- * then set the MSB to indicate PM.
- */
- if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12)
- *eax |= 0x80;
-
- return (0);
- case RTC_WDAY:
- *eax = rtcout(tm.tm_wday + 1);
- return (0);
- case RTC_DAY:
- *eax = rtcout(tm.tm_mday);
- return (0);
- case RTC_MONTH:
- *eax = rtcout(tm.tm_mon + 1);
- return (0);
- case RTC_YEAR:
- *eax = rtcout(tm.tm_year % 100);
- return (0);
- case RTC_STATUSA:
- *eax = status_a;
- return (0);
- case RTC_STATUSB:
- *eax = status_b;
- return (0);
- case RTC_INTR:
- *eax = 0;
- return (0);
- case RTC_STATUSD:
- *eax = RTCSD_PWR;
- return (0);
- case RTC_NVRAM_START ... RTC_NVRAM_END:
- *eax = rtc_nvram[addr - RTC_NVRAM_START];
- return (0);
- default:
- return (-1);
- }
- }
-
- switch (addr) {
- case RTC_STATUSA:
- status_a = *eax & ~RTCSA_TUP;
- break;
- case RTC_STATUSB:
- /* XXX not implemented yet XXX */
- if (*eax & RTCSB_PINTR)
- return (-1);
- status_b = *eax;
- break;
- case RTC_STATUSD:
- /* ignore write */
- break;
- case RTC_SEC_ALARM:
- rtc_alarm.secs = *eax;
- break;
- case RTC_MIN_ALARM:
- rtc_alarm.mins = *eax;
- break;
- case RTC_HRS_ALARM:
- rtc_alarm.hours = *eax;
- break;
- case RTC_SEC:
- case RTC_MIN:
- case RTC_HRS:
- case RTC_WDAY:
- case RTC_DAY:
- case RTC_MONTH:
- case RTC_YEAR:
- /*
- * Ignore writes to the time of day registers
- */
- break;
- case RTC_NVRAM_START ... RTC_NVRAM_END:
- rtc_nvram[addr - RTC_NVRAM_START] = *eax;
- break;
- default:
- return (-1);
- }
- return (0);
+ return (t);
}
void
-rtc_init(struct vmctx *ctx)
+rtc_init(struct vmctx *ctx, int use_localtime)
{
- struct timeval cur;
- struct tm tm;
size_t himem;
size_t lomem;
int err;
- err = gettimeofday(&cur, NULL);
- assert(err == 0);
- (void) localtime_r(&cur.tv_sec, &tm);
-
- memset(rtc_nvram, 0, sizeof(rtc_nvram));
-
- rtc_nvram[nvoff(RTC_CENTURY)] = bin2bcd((tm.tm_year + 1900) / 100);
-
/* XXX init diag/reset code/equipment/checksum ? */
/*
@@ -344,17 +86,22 @@ rtc_init(struct vmctx *ctx)
* 0x5b/0x5c/0x5d - 64KB chunks above 4GB
*/
lomem = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB;
- rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem;
- rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8;
+ err = vm_rtc_write(ctx, RTC_LMEM_LSB, lomem);
+ assert(err == 0);
+ err = vm_rtc_write(ctx, RTC_LMEM_MSB, lomem >> 8);
+ assert(err == 0);
himem = vm_get_highmem_size(ctx) / m_64KB;
- rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem;
- rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8;
- rtc_nvram[nvoff(RTC_HMEM_MSB)] = himem >> 16;
-}
+ err = vm_rtc_write(ctx, RTC_HMEM_LSB, himem);
+ assert(err == 0);
+ err = vm_rtc_write(ctx, RTC_HMEM_SB, himem >> 8);
+ assert(err == 0);
+ err = vm_rtc_write(ctx, RTC_HMEM_MSB, himem >> 16);
+ assert(err == 0);
-INOUT_PORT(rtc, IO_RTC, IOPORT_F_INOUT, rtc_addr_handler);
-INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler);
+ err = vm_rtc_settime(ctx, rtc_time(ctx, use_localtime));
+ assert(err == 0);
+}
static void
rtc_dsdt(void)
diff --git a/usr.sbin/bhyve/rtc.h b/usr.sbin/bhyve/rtc.h
index 72cffb3..5b08ca3 100644
--- a/usr.sbin/bhyve/rtc.h
+++ b/usr.sbin/bhyve/rtc.h
@@ -29,6 +29,6 @@
#ifndef _RTC_H_
#define _RTC_H_
-void rtc_init(struct vmctx *ctx);
+void rtc_init(struct vmctx *ctx, int use_localtime);
#endif /* _RTC_H_ */
diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c
index b939c1a..ba6a9d2 100644
--- a/usr.sbin/bhyve/task_switch.c
+++ b/usr.sbin/bhyve/task_switch.c
@@ -725,21 +725,11 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
assert(paging->cpu_mode == CPU_MODE_PROTECTED);
/*
- * Calculate the %eip to store in the old TSS before modifying the
- * 'inst_length'.
+ * Calculate the instruction pointer to store in the old TSS.
*/
eip = vmexit->rip + vmexit->inst_length;
/*
- * Set the 'inst_length' to '0'.
- *
- * If an exception is triggered during emulation of the task switch
- * then the exception handler should return to the instruction that
- * caused the task switch as opposed to the subsequent instruction.
- */
- vmexit->inst_length = 0;
-
- /*
* Section 4.6, "Access Rights" in Intel SDM Vol 3.
* The following page table accesses are implicitly supervisor mode:
* - accesses to GDT or LDT to load segment descriptors
@@ -883,8 +873,8 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
* after this point will be handled in the context of the new task and
* the saved instruction pointer will belong to the new task.
*/
- vmexit->rip = newtss.tss_eip;
- assert(vmexit->inst_length == 0);
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
+ assert(error == 0);
/* Load processor state from new TSS */
error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov);
diff --git a/usr.sbin/bhyve/xmsr.c b/usr.sbin/bhyve/xmsr.c
index d50a939..5b7bfbb 100644
--- a/usr.sbin/bhyve/xmsr.c
+++ b/usr.sbin/bhyve/xmsr.c
@@ -185,6 +185,15 @@ emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val)
*val = 0;
break;
+ /*
+ * OpenBSD guests test bit 0 of this MSR to detect if the
+ * workaround for erratum 721 is already applied.
+ * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf
+ */
+ case 0xC0011029:
+ *val = 1;
+ break;
+
default:
error = -1;
break;
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
index 0c4457e..e2b514d 100644
--- a/usr.sbin/bhyvectl/bhyvectl.c
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <fcntl.h>
#include <string.h>
#include <getopt.h>
+#include <time.h>
#include <assert.h>
#include <machine/cpufunc.h>
@@ -157,6 +158,11 @@ usage(bool cpu_intel)
" [--inject-nmi]\n"
" [--force-reset]\n"
" [--force-poweroff]\n"
+ " [--get-rtc-time]\n"
+ " [--set-rtc-time=<secs>]\n"
+ " [--get-rtc-nvram]\n"
+ " [--set-rtc-nvram=<val>]\n"
+ " [--rtc-nvram-offset=<offset>]\n"
" [--get-active-cpus]\n"
" [--get-suspended-cpus]\n"
" [--get-intinfo]\n"
@@ -220,6 +226,12 @@ usage(bool cpu_intel)
exit(1);
}
+static int get_rtc_time, set_rtc_time;
+static int get_rtc_nvram, set_rtc_nvram;
+static int rtc_nvram_offset;
+static uint8_t rtc_nvram_value;
+static time_t rtc_secs;
+
static int get_stats, getcap, setcap, capval, get_gpa_pmap;
static int inject_nmi, assert_lapic_lvt;
static int force_reset, force_poweroff;
@@ -545,6 +557,9 @@ enum {
UNASSIGN_PPTDEV,
GET_GPA_PMAP,
ASSERT_LAPIC_LVT,
+ SET_RTC_TIME,
+ SET_RTC_NVRAM,
+ RTC_NVRAM_OFFSET,
};
static void
@@ -1269,6 +1284,11 @@ setup_options(bool cpu_intel)
{ "setcap", REQ_ARG, 0, SET_CAP },
{ "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP },
{ "assert-lapic-lvt", REQ_ARG, 0, ASSERT_LAPIC_LVT },
+ { "get-rtc-time", NO_ARG, &get_rtc_time, 1 },
+ { "set-rtc-time", REQ_ARG, 0, SET_RTC_TIME },
+ { "rtc-nvram-offset", REQ_ARG, 0, RTC_NVRAM_OFFSET },
+ { "get-rtc-nvram", NO_ARG, &get_rtc_nvram, 1 },
+ { "set-rtc-nvram", REQ_ARG, 0, SET_RTC_NVRAM },
{ "getcap", NO_ARG, &getcap, 1 },
{ "get-stats", NO_ARG, &get_stats, 1 },
{ "get-desc-ds",NO_ARG, &get_desc_ds, 1 },
@@ -1462,6 +1482,33 @@ setup_options(bool cpu_intel)
return (all_opts);
}
+static const char *
+wday_str(int idx)
+{
+ static const char *weekdays[] = {
+ "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
+ };
+
+ if (idx >= 0 && idx < 7)
+ return (weekdays[idx]);
+ else
+ return ("UNK");
+}
+
+static const char *
+mon_str(int idx)
+{
+ static const char *months[] = {
+ "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
+ };
+
+ if (idx >= 0 && idx < 12)
+ return (months[idx]);
+ else
+ return ("UNK");
+}
+
int
main(int argc, char *argv[])
{
@@ -1477,6 +1524,7 @@ main(int argc, char *argv[])
cpuset_t cpus;
bool cpu_intel;
uint64_t cs, ds, es, fs, gs, ss, tr, ldtr;
+ struct tm tm;
struct option *opts;
cpu_intel = cpu_vendor_intel();
@@ -1594,6 +1642,17 @@ main(int argc, char *argv[])
capval = strtoul(optarg, NULL, 0);
setcap = 1;
break;
+ case SET_RTC_TIME:
+ rtc_secs = strtoul(optarg, NULL, 0);
+ set_rtc_time = 1;
+ break;
+ case SET_RTC_NVRAM:
+ rtc_nvram_value = (uint8_t)strtoul(optarg, NULL, 0);
+ set_rtc_nvram = 1;
+ break;
+ case RTC_NVRAM_OFFSET:
+ rtc_nvram_offset = strtoul(optarg, NULL, 0);
+ break;
case GET_GPA_PMAP:
gpa_pmap = strtoul(optarg, NULL, 0);
get_gpa_pmap = 1;
@@ -1971,6 +2030,31 @@ main(int argc, char *argv[])
}
}
+ if (!error && set_rtc_nvram)
+ error = vm_rtc_write(ctx, rtc_nvram_offset, rtc_nvram_value);
+
+ if (!error && (get_rtc_nvram || get_all)) {
+ error = vm_rtc_read(ctx, rtc_nvram_offset, &rtc_nvram_value);
+ if (error == 0) {
+ printf("rtc nvram[%03d]: 0x%02x\n", rtc_nvram_offset,
+ rtc_nvram_value);
+ }
+ }
+
+ if (!error && set_rtc_time)
+ error = vm_rtc_settime(ctx, rtc_secs);
+
+ if (!error && (get_rtc_time || get_all)) {
+ error = vm_rtc_gettime(ctx, &rtc_secs);
+ if (error == 0) {
+ gmtime_r(&rtc_secs, &tm);
+ printf("rtc time %#lx: %s %s %02d %02d:%02d:%02d %d\n",
+ rtc_secs, wday_str(tm.tm_wday), mon_str(tm.tm_mon),
+ tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec,
+ 1900 + tm.tm_year);
+ }
+ }
+
if (!error && (getcap || get_all)) {
int captype, val, getcaptype;
@@ -2034,10 +2118,7 @@ main(int argc, char *argv[])
}
if (!error && run) {
- error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
- assert(error == 0);
-
- error = vm_run(ctx, vcpu, rip, &vmexit);
+ error = vm_run(ctx, vcpu, &vmexit);
if (error == 0)
dump_vm_run_exitcode(&vmexit, vcpu);
else
OpenPOWER on IntegriCloud