author     grehan <grehan@FreeBSD.org>    2014-08-19 01:20:24 +0000
committer  grehan <grehan@FreeBSD.org>    2014-08-19 01:20:24 +0000
commit     5d455a50f5226eff9d453943eb0b714689989802 (patch)
tree       6aaf5296bf1aa8632ab9beb5fe4a61727ef7ab68
parent     4791bac9b41a259cdb9821e381df8837926220e2 (diff)
download   FreeBSD-src-5d455a50f5226eff9d453943eb0b714689989802.zip
           FreeBSD-src-5d455a50f5226eff9d453943eb0b714689989802.tar.gz
MFC r267921, r267934, r267949, r267959, r267966, r268202, r268276,
    r268427, r268428, r268521, r268638, r268639, r268701, r268777,
    r268889, r268922, r269008, r269042, r269043, r269080, r269094,
    r269108, r269109, r269281, r269317, r269700, r269896, r269962,
    r269989.

Catch bhyve up to CURRENT.

Lightly tested with FreeBSD i386/amd64, Linux i386/amd64, and OpenBSD/amd64.
Still resolving an issue with OpenBSD/i386.

Many thanks to jhb@ for all the hard work on the prior MFCs !

r267921 - support the "mov r/m8, imm8" instruction
r267934 - document options
r267949 - set DMI vers/date to fixed values
r267959 - doc: sort cmd flags
r267966 - EPT misconf post-mortem info
r268202 - use correct flag for event index
r268276 - 64-bit virtio capability api
r268427 - invalidate guest TLB when cr3 is updated, needed for TSS
r268428 - identify vcpu's operating mode
r268521 - use correct offset in guest logical-to-linear translation
r268638 - chs value
r268639 - chs fake values
r268701 - instr emul operand/address size override prefix support
r268777 - emulation for legacy x86 task switching
r268889 - nested exception support
r268922 - fix INVARIANTS build
r269008 - emulate instructions found in the OpenBSD/i386 5.5 kernel
r269042 - fix fault injection
r269043 - Reduce VMEXIT_RESTARTs in task_switch.c
r269080 - fix issues in PUSH emulation
r269094 - simplify return values from the inout handlers
r269108 - don't return -1 from the push emulation handler
r269109 - avoid permanent sleep in vm_handle_hlt()
r269281 - list VT-x features in base kernel dmesg
r269317 - Mark AHCI fatal errors as not completed
r269700 - Support PCI extended config space in bhyve
r269896 - Minor cleanup
r269962 - use max guest memory when creating IOMMU domain
r269989 - fix interrupt mode names
-rw-r--r--  lib/libvmmapi/vmmapi.c                      42
-rw-r--r--  lib/libvmmapi/vmmapi.h                       7
-rw-r--r--  sys/amd64/amd64/identcpu.c                 199
-rw-r--r--  sys/amd64/include/vmm.h                    153
-rw-r--r--  sys/amd64/include/vmm_dev.h                 12
-rw-r--r--  sys/amd64/include/vmm_instruction_emul.h     6
-rw-r--r--  sys/amd64/vmm/intel/vmcs.c                   8
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h                   3
-rw-r--r--  sys/amd64/vmm/intel/vmx.c                  350
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.c                1
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.h               23
-rw-r--r--  sys/amd64/vmm/intel/vtd.c                    5
-rw-r--r--  sys/amd64/vmm/io/vatpic.c                   17
-rw-r--r--  sys/amd64/vmm/vmm.c                        402
-rw-r--r--  sys/amd64/vmm/vmm_dev.c                     12
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c       672
-rw-r--r--  sys/x86/include/specialreg.h                19
-rw-r--r--  usr.sbin/bhyve/Makefile                      1
-rw-r--r--  usr.sbin/bhyve/acpi.c                       48
-rw-r--r--  usr.sbin/bhyve/atkbdc.c                     17
-rw-r--r--  usr.sbin/bhyve/bhyve.8                      80
-rw-r--r--  usr.sbin/bhyve/bhyverun.c                  127
-rw-r--r--  usr.sbin/bhyve/bhyverun.h                    4
-rw-r--r--  usr.sbin/bhyve/block_if.c                   49
-rw-r--r--  usr.sbin/bhyve/block_if.h                    2
-rw-r--r--  usr.sbin/bhyve/inout.c                      29
-rw-r--r--  usr.sbin/bhyve/inout.h                      10
-rw-r--r--  usr.sbin/bhyve/mem.c                        31
-rw-r--r--  usr.sbin/bhyve/mem.h                         4
-rw-r--r--  usr.sbin/bhyve/pci_ahci.c                   20
-rw-r--r--  usr.sbin/bhyve/pci_emul.c                  218
-rw-r--r--  usr.sbin/bhyve/pci_emul.h                    1
-rw-r--r--  usr.sbin/bhyve/pci_irq.c                    15
-rw-r--r--  usr.sbin/bhyve/pm.c                         16
-rw-r--r--  usr.sbin/bhyve/smbiostbl.c                   4
-rw-r--r--  usr.sbin/bhyve/task_switch.c               932
-rw-r--r--  usr.sbin/bhyve/virtio.c                      2
-rw-r--r--  usr.sbin/bhyve/virtio.h                      2
-rw-r--r--  usr.sbin/bhyvectl/bhyvectl.c                46
-rw-r--r--  usr.sbin/bhyveload/bhyveload.8              58
-rw-r--r--  usr.sbin/bhyveload/bhyveload.c               4
41 files changed, 3057 insertions, 594 deletions
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 9fb2308..93955c7 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
#include <sys/_iovec.h>
#include <sys/cpuset.h>
+#include <x86/segments.h>
#include <machine/specialreg.h>
#include <machine/param.h>
@@ -327,6 +328,16 @@ vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
}
int
+vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc)
+{
+ int error;
+
+ error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit,
+ &seg_desc->access);
+ return (error);
+}
+
+int
vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
int error;
@@ -988,7 +999,7 @@ gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
#endif
int
-vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt)
{
uint64_t gpa;
@@ -1106,3 +1117,32 @@ vm_activate_cpu(struct vmctx *ctx, int vcpu)
error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac);
return (error);
}
+
+int
+vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2)
+{
+ struct vm_intinfo vmii;
+ int error;
+
+ bzero(&vmii, sizeof(struct vm_intinfo));
+ vmii.vcpuid = vcpu;
+ error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii);
+ if (error == 0) {
+ *info1 = vmii.info1;
+ *info2 = vmii.info2;
+ }
+ return (error);
+}
+
+int
+vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1)
+{
+ struct vm_intinfo vmii;
+ int error;
+
+ bzero(&vmii, sizeof(struct vm_intinfo));
+ vmii.vcpuid = vcpu;
+ vmii.info1 = info1;
+ error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii);
+ return (error);
+}
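
[Editor's note: the two wrappers added above are thin shims over the new VM_GET_INTINFO/VM_SET_INTINFO ioctls. A minimal userland sketch, not part of this commit, of how a libvmmapi consumer might inspect and clear vcpu 0's pending event state; the program name, VM-name handling and error reporting are illustrative.]

#include <sys/types.h>

#include <machine/vmm.h>

#include <err.h>
#include <stdio.h>

#include <vmmapi.h>

int
main(int argc, char *argv[])
{
	struct vmctx *ctx;
	uint64_t info1, info2;

	if (argc != 2)
		errx(1, "usage: intinfo <vmname>");

	/* Attach to an existing bhyve VM. */
	ctx = vm_open(argv[1]);
	if (ctx == NULL)
		err(1, "vm_open(%s)", argv[1]);

	/* info1: event interrupted by the last VM-exit; info2: pending exception. */
	if (vm_get_intinfo(ctx, 0, &info1, &info2) != 0)
		err(1, "vm_get_intinfo");
	printf("vcpu0 exitintinfo %#lx, pending exception %#lx\n", info1, info2);

	/* Writing 0 tells vmm(4) that the interrupted event was fully delivered. */
	if (vm_set_intinfo(ctx, 0, 0) != 0)
		err(1, "vm_set_intinfo");

	return (0);
}
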
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 067eaa0..fbb6ddd 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -66,6 +66,8 @@ int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t base, uint32_t limit, uint32_t access);
int vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t *base, uint32_t *limit, uint32_t *access);
+int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg,
+ struct seg_desc *seg_desc);
int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
@@ -104,6 +106,9 @@ int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot,
int func, int idx, uint64_t addr, uint64_t msg,
uint32_t vector_control);
+int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2);
+int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo);
+
/*
* Return a pointer to the statistics buffer. Note that this is not MT-safe.
*/
@@ -121,7 +126,7 @@ int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);
* The 'iovcnt' should be big enough to accomodate all GPA segments.
* Returns 0 on success, 1 on a guest fault condition and -1 otherwise.
*/
-int vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg,
uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt);
void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
void *host_dst, size_t len);
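
[Editor's note: the renamed vm_copy_setup() above splits a guest linear range into host iovecs that vm_copyin()/vm_copyout() then consume. A hedged sketch of the intended calling pattern; the helper name, iovec count and header set are assumptions, not part of the change.]

#include <sys/types.h>
#include <sys/param.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <machine/vmm.h>

#include <vmmapi.h>

/*
 * Copy 'len' bytes starting at guest linear address 'gla' into 'buf'.
 * 'paging' describes the vcpu's current paging state (e.g. as reported
 * in a vm_exit).  Returns 0 on success, 1 if a fault was injected into
 * the guest (resume the vcpu and retry), and -1 on error.
 */
static int
read_guest_linear(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len)
{
	struct iovec iov[8];	/* enough for a copy spanning several pages */
	int error;

	error = vm_copy_setup(ctx, vcpu, paging, gla, len, PROT_READ,
	    iov, nitems(iov));
	if (error == 0)
		vm_copyin(ctx, vcpu, iov, buf, len);
	return (error);
}
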
diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c
index 957120c..74be82c 100644
--- a/sys/amd64/amd64/identcpu.c
+++ b/sys/amd64/amd64/identcpu.c
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <machine/specialreg.h>
#include <machine/md_var.h>
+#include <amd64/vmm/intel/vmx_controls.h>
#include <x86/isa/icu.h>
/* XXX - should be in header file: */
@@ -73,6 +74,7 @@ static u_int find_cpu_vendor_id(void);
static void print_AMD_info(void);
static void print_AMD_assoc(int i);
static void print_via_padlock_info(void);
+static void print_vmx_info(void);
int cpu_class;
char machine[] = "amd64";
@@ -428,6 +430,9 @@ printcpuinfo(void)
if (via_feature_rng != 0 || via_feature_xcrypt != 0)
print_via_padlock_info();
+ if (cpu_feature2 & CPUID2_VMX)
+ print_vmx_info();
+
if ((cpu_feature & CPUID_HTT) &&
cpu_vendor_id == CPU_VENDOR_AMD)
cpu_feature &= ~CPUID_HTT;
@@ -722,3 +727,197 @@ print_via_padlock_info(void)
"\015RSA" /* PMM */
);
}
+
+static uint32_t
+vmx_settable(uint64_t basic, int msr, int true_msr)
+{
+ uint64_t val;
+
+ if (basic & (1UL << 55))
+ val = rdmsr(true_msr);
+ else
+ val = rdmsr(msr);
+
+ /* Just report the controls that can be set to 1. */
+ return (val >> 32);
+}
+
+static void
+print_vmx_info(void)
+{
+ uint64_t basic, msr;
+ uint32_t entry, exit, mask, pin, proc, proc2;
+ int comma;
+
+ printf("\n VT-x: ");
+ msr = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if (!(msr & IA32_FEATURE_CONTROL_VMX_EN))
+ printf("(disabled in BIOS) ");
+ basic = rdmsr(MSR_VMX_BASIC);
+ pin = vmx_settable(basic, MSR_VMX_PINBASED_CTLS,
+ MSR_VMX_TRUE_PINBASED_CTLS);
+ proc = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS);
+ if (proc & PROCBASED_SECONDARY_CONTROLS)
+ proc2 = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2);
+ else
+ proc2 = 0;
+ exit = vmx_settable(basic, MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS);
+ entry = vmx_settable(basic, MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS);
+
+ if (!bootverbose) {
+ comma = 0;
+ if (exit & VM_EXIT_SAVE_PAT && exit & VM_EXIT_LOAD_PAT &&
+ entry & VM_ENTRY_LOAD_PAT) {
+ printf("%sPAT", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc & PROCBASED_HLT_EXITING) {
+ printf("%sHLT", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc & PROCBASED_MTF) {
+ printf("%sMTF", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc & PROCBASED_PAUSE_EXITING) {
+ printf("%sPAUSE", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc2 & PROCBASED2_ENABLE_EPT) {
+ printf("%sEPT", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc2 & PROCBASED2_UNRESTRICTED_GUEST) {
+ printf("%sUG", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc2 & PROCBASED2_ENABLE_VPID) {
+ printf("%sVPID", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc & PROCBASED_USE_TPR_SHADOW &&
+ proc2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES &&
+ proc2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE &&
+ proc2 & PROCBASED2_APIC_REGISTER_VIRTUALIZATION &&
+ proc2 & PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY) {
+ printf("%sVID", comma ? "," : "");
+ comma = 1;
+ if (pin & PINBASED_POSTED_INTERRUPT)
+ printf(",PostIntr");
+ }
+ return;
+ }
+
+ mask = basic >> 32;
+ printf("Basic Features=0x%b", mask,
+ "\020"
+ "\02132PA" /* 32-bit physical addresses */
+ "\022SMM" /* SMM dual-monitor */
+ "\027INS/OUTS" /* VM-exit info for INS and OUTS */
+ "\030TRUE" /* TRUE_CTLS MSRs */
+ );
+ printf("\n Pin-Based Controls=0x%b", pin,
+ "\020"
+ "\001ExtINT" /* External-interrupt exiting */
+ "\004NMI" /* NMI exiting */
+ "\006VNMI" /* Virtual NMIs */
+ "\007PreTmr" /* Activate VMX-preemption timer */
+ "\010PostIntr" /* Process posted interrupts */
+ );
+ printf("\n Primary Processor Controls=0x%b", proc,
+ "\020"
+ "\003INTWIN" /* Interrupt-window exiting */
+ "\004TSCOff" /* Use TSC offsetting */
+ "\010HLT" /* HLT exiting */
+ "\012INVLPG" /* INVLPG exiting */
+ "\013MWAIT" /* MWAIT exiting */
+ "\014RDPMC" /* RDPMC exiting */
+ "\015RDTSC" /* RDTSC exiting */
+ "\020CR3-LD" /* CR3-load exiting */
+ "\021CR3-ST" /* CR3-store exiting */
+ "\024CR8-LD" /* CR8-load exiting */
+ "\025CR8-ST" /* CR8-store exiting */
+ "\026TPR" /* Use TPR shadow */
+ "\027NMIWIN" /* NMI-window exiting */
+ "\030MOV-DR" /* MOV-DR exiting */
+ "\031IO" /* Unconditional I/O exiting */
+ "\032IOmap" /* Use I/O bitmaps */
+ "\034MTF" /* Monitor trap flag */
+ "\035MSRmap" /* Use MSR bitmaps */
+ "\036MONITOR" /* MONITOR exiting */
+ "\037PAUSE" /* PAUSE exiting */
+ );
+ if (proc & PROCBASED_SECONDARY_CONTROLS)
+ printf("\n Secondary Processor Controls=0x%b", proc2,
+ "\020"
+ "\001APIC" /* Virtualize APIC accesses */
+ "\002EPT" /* Enable EPT */
+ "\003DT" /* Descriptor-table exiting */
+ "\004RDTSCP" /* Enable RDTSCP */
+ "\005x2APIC" /* Virtualize x2APIC mode */
+ "\006VPID" /* Enable VPID */
+ "\007WBINVD" /* WBINVD exiting */
+ "\010UG" /* Unrestricted guest */
+ "\011APIC-reg" /* APIC-register virtualization */
+ "\012VID" /* Virtual-interrupt delivery */
+ "\013PAUSE-loop" /* PAUSE-loop exiting */
+ "\014RDRAND" /* RDRAND exiting */
+ "\015INVPCID" /* Enable INVPCID */
+ "\016VMFUNC" /* Enable VM functions */
+ "\017VMCS" /* VMCS shadowing */
+ "\020EPT#VE" /* EPT-violation #VE */
+ "\021XSAVES" /* Enable XSAVES/XRSTORS */
+ );
+ printf("\n Exit Controls=0x%b", mask,
+ "\020"
+ "\003DR" /* Save debug controls */
+ /* Ignore Host address-space size */
+ "\015PERF" /* Load MSR_PERF_GLOBAL_CTRL */
+ "\020AckInt" /* Acknowledge interrupt on exit */
+ "\023PAT-SV" /* Save MSR_PAT */
+ "\024PAT-LD" /* Load MSR_PAT */
+ "\025EFER-SV" /* Save MSR_EFER */
+ "\026EFER-LD" /* Load MSR_EFER */
+ "\027PTMR-SV" /* Save VMX-preemption timer value */
+ );
+ printf("\n Entry Controls=0x%b", mask,
+ "\020"
+ "\003DR" /* Save debug controls */
+ /* Ignore IA-32e mode guest */
+ /* Ignore Entry to SMM */
+ /* Ignore Deactivate dual-monitor treatment */
+ "\016PERF" /* Load MSR_PERF_GLOBAL_CTRL */
+ "\017PAT" /* Load MSR_PAT */
+ "\020EFER" /* Load MSR_EFER */
+ );
+ if (proc & PROCBASED_SECONDARY_CONTROLS &&
+ (proc2 & (PROCBASED2_ENABLE_EPT | PROCBASED2_ENABLE_VPID)) != 0) {
+ msr = rdmsr(MSR_VMX_EPT_VPID_CAP);
+ mask = msr;
+ printf("\n EPT Features=0x%b", mask,
+ "\020"
+ "\001XO" /* Execute-only translations */
+ "\007PW4" /* Page-walk length of 4 */
+ "\011UC" /* EPT paging-structure mem can be UC */
+ "\017WB" /* EPT paging-structure mem can be WB */
+ "\0212M" /* EPT PDE can map a 2-Mbyte page */
+ "\0221G" /* EPT PDPTE can map a 1-Gbyte page */
+ "\025INVEPT" /* INVEPT is supported */
+ "\026AD" /* Accessed and dirty flags for EPT */
+ "\032single" /* INVEPT single-context type */
+ "\033all" /* INVEPT all-context type */
+ );
+ mask = msr >> 32;
+ printf("\n VPID Features=0x%b", mask,
+ "\020"
+ "\001INVVPID" /* INVVPID is supported */
+ "\011individual" /* INVVPID individual-address type */
+ "\012single" /* INVVPID single-context type */
+ "\013all" /* INVVPID all-context type */
+ /* INVVPID single-context-retaining-globals type */
+ "\014single-globals"
+ );
+ }
+}
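
[Editor's note: the feature strings above rely on the kernel printf(9) "%b" conversion: the first byte of the pattern is the output base (\020 = 16, i.e. hex) and each following entry is a 1-origin bit number in octal followed by the flag name. An illustrative kernel-context fragment, with a made-up register value, showing how one dmesg line is produced.]

/*
 * With pin = 0x7f (bits 0-6 set), printf(9)'s "%b" prints the value in
 * hex followed by the names of the set bits that appear in the pattern.
 */
uint32_t pin = 0x7f;

printf("  Pin-Based Controls=0x%b\n", pin,
    "\020"		/* print the value in base 16 */
    "\001ExtINT"	/* bit 0: External-interrupt exiting */
    "\004NMI"		/* bit 3: NMI exiting */
    "\006VNMI"		/* bit 5: Virtual NMIs */
    "\007PreTmr"	/* bit 6: VMX-preemption timer */
    "\010PostIntr");	/* bit 7: posted interrupts */

/* Output:   Pin-Based Controls=0x7f<ExtINT,NMI,VNMI,PreTmr> */
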
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 5a359e9..63a9b3f 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -29,11 +29,14 @@
#ifndef _VMM_H_
#define _VMM_H_
+#include <x86/segments.h>
+
enum vm_suspend_how {
VM_SUSPEND_NONE,
VM_SUSPEND_RESET,
VM_SUSPEND_POWEROFF,
VM_SUSPEND_HALT,
+ VM_SUSPEND_TRIPLEFAULT,
VM_SUSPEND_LAST
};
@@ -75,6 +78,10 @@ enum vm_reg_name {
VM_REG_GUEST_GDTR,
VM_REG_GUEST_EFER,
VM_REG_GUEST_CR2,
+ VM_REG_GUEST_PDPTE0,
+ VM_REG_GUEST_PDPTE1,
+ VM_REG_GUEST_PDPTE2,
+ VM_REG_GUEST_PDPTE3,
VM_REG_LAST
};
@@ -84,6 +91,16 @@ enum x2apic_state {
X2APIC_STATE_LAST
};
+#define VM_INTINFO_VECTOR(info) ((info) & 0xff)
+#define VM_INTINFO_DEL_ERRCODE 0x800
+#define VM_INTINFO_RSVD 0x7ffff000
+#define VM_INTINFO_VALID 0x80000000
+#define VM_INTINFO_TYPE 0x700
+#define VM_INTINFO_HWINTR (0 << 8)
+#define VM_INTINFO_NMI (2 << 8)
+#define VM_INTINFO_HWEXCEPTION (3 << 8)
+#define VM_INTINFO_SWINTR (4 << 8)
+
#ifdef _KERNEL
#define VM_MAX_NAMELEN 32
@@ -99,6 +116,7 @@ struct vioapic;
struct vlapic;
struct vmspace;
struct vm_object;
+struct vm_guest_paging;
struct pmap;
typedef int (*vmm_init_func_t)(int ipinum);
@@ -252,6 +270,14 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
}
+#ifdef _SYS_PROC_H_
+static int __inline
+vcpu_should_yield(struct vm *vm, int vcpu)
+{
+ return (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED));
+}
+#endif
+
void *vcpu_stats(struct vm *vm, int vcpu);
void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
struct vmspace *vm_get_vmspace(struct vm *vm);
@@ -274,21 +300,63 @@ struct vatpit *vm_atpit(struct vm *vm);
int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);
/*
- * Returns 0 if there is no exception pending for this vcpu. Returns 1 if an
- * exception is pending and also updates 'vme'. The pending exception is
- * cleared when this function returns.
+ * This function is called after a VM-exit that occurred during exception or
+ * interrupt delivery through the IDT. The format of 'intinfo' is described
+ * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.
*
- * This function should only be called in the context of the thread that is
- * executing this vcpu.
+ * If a VM-exit handler completes the event delivery successfully then it
+ * should call vm_exit_intinfo() to extinguish the pending event. For e.g.,
+ * if the task switch emulation is triggered via a task gate then it should
+ * call this function with 'intinfo=0' to indicate that the external event
+ * is not pending anymore.
+ *
+ * Return value is 0 on success and non-zero on failure.
*/
-int vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *vme);
+int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo);
-void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */
-void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */
-void vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2);
+/*
+ * This function is called before every VM-entry to retrieve a pending
+ * event that should be injected into the guest. This function combines
+ * nested events into a double or triple fault.
+ *
+ * Returns 0 if there are no events that need to be injected into the guest
+ * and non-zero otherwise.
+ */
+int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
+
+int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
enum vm_reg_name vm_segment_name(int seg_encoding);
+struct vm_copyinfo {
+ uint64_t gpa;
+ size_t len;
+ void *hva;
+ void *cookie;
+};
+
+/*
+ * Set up 'copyinfo[]' to copy to/from guest linear address space starting
+ * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
+ * a copyin or PROT_WRITE for a copyout.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ *
+ * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
+ * the return value is 0. The 'copyinfo[]' resources should be freed by calling
+ * 'vm_copy_teardown()' after the copy is done.
+ */
+int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
+ int num_copyinfo);
+void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ int num_copyinfo);
+void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ void *kaddr, size_t len);
+void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
+ struct vm_copyinfo *copyinfo, size_t len);
#endif /* KERNEL */
#define VM_MAXCPU 16 /* maximum virtual cpus */
@@ -322,13 +390,16 @@ struct seg_desc {
uint32_t limit;
uint32_t access;
};
-#define SEG_DESC_TYPE(desc) ((desc)->access & 0x001f)
-#define SEG_DESC_PRESENT(desc) ((desc)->access & 0x0080)
-#define SEG_DESC_DEF32(desc) ((desc)->access & 0x4000)
-#define SEG_DESC_GRANULARITY(desc) ((desc)->access & 0x8000)
-#define SEG_DESC_UNUSABLE(desc) ((desc)->access & 0x10000)
+#define SEG_DESC_TYPE(access) ((access) & 0x001f)
+#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3)
+#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0)
+#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0)
+#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0)
+#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0)
enum vm_cpu_mode {
+ CPU_MODE_REAL,
+ CPU_MODE_PROTECTED,
CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */
CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */
};
@@ -364,11 +435,14 @@ struct vie {
uint8_t num_valid; /* size of the instruction */
uint8_t num_processed;
+ uint8_t addrsize:4, opsize:4; /* address and operand sizes */
uint8_t rex_w:1, /* REX prefix */
rex_r:1,
rex_x:1,
rex_b:1,
- rex_present:1;
+ rex_present:1,
+ opsize_override:1, /* Operand size override */
+ addrsize_override:1; /* Address size override */
uint8_t mod:2, /* ModRM byte */
reg:4,
@@ -410,6 +484,7 @@ enum vm_exitcode {
VM_EXITCODE_IOAPIC_EOI,
VM_EXITCODE_SUSPENDED,
VM_EXITCODE_INOUT_STR,
+ VM_EXITCODE_TASK_SWITCH,
VM_EXITCODE_MAX
};
@@ -434,6 +509,22 @@ struct vm_inout_str {
struct seg_desc seg_desc;
};
+enum task_switch_reason {
+ TSR_CALL,
+ TSR_IRET,
+ TSR_JMP,
+ TSR_IDT_GATE, /* task gate in IDT */
+};
+
+struct vm_task_switch {
+ uint16_t tsssel; /* new TSS selector */
+ int ext; /* task switch due to external event */
+ uint32_t errcode;
+ int errcode_valid; /* push 'errcode' on the new stack */
+ enum task_switch_reason reason;
+ struct vm_guest_paging paging;
+};
+
struct vm_exit {
enum vm_exitcode exitcode;
int inst_length; /* 0 means unknown */
@@ -448,6 +539,7 @@ struct vm_exit {
struct {
uint64_t gpa;
uint64_t gla;
+ int cs_d; /* CS.D */
struct vm_guest_paging paging;
struct vie vie;
} inst_emul;
@@ -487,7 +579,38 @@ struct vm_exit {
struct {
enum vm_suspend_how how;
} suspended;
+ struct vm_task_switch task_switch;
} u;
};
+/* APIs to inject faults into the guest */
+void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid,
+ int errcode);
+
+static void __inline
+vm_inject_ud(void *vm, int vcpuid)
+{
+ vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
+}
+
+static void __inline
+vm_inject_gp(void *vm, int vcpuid)
+{
+ vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
+}
+
+static void __inline
+vm_inject_ac(void *vm, int vcpuid, int errcode)
+{
+ vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
+}
+
+static void __inline
+vm_inject_ss(void *vm, int vcpuid, int errcode)
+{
+ vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
+}
+
+void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
+
#endif /* _VMM_H_ */
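
[Editor's note: the VM_INTINFO_* macros above encode an event in the layout shared by VT-x event injection and SVM's EXITINTINFO: vector in bits 7:0, type in bits 10:8, error-code-valid in bit 11, valid in bit 31, error code in bits 63:32. A small hedged helper, not part of the commit, showing how such a value is composed.]

/*
 * Illustrative helper: build an intinfo word in the format consumed by
 * vm_exit_intinfo()/vm_entry_intinfo().
 */
static uint64_t
make_intinfo(int vector, uint64_t type, int errcode_valid, uint32_t errcode)
{
	uint64_t info;

	info = (vector & 0xff) | type | VM_INTINFO_VALID;
	if (errcode_valid) {
		info |= VM_INTINFO_DEL_ERRCODE;
		info |= (uint64_t)errcode << 32;
	}
	return (info);
}

/*
 * Example: a #GP with error code 0 is
 *     make_intinfo(IDT_GP, VM_INTINFO_HWEXCEPTION, 1, 0)
 * and a vectored NMI is
 *     make_intinfo(IDT_NMI, VM_INTINFO_NMI, 0, 0).
 */
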
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index 9b3b00d..e4d839e 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -189,6 +189,12 @@ struct vm_cpuset {
#define VM_ACTIVE_CPUS 0
#define VM_SUSPENDED_CPUS 1
+struct vm_intinfo {
+ int vcpuid;
+ uint64_t info1;
+ uint64_t info2;
+};
+
enum {
/* general routines */
IOCNUM_ABIVERS = 0,
@@ -211,6 +217,8 @@ enum {
IOCNUM_GET_SEGMENT_DESCRIPTOR = 23,
/* interrupt injection */
+ IOCNUM_GET_INTINFO = 28,
+ IOCNUM_SET_INTINFO = 29,
IOCNUM_INJECT_EXCEPTION = 30,
IOCNUM_LAPIC_IRQ = 31,
IOCNUM_INJECT_NMI = 32,
@@ -324,4 +332,8 @@ enum {
_IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu)
#define VM_GET_CPUS \
_IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset)
+#define VM_SET_INTINFO \
+ _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo)
+#define VM_GET_INTINFO \
+ _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo)
#endif
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
index e4c408b..bbd3d88 100644
--- a/sys/amd64/include/vmm_instruction_emul.h
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -52,8 +52,8 @@ typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
* s
*/
int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
- mem_region_read_t mrr, mem_region_write_t mrw,
- void *mrarg);
+ struct vm_guest_paging *paging, mem_region_read_t mrr,
+ mem_region_write_t mrw, void *mrarg);
int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
uint64_t val, int size);
@@ -108,7 +108,7 @@ void vie_init(struct vie *vie);
*/
#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */
int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
- enum vm_cpu_mode cpu_mode, struct vie *vie);
+ enum vm_cpu_mode cpu_mode, int csd, struct vie *vie);
#endif /* _KERNEL */
#endif /* _VMM_INSTRUCTION_EMUL_H_ */
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
index cc97d95..51e5c2c 100644
--- a/sys/amd64/vmm/intel/vmcs.c
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -103,6 +103,14 @@ vmcs_field_encoding(int ident)
return (VMCS_GUEST_LDTR_SELECTOR);
case VM_REG_GUEST_EFER:
return (VMCS_GUEST_IA32_EFER);
+ case VM_REG_GUEST_PDPTE0:
+ return (VMCS_GUEST_PDPTE0);
+ case VM_REG_GUEST_PDPTE1:
+ return (VMCS_GUEST_PDPTE1);
+ case VM_REG_GUEST_PDPTE2:
+ return (VMCS_GUEST_PDPTE2);
+ case VM_REG_GUEST_PDPTE3:
+ return (VMCS_GUEST_PDPTE3);
default:
return (-1);
}
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index 657d5b0..4e9557c 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -346,6 +346,9 @@ vmcs_write(uint32_t encoding, uint64_t val)
#define VMCS_INTR_T_HWINTR (0 << 8)
#define VMCS_INTR_T_NMI (2 << 8)
#define VMCS_INTR_T_HWEXCEPTION (3 << 8)
+#define VMCS_INTR_T_SWINTR (4 << 8)
+#define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8)
+#define VMCS_INTR_T_SWEXCEPTION (6 << 8)
#define VMCS_INTR_DEL_ERRCODE (1 << 11)
/*
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 2cbb159..b2c5702 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -149,8 +149,6 @@ SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
&cr4_zeros_mask, 0, NULL);
-static int vmx_no_patmsr;
-
static int vmx_initialized;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
&vmx_initialized, 0, "Intel VMX initialized");
@@ -158,18 +156,38 @@ SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
/*
* Optional capabilities
*/
+static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
+
+static int vmx_patmsr;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, patmsr, CTLFLAG_RD, &vmx_patmsr, 0,
+ "PAT MSR saved and restored in VCMS");
+
static int cap_halt_exit;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
+ "HLT triggers a VM-exit");
+
static int cap_pause_exit;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
+ 0, "PAUSE triggers a VM-exit");
+
static int cap_unrestricted_guest;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
+ &cap_unrestricted_guest, 0, "Unrestricted guests");
+
static int cap_monitor_trap;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
+ &cap_monitor_trap, 0, "Monitor trap flag");
+
static int cap_invpcid;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
+ 0, "Guests are allowed to use INVPCID");
static int virtual_interrupt_delivery;
-SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
&virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
static int posted_interrupts;
-SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
&posted_interrupts, 0, "APICv posted interrupt support");
static int pirvec;
@@ -618,6 +636,7 @@ vmx_init(int ipinum)
}
/* Check support for VM-exit controls */
+ vmx_patmsr = 1;
error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
VM_EXIT_CTLS_ONE_SETTING,
VM_EXIT_CTLS_ZERO_SETTING,
@@ -637,12 +656,12 @@ vmx_init(int ipinum)
if (bootverbose)
printf("vmm: PAT MSR access not supported\n");
guest_msr_valid(MSR_PAT);
- vmx_no_patmsr = 1;
+ vmx_patmsr = 0;
}
}
/* Check support for VM-entry controls */
- if (!vmx_no_patmsr) {
+ if (vmx_patmsr) {
error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
MSR_VMX_TRUE_ENTRY_CTLS,
VM_ENTRY_CTLS_ONE_SETTING,
@@ -918,7 +937,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
* MSR_PAT save/restore support, leave access disabled so accesses
* will be trapped.
*/
- if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
+ if (vmx_patmsr && guest_msr_rw(vmx, MSR_PAT))
panic("vmx_vminit: error setting guest pat msr access");
vpid_alloc(vpid, VM_MAXCPU);
@@ -974,7 +993,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
vmx->cap[i].proc_ctls = procbased_ctls;
vmx->cap[i].proc_ctls2 = procbased_ctls2;
- vmx->state[i].lastcpu = -1;
+ vmx->state[i].lastcpu = NOCPU;
vmx->state[i].vpid = vpid[i];
msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
@@ -1047,27 +1066,37 @@ vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
}
static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
+static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
-static void
-vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
+/*
+ * Invalidate guest mappings identified by its vpid from the TLB.
+ */
+static __inline void
+vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
{
struct vmxstate *vmxstate;
struct invvpid_desc invvpid_desc;
vmxstate = &vmx->state[vcpu];
- if (vmxstate->lastcpu == curcpu)
+ if (vmxstate->vpid == 0)
return;
- vmxstate->lastcpu = curcpu;
-
- vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+ if (!running) {
+ /*
+ * Set the 'lastcpu' to an invalid host cpu.
+ *
+ * This will invalidate TLB entries tagged with the vcpu's
+ * vpid the next time it runs via vmx_set_pcpu_defaults().
+ */
+ vmxstate->lastcpu = NOCPU;
+ return;
+ }
- vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
- vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
- vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+ KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
+ "critical section", __func__, vcpu));
/*
- * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
+ * Invalidate all mappings tagged with 'vpid'
*
* We do this because this vcpu was executing on a different host
* cpu when it last ran. We do not track whether it invalidated
@@ -1081,25 +1110,43 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
* Note also that this will invalidate mappings tagged with 'vpid'
* for "all" EP4TAs.
*/
- if (vmxstate->vpid != 0) {
- if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
- invvpid_desc._res1 = 0;
- invvpid_desc._res2 = 0;
- invvpid_desc.vpid = vmxstate->vpid;
- invvpid_desc.linear_addr = 0;
- invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
- } else {
- /*
- * The invvpid can be skipped if an invept is going to
- * be performed before entering the guest. The invept
- * will invalidate combined mappings tagged with
- * 'vmx->eptp' for all vpids.
- */
- vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
- }
+ if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
+ invvpid_desc._res1 = 0;
+ invvpid_desc._res2 = 0;
+ invvpid_desc.vpid = vmxstate->vpid;
+ invvpid_desc.linear_addr = 0;
+ invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
+ } else {
+ /*
+ * The invvpid can be skipped if an invept is going to
+ * be performed before entering the guest. The invept
+ * will invalidate combined mappings tagged with
+ * 'vmx->eptp' for all vpids.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
}
}
+static void
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
+{
+ struct vmxstate *vmxstate;
+
+ vmxstate = &vmx->state[vcpu];
+ if (vmxstate->lastcpu == curcpu)
+ return;
+
+ vmxstate->lastcpu = curcpu;
+
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+
+ vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
+ vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
+ vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+ vmx_invvpid(vmx, vcpu, pmap, 1);
+}
+
/*
* We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
*/
@@ -1183,24 +1230,32 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu)
static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
{
- struct vm_exception exc;
int vector, need_nmi_exiting, extint_pending;
- uint64_t rflags;
+ uint64_t rflags, entryinfo;
uint32_t gi, info;
- if (vm_exception_pending(vmx->vm, vcpu, &exc)) {
- KASSERT(exc.vector >= 0 && exc.vector < 32,
- ("%s: invalid exception vector %d", __func__, exc.vector));
+ if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
+ KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
+ "intinfo is not valid: %#lx", __func__, entryinfo));
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
- "pending exception %d: %#x", __func__, exc.vector, info));
+ "pending exception: %#lx/%#x", __func__, entryinfo, info));
- info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID;
- if (exc.error_code_valid) {
- info |= VMCS_INTR_DEL_ERRCODE;
- vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code);
+ info = entryinfo;
+ vector = info & 0xff;
+ if (vector == IDT_BP || vector == IDT_OF) {
+ /*
+ * VT-x requires #BP and #OF to be injected as software
+ * exceptions.
+ */
+ info &= ~VMCS_INTR_T_MASK;
+ info |= VMCS_INTR_T_SWEXCEPTION;
}
+
+ if (info & VMCS_INTR_DEL_ERRCODE)
+ vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
+
vmcs_write(VMCS_ENTRY_INTR_INFO, info);
}
@@ -1379,6 +1434,16 @@ vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
}
+static void
+vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
+{
+ uint32_t gi;
+
+ gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
+ KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
+ ("NMI blocking is not in effect %#x", gi));
+}
+
static int
vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
@@ -1659,11 +1724,19 @@ vmx_cpl(void)
static enum vm_cpu_mode
vmx_cpu_mode(void)
{
+ uint32_t csar;
- if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA)
- return (CPU_MODE_64BIT);
- else
- return (CPU_MODE_COMPATIBILITY);
+ if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
+ csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
+ if (csar & 0x2000)
+ return (CPU_MODE_64BIT); /* CS.L = 1 */
+ else
+ return (CPU_MODE_COMPATIBILITY);
+ } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
+ return (CPU_MODE_PROTECTED);
+ } else {
+ return (CPU_MODE_REAL);
+ }
}
static enum vm_paging_mode
@@ -1757,10 +1830,25 @@ vmx_paging_info(struct vm_guest_paging *paging)
static void
vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
{
+ struct vm_guest_paging *paging;
+ uint32_t csar;
+
+ paging = &vmexit->u.inst_emul.paging;
+
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
vmexit->u.inst_emul.gpa = gpa;
vmexit->u.inst_emul.gla = gla;
- vmx_paging_info(&vmexit->u.inst_emul.paging);
+ vmx_paging_info(paging);
+ switch (paging->cpu_mode) {
+ case CPU_MODE_PROTECTED:
+ case CPU_MODE_COMPATIBILITY:
+ csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
+ vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
+ break;
+ default:
+ vmexit->u.inst_emul.cs_d = 0;
+ break;
+ }
}
static int
@@ -1969,6 +2057,26 @@ vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
return (UNHANDLED);
}
+static enum task_switch_reason
+vmx_task_switch_reason(uint64_t qual)
+{
+ int reason;
+
+ reason = (qual >> 30) & 0x3;
+ switch (reason) {
+ case 0:
+ return (TSR_CALL);
+ case 1:
+ return (TSR_IRET);
+ case 2:
+ return (TSR_JMP);
+ case 3:
+ return (TSR_IDT_GATE);
+ default:
+ panic("%s: invalid reason %d", __func__, reason);
+ }
+}
+
static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
@@ -1976,9 +2084,10 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
struct vmxctx *vmxctx;
struct vlapic *vlapic;
struct vm_inout_str *vis;
+ struct vm_task_switch *ts;
uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
- uint32_t reason;
- uint64_t qual, gpa;
+ uint32_t intr_type, reason;
+ uint64_t exitintinfo, qual, gpa;
bool retu;
CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
@@ -1994,46 +2103,99 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
/*
- * VM exits that could be triggered during event injection on the
- * previous VM entry need to be handled specially by re-injecting
- * the event.
+ * VM exits that can be triggered during event delivery need to
+ * be handled specially by re-injecting the event if the IDT
+ * vectoring information field's valid bit is set.
*
* See "Information for VM Exits During Event Delivery" in Intel SDM
* for details.
*/
- switch (reason) {
- case EXIT_REASON_EPT_FAULT:
- case EXIT_REASON_EPT_MISCONFIG:
- case EXIT_REASON_APIC_ACCESS:
- case EXIT_REASON_TASK_SWITCH:
- case EXIT_REASON_EXCEPTION:
- idtvec_info = vmcs_idt_vectoring_info();
- if (idtvec_info & VMCS_IDT_VEC_VALID) {
- idtvec_info &= ~(1 << 12); /* clear undefined bit */
- vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
- if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
- idtvec_err = vmcs_idt_vectoring_err();
- vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
- idtvec_err);
- }
- /*
- * If 'virtual NMIs' are being used and the VM-exit
- * happened while injecting an NMI during the previous
- * VM-entry, then clear "blocking by NMI" in the Guest
- * Interruptibility-state.
- */
- if ((idtvec_info & VMCS_INTR_T_MASK) ==
- VMCS_INTR_T_NMI) {
- vmx_clear_nmi_blocking(vmx, vcpu);
- }
+ idtvec_info = vmcs_idt_vectoring_info();
+ if (idtvec_info & VMCS_IDT_VEC_VALID) {
+ idtvec_info &= ~(1 << 12); /* clear undefined bit */
+ exitintinfo = idtvec_info;
+ if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
+ idtvec_err = vmcs_idt_vectoring_err();
+ exitintinfo |= (uint64_t)idtvec_err << 32;
+ }
+ error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
+ KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
+ __func__, error));
+
+ /*
+ * If 'virtual NMIs' are being used and the VM-exit
+ * happened while injecting an NMI during the previous
+ * VM-entry, then clear "blocking by NMI" in the
+ * Guest Interruptibility-State so the NMI can be
+ * reinjected on the subsequent VM-entry.
+ *
+ * However, if the NMI was being delivered through a task
+ * gate, then the new task must start execution with NMIs
+ * blocked so don't clear NMI blocking in this case.
+ */
+ intr_type = idtvec_info & VMCS_INTR_T_MASK;
+ if (intr_type == VMCS_INTR_T_NMI) {
+ if (reason != EXIT_REASON_TASK_SWITCH)
+ vmx_clear_nmi_blocking(vmx, vcpu);
+ else
+ vmx_assert_nmi_blocking(vmx, vcpu);
+ }
+
+ /*
+ * Update VM-entry instruction length if the event being
+ * delivered was a software interrupt or software exception.
+ */
+ if (intr_type == VMCS_INTR_T_SWINTR ||
+ intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
+ intr_type == VMCS_INTR_T_SWEXCEPTION) {
vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
}
- default:
- idtvec_info = 0;
- break;
}
switch (reason) {
+ case EXIT_REASON_TASK_SWITCH:
+ ts = &vmexit->u.task_switch;
+ ts->tsssel = qual & 0xffff;
+ ts->reason = vmx_task_switch_reason(qual);
+ ts->ext = 0;
+ ts->errcode_valid = 0;
+ vmx_paging_info(&ts->paging);
+ /*
+ * If the task switch was due to a CALL, JMP, IRET, software
+ * interrupt (INT n) or software exception (INT3, INTO),
+ * then the saved %rip references the instruction that caused
+ * the task switch. The instruction length field in the VMCS
+ * is valid in this case.
+ *
+ * In all other cases (e.g., NMI, hardware exception) the
+ * saved %rip is one that would have been saved in the old TSS
+ * had the task switch completed normally so the instruction
+ * length field is not needed in this case and is explicitly
+ * set to 0.
+ */
+ if (ts->reason == TSR_IDT_GATE) {
+ KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
+ ("invalid idtvec_info %#x for IDT task switch",
+ idtvec_info));
+ intr_type = idtvec_info & VMCS_INTR_T_MASK;
+ if (intr_type != VMCS_INTR_T_SWINTR &&
+ intr_type != VMCS_INTR_T_SWEXCEPTION &&
+ intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
+ /* Task switch triggered by external event */
+ ts->ext = 1;
+ vmexit->inst_length = 0;
+ if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
+ ts->errcode_valid = 1;
+ ts->errcode = vmcs_idt_vectoring_err();
+ }
+ }
+ }
+ vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
+ VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
+ "%s errcode 0x%016lx", ts->reason, ts->tsssel,
+ ts->ext ? "external" : "internal",
+ ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
+ break;
case EXIT_REASON_CR_ACCESS:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
switch (qual & 0xf) {
@@ -2179,6 +2341,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
* the guest.
*
* See "Resuming Guest Software after Handling an Exception".
+ * See "Information for VM Exits Due to Vectored Events".
*/
if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
(intr_info & 0xff) != IDT_DF &&
@@ -2396,6 +2559,13 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
* pmap_invalidate_ept().
*/
disable_intr();
+ vmx_inject_interrupts(vmx, vcpu, vlapic);
+
+ /*
+ * Check for vcpu suspension after injecting events because
+ * vmx_inject_interrupts() can suspend the vcpu due to a
+ * triple fault.
+ */
if (vcpu_suspended(suspend_cookie)) {
enable_intr();
vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip());
@@ -2408,7 +2578,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
break;
}
- if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
+ if (vcpu_should_yield(vm, vcpu)) {
enable_intr();
vm_exit_astpending(vmx->vm, vcpu, vmcs_guest_rip());
vmx_astpending_trace(vmx, vcpu, vmexit->rip);
@@ -2416,7 +2586,6 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
break;
}
- vmx_inject_interrupts(vmx, vcpu, vlapic);
vmx_run_trace(vmx, vcpu);
rc = vmx_enter_guest(vmxctx, vmx, launched);
@@ -2584,6 +2753,7 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
int error, hostcpu, running, shadow;
uint64_t ctls;
+ pmap_t pmap;
struct vmx *vmx = arg;
running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
@@ -2621,6 +2791,18 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
error = vmcs_setreg(&vmx->vmcs[vcpu], running,
VMCS_IDENT(shadow), val);
}
+
+ if (reg == VM_REG_GUEST_CR3) {
+ /*
+ * Invalidate the guest vcpu's TLB mappings to emulate
+ * the behavior of updating %cr3.
+ *
+ * XXX the processor retains global mappings when %cr3
+ * is updated but vmx_invvpid() does not.
+ */
+ pmap = vmx->ctx[vcpu].pmap;
+ vmx_invvpid(vmx, vcpu, pmap, running);
+ }
}
return (error);
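
[Editor's note: for the EXIT_REASON_TASK_SWITCH handling above, the VM-exit qualification packs the new TSS selector into bits 15:0 and the task-switch source into bits 31:30 (0 = CALL, 1 = IRET, 2 = JMP, 3 = task gate in the IDT). A worked example with made-up values, mirroring the decode done by vmx_exit_process() and vmx_task_switch_reason().]

/* Illustrative decode of a task-switch exit qualification. */
uint64_t qual = (3UL << 30) | 0x0028;	/* task gate in IDT, TSS selector 0x28 */

uint16_t tsssel = qual & 0xffff;	/* 0x0028 */
int source = (qual >> 30) & 0x3;	/* 3 -> TSR_IDT_GATE */
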
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
index 2aba63c..a3428db 100644
--- a/sys/amd64/vmm/intel/vmx_msr.c
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
#include "vmx_msr.h"
diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h
index e6379a9..340b0f7 100644
--- a/sys/amd64/vmm/intel/vmx_msr.h
+++ b/sys/amd64/vmm/intel/vmx_msr.h
@@ -29,29 +29,6 @@
#ifndef _VMX_MSR_H_
#define _VMX_MSR_H_
-#define MSR_VMX_BASIC 0x480
-#define MSR_VMX_EPT_VPID_CAP 0x48C
-
-#define MSR_VMX_PROCBASED_CTLS 0x482
-#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E
-
-#define MSR_VMX_PINBASED_CTLS 0x481
-#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D
-
-#define MSR_VMX_PROCBASED_CTLS2 0x48B
-
-#define MSR_VMX_EXIT_CTLS 0x483
-#define MSR_VMX_TRUE_EXIT_CTLS 0x48f
-
-#define MSR_VMX_ENTRY_CTLS 0x484
-#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
-
-#define MSR_VMX_CR0_FIXED0 0x486
-#define MSR_VMX_CR0_FIXED1 0x487
-
-#define MSR_VMX_CR4_FIXED0 0x488
-#define MSR_VMX_CR4_FIXED1 0x489
-
uint32_t vmx_revision(void);
int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c
index ca76ea8..f5ef71b 100644
--- a/sys/amd64/vmm/intel/vtd.c
+++ b/sys/amd64/vmm/intel/vtd.c
@@ -452,6 +452,11 @@ vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
ptpindex = 0;
ptpshift = 0;
+ KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__,
+ gpa, len));
+ KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond "
+ "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr));
+
if (gpa & PAGE_MASK)
panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c
index ee6fc84..38fc458 100644
--- a/sys/amd64/vmm/io/vatpic.c
+++ b/sys/amd64/vmm/io/vatpic.c
@@ -195,26 +195,29 @@ vatpic_notify_intr(struct vatpic *vatpic)
atpic->mask, atpic->request, atpic->service);
/*
+ * From Section 3.6.2, "Interrupt Modes", in the
+ * MPtable Specification, Version 1.4
+ *
* PIC interrupts are routed to both the Local APIC
* and the I/O APIC to support operation in 1 of 3
* modes.
*
* 1. Legacy PIC Mode: the PIC effectively bypasses
- * all APIC components. In mode '1' the local APIC is
+ * all APIC components. In this mode the local APIC is
* disabled and LINT0 is reconfigured as INTR to
* deliver the PIC interrupt directly to the CPU.
*
* 2. Virtual Wire Mode: the APIC is treated as a
* virtual wire which delivers interrupts from the PIC
- * to the CPU. In mode '2' LINT0 is programmed as
+ * to the CPU. In this mode LINT0 is programmed as
* ExtINT to indicate that the PIC is the source of
* the interrupt.
*
- * 3. Symmetric I/O Mode: PIC interrupts are fielded
- * by the I/O APIC and delivered to the appropriate
- * CPU. In mode '3' the I/O APIC input 0 is
- * programmed as ExtINT to indicate that the PIC is
- * the source of the interrupt.
+ * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are
+ * fielded by the I/O APIC and delivered to the appropriate
+ * CPU. In this mode the I/O APIC input 0 is programmed
+ * as ExtINT to indicate that the PIC is the source of the
+ * interrupt.
*/
atpic->intr_raised = true;
lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0);
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index c2a9fd1..fa0200e 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -97,6 +97,7 @@ struct vcpu {
int hostcpu; /* (o) vcpu's host cpu */
struct vlapic *vlapic; /* (i) APIC device model */
enum x2apic_state x2apic_state; /* (i) APIC mode */
+ uint64_t exitintinfo; /* (i) events pending at VM exit */
int nmi_pending; /* (i) NMI pending */
int extint_pending; /* (i) INTR pending */
struct vm_exception exception; /* (x) exception collateral */
@@ -242,6 +243,7 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create)
vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
+ vcpu->exitintinfo = 0;
vcpu->nmi_pending = 0;
vcpu->extint_pending = 0;
vcpu->exception_pending = 0;
@@ -571,6 +573,21 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
return (0);
}
+static vm_paddr_t
+vm_maxmem(struct vm *vm)
+{
+ int i;
+ vm_paddr_t gpa, maxmem;
+
+ maxmem = 0;
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len;
+ if (gpa > maxmem)
+ maxmem = gpa;
+ }
+ return (maxmem);
+}
+
static void
vm_gpa_unwire(struct vm *vm)
{
@@ -708,7 +725,7 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
if (ppt_assigned_devices(vm) == 0) {
KASSERT(vm->iommu == NULL,
("vm_assign_pptdev: iommu must be NULL"));
- maxaddr = vmm_mem_maxaddr();
+ maxaddr = vm_maxmem(vm);
vm->iommu = iommu_create_domain(maxaddr);
error = vm_gpa_wire(vm);
@@ -1104,6 +1121,10 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
}
}
+ /* Don't go to sleep if the vcpu thread needs to yield */
+ if (vcpu_should_yield(vm, vcpuid))
+ break;
+
/*
* Some Linux guests implement "halt" by having all vcpus
* execute HLT with interrupts disabled. 'halted_cpus' keeps
@@ -1127,7 +1148,11 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
t = ticks;
vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
- msleep_spin(vcpu, &vcpu->mtx, wmesg, 0);
+ /*
+ * XXX msleep_spin() cannot be interrupted by signals so
+ * wake up periodically to check pending signals.
+ */
+ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
vcpu_require_state_locked(vcpu, VCPU_FROZEN);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
@@ -1191,15 +1216,18 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
struct vm_guest_paging *paging;
mem_region_read_t mread;
mem_region_write_t mwrite;
- int error;
+ enum vm_cpu_mode cpu_mode;
+ int cs_d, error;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
gla = vme->u.inst_emul.gla;
gpa = vme->u.inst_emul.gpa;
+ cs_d = vme->u.inst_emul.cs_d;
vie = &vme->u.inst_emul.vie;
paging = &vme->u.inst_emul.paging;
+ cpu_mode = paging->cpu_mode;
vie_init(vie);
@@ -1213,7 +1241,7 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
else if (error != 0)
panic("%s: vmm_fetch_instruction error %d", __func__, error);
- if (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode, vie) != 0)
+ if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
return (EFAULT);
/* return to userland unless this is an in-kernel emulated device */
@@ -1231,8 +1259,8 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
return (0);
}
- error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
- retu);
+ error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
+ mread, mwrite, retu);
return (error);
}
@@ -1456,6 +1484,202 @@ restart:
}
int
+vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
+{
+ struct vcpu *vcpu;
+ int type, vector;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (info & VM_INTINFO_VALID) {
+ type = info & VM_INTINFO_TYPE;
+ vector = info & 0xff;
+ if (type == VM_INTINFO_NMI && vector != IDT_NMI)
+ return (EINVAL);
+ if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
+ return (EINVAL);
+ if (info & VM_INTINFO_RSVD)
+ return (EINVAL);
+ } else {
+ info = 0;
+ }
+ VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
+ vcpu->exitintinfo = info;
+ return (0);
+}
+
+enum exc_class {
+ EXC_BENIGN,
+ EXC_CONTRIBUTORY,
+ EXC_PAGEFAULT
+};
+
+#define IDT_VE 20 /* Virtualization Exception (Intel specific) */
+
+static enum exc_class
+exception_class(uint64_t info)
+{
+ int type, vector;
+
+ KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
+ type = info & VM_INTINFO_TYPE;
+ vector = info & 0xff;
+
+ /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
+ switch (type) {
+ case VM_INTINFO_HWINTR:
+ case VM_INTINFO_SWINTR:
+ case VM_INTINFO_NMI:
+ return (EXC_BENIGN);
+ default:
+ /*
+ * Hardware exception.
+ *
+ * SVM and VT-x use identical type values to represent NMI,
+ * hardware interrupt and software interrupt.
+ *
+ * SVM uses type '3' for all exceptions. VT-x uses type '3'
+ * for exceptions except #BP and #OF. #BP and #OF use a type
+ * value of '5' or '6'. Therefore we don't check for explicit
+ * values of 'type' to classify 'intinfo' into a hardware
+ * exception.
+ */
+ break;
+ }
+
+ switch (vector) {
+ case IDT_PF:
+ case IDT_VE:
+ return (EXC_PAGEFAULT);
+ case IDT_DE:
+ case IDT_TS:
+ case IDT_NP:
+ case IDT_SS:
+ case IDT_GP:
+ return (EXC_CONTRIBUTORY);
+ default:
+ return (EXC_BENIGN);
+ }
+}
+
+static int
+nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
+ uint64_t *retinfo)
+{
+ enum exc_class exc1, exc2;
+ int type1, vector1;
+
+ KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
+ KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
+
+ /*
+ * If an exception occurs while attempting to call the double-fault
+ * handler the processor enters shutdown mode (aka triple fault).
+ */
+ type1 = info1 & VM_INTINFO_TYPE;
+ vector1 = info1 & 0xff;
+ if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
+ VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
+ info1, info2);
+ vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
+ *retinfo = 0;
+ return (0);
+ }
+
+ /*
+ * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
+ */
+ exc1 = exception_class(info1);
+ exc2 = exception_class(info2);
+ if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
+ (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
+ /* Convert nested fault into a double fault. */
+ *retinfo = IDT_DF;
+ *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
+ *retinfo |= VM_INTINFO_DEL_ERRCODE;
+ } else {
+ /* Handle exceptions serially */
+ *retinfo = info2;
+ }
+ return (1);
+}
+
+static uint64_t
+vcpu_exception_intinfo(struct vcpu *vcpu)
+{
+ uint64_t info = 0;
+
+ if (vcpu->exception_pending) {
+ info = vcpu->exception.vector & 0xff;
+ info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
+ if (vcpu->exception.error_code_valid) {
+ info |= VM_INTINFO_DEL_ERRCODE;
+ info |= (uint64_t)vcpu->exception.error_code << 32;
+ }
+ }
+ return (info);
+}
+
+int
+vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
+{
+ struct vcpu *vcpu;
+ uint64_t info1, info2;
+ int valid;
+
+ KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ info1 = vcpu->exitintinfo;
+ vcpu->exitintinfo = 0;
+
+ info2 = 0;
+ if (vcpu->exception_pending) {
+ info2 = vcpu_exception_intinfo(vcpu);
+ vcpu->exception_pending = 0;
+ VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
+ vcpu->exception.vector, info2);
+ }
+
+ if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
+ valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
+ } else if (info1 & VM_INTINFO_VALID) {
+ *retinfo = info1;
+ valid = 1;
+ } else if (info2 & VM_INTINFO_VALID) {
+ *retinfo = info2;
+ valid = 1;
+ } else {
+ valid = 0;
+ }
+
+ if (valid) {
+ VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
+ "retinfo(%#lx)", __func__, info1, info2, *retinfo);
+ }
+
+ return (valid);
+}
+
+int
+vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ *info1 = vcpu->exitintinfo;
+ *info2 = vcpu_exception_intinfo(vcpu);
+ return (0);
+}
+
+int
vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
struct vcpu *vcpu;
@@ -1466,6 +1690,14 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
if (exception->vector < 0 || exception->vector >= 32)
return (EINVAL);
+ /*
+ * A double fault exception should never be injected directly into
+ * the guest. It is a derived exception that results from specific
+ * combinations of nested faults.
+ */
+ if (exception->vector == IDT_DF)
+ return (EINVAL);
+
vcpu = &vm->vcpu[vcpuid];
if (vcpu->exception_pending) {
@@ -1481,32 +1713,21 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
return (0);
}
-int
-vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
-{
- struct vcpu *vcpu;
- int pending;
-
- KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
-
- vcpu = &vm->vcpu[vcpuid];
- pending = vcpu->exception_pending;
- if (pending) {
- vcpu->exception_pending = 0;
- *exception = vcpu->exception;
- VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
- exception->vector);
- }
- return (pending);
-}
-
-static void
-vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
+void
+vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
+ int errcode)
{
+ struct vm_exception exception;
struct vm_exit *vmexit;
+ struct vm *vm;
int error;
- error = vm_inject_exception(vm, vcpuid, exception);
+ vm = vmarg;
+
+ exception.vector = vector;
+ exception.error_code = errcode;
+ exception.error_code_valid = errcode_valid;
+ error = vm_inject_exception(vm, vcpuid, &exception);
KASSERT(error == 0, ("vm_inject_exception error %d", error));
/*
@@ -1521,45 +1742,19 @@ vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
}
void
-vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
+vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
{
- struct vm_exception pf = {
- .vector = IDT_PF,
- .error_code_valid = 1,
- .error_code = error_code
- };
+ struct vm *vm;
int error;
+ vm = vmarg;
VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
error_code, cr2);
error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
- vm_inject_fault(vm, vcpuid, &pf);
-}
-
-void
-vm_inject_gp(struct vm *vm, int vcpuid)
-{
- struct vm_exception gpf = {
- .vector = IDT_GP,
- .error_code_valid = 1,
- .error_code = 0
- };
-
- vm_inject_fault(vm, vcpuid, &gpf);
-}
-
-void
-vm_inject_ud(struct vm *vm, int vcpuid)
-{
- struct vm_exception udf = {
- .vector = IDT_UD,
- .error_code_valid = 0
- };
-
- vm_inject_fault(vm, vcpuid, &udf);
+ vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
}
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
@@ -1993,6 +2188,97 @@ vm_segment_name(int seg)
return (seg_names[seg]);
}
+void
+vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ int num_copyinfo)
+{
+ int idx;
+
+ for (idx = 0; idx < num_copyinfo; idx++) {
+ if (copyinfo[idx].cookie != NULL)
+ vm_gpa_release(copyinfo[idx].cookie);
+ }
+ bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
+}
+
+int
+vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
+ int num_copyinfo)
+{
+ int error, idx, nused;
+ size_t n, off, remaining;
+ void *hva, *cookie;
+ uint64_t gpa;
+
+ bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
+
+ nused = 0;
+ remaining = len;
+ while (remaining > 0) {
+ KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
+ error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
+ if (error)
+ return (error);
+ off = gpa & PAGE_MASK;
+ n = min(remaining, PAGE_SIZE - off);
+ copyinfo[nused].gpa = gpa;
+ copyinfo[nused].len = n;
+ remaining -= n;
+ gla += n;
+ nused++;
+ }
+
+ for (idx = 0; idx < nused; idx++) {
+ hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
+ prot, &cookie);
+ if (hva == NULL)
+ break;
+ copyinfo[idx].hva = hva;
+ copyinfo[idx].cookie = cookie;
+ }
+
+ if (idx != nused) {
+ vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
+ return (-1);
+ } else {
+ return (0);
+ }
+}
+
+void
+vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
+ size_t len)
+{
+ char *dst;
+ int idx;
+
+ dst = kaddr;
+ idx = 0;
+ while (len > 0) {
+ bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
+ len -= copyinfo[idx].len;
+ dst += copyinfo[idx].len;
+ idx++;
+ }
+}
+
+void
+vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
+ struct vm_copyinfo *copyinfo, size_t len)
+{
+ const char *src;
+ int idx;
+
+ src = kaddr;
+ idx = 0;
+ while (len > 0) {
+ bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
+ len -= copyinfo[idx].len;
+ src += copyinfo[idx].len;
+ idx++;
+ }
+}
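For context, the new vm_copy_setup()/vm_copyin()/vm_copy_teardown() trio is used as a unit by vmm_fetch_instruction() and emulate_push() later in this diff. A minimal sketch of the pattern follows; it is an editor's illustration rather than part of the patch, read_guest_u64() is a hypothetical helper, and the include list is indicative only.

#include <sys/param.h>
#include <sys/types.h>
#include <sys/errno.h>

#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

/*
 * Read a 64-bit value from guest linear address 'gla' using the
 * setup/copyin/teardown sequence introduced above.  Two vm_copyinfo
 * entries are enough because the value can straddle at most one
 * page boundary.
 */
static int
read_guest_u64(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, uint64_t *valp)
{
	struct vm_copyinfo copyinfo[2];
	int error;

	error = vm_copy_setup(vm, vcpuid, paging, gla, sizeof(*valp),
	    PROT_READ, copyinfo, nitems(copyinfo));
	if (error == 1)
		return (error);		/* fault injected; resume the guest */
	else if (error != 0)
		return (EFAULT);	/* unrecoverable translation error */

	vm_copyin(vm, vcpuid, copyinfo, valp, sizeof(*valp));
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
	return (0);
}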
/*
* Return the amount of in-use and wired memory for the VM. Since
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index f3e31a3..a85109e 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -173,6 +173,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_gla2gpa *gg;
struct vm_activate_cpu *vac;
struct vm_cpuset *vm_cpuset;
+ struct vm_intinfo *vmii;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
@@ -199,6 +200,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_SET_X2APIC_STATE:
case VM_GLA2GPA:
case VM_ACTIVATE_CPU:
+ case VM_SET_INTINFO:
+ case VM_GET_INTINFO:
/*
* XXX fragile, handle with care
* Assumes that the first field of the ioctl data is the vcpu.
@@ -470,6 +473,15 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = copyout(cpuset, vm_cpuset->cpus, size);
free(cpuset, M_TEMP);
break;
+ case VM_SET_INTINFO:
+ vmii = (struct vm_intinfo *)data;
+ error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
+ break;
+ case VM_GET_INTINFO:
+ vmii = (struct vm_intinfo *)data;
+ error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
+ &vmii->info2);
+ break;
default:
error = ENOTTY;
break;
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index 921deb5..a65b125 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#else /* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
+#include <sys/_iovec.h>
#include <machine/vmm.h>
@@ -65,18 +66,26 @@ enum {
VIE_OP_TYPE_AND,
VIE_OP_TYPE_OR,
VIE_OP_TYPE_TWO_BYTE,
+ VIE_OP_TYPE_PUSH,
+ VIE_OP_TYPE_CMP,
VIE_OP_TYPE_LAST
};
/* struct vie_op.op_flags */
-#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */
-#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */
+#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */
+#define VIE_OP_F_NO_MODRM (1 << 3)
static const struct vie_op two_byte_opcodes[256] = {
[0xB6] = {
.op_byte = 0xB6,
.op_type = VIE_OP_TYPE_MOVZX,
},
+ [0xB7] = {
+ .op_byte = 0xB7,
+ .op_type = VIE_OP_TYPE_MOVZX,
+ },
[0xBE] = {
.op_byte = 0xBE,
.op_type = VIE_OP_TYPE_MOVSX,
@@ -88,6 +97,10 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_byte = 0x0F,
.op_type = VIE_OP_TYPE_TWO_BYTE
},
+ [0x3B] = {
+ .op_byte = 0x3B,
+ .op_type = VIE_OP_TYPE_CMP,
+ },
[0x88] = {
.op_byte = 0x88,
.op_type = VIE_OP_TYPE_MOV,
@@ -104,6 +117,22 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_byte = 0x8B,
.op_type = VIE_OP_TYPE_MOV,
},
+ [0xA1] = {
+ .op_byte = 0xA1,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+ },
+ [0xA3] = {
+ .op_byte = 0xA3,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+ },
+ [0xC6] = {
+ /* XXX Group 11 extended opcode - not just MOV */
+ .op_byte = 0xC6,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM8,
+ },
[0xC7] = {
.op_byte = 0xC7,
.op_type = VIE_OP_TYPE_MOV,
@@ -125,6 +154,11 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_type = VIE_OP_TYPE_OR,
.op_flags = VIE_OP_F_IMM8,
},
+ [0xFF] = {
+ /* XXX Group 5 extended opcode - not just PUSH */
+ .op_byte = 0xFF,
+ .op_type = VIE_OP_TYPE_PUSH,
+ }
};
/* struct vie.mod */
@@ -175,18 +209,15 @@ vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
return (error);
}
-static int
-vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
+static void
+vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
- uint64_t val;
- int error, rshift;
- enum vm_reg_name reg;
-
- rshift = 0;
- reg = gpr_map[vie->reg];
+ *lhbr = 0;
+ *reg = gpr_map[vie->reg];
/*
- * 64-bit mode imposes limitations on accessing legacy byte registers.
+ * 64-bit mode imposes limitations on accessing legacy high byte
+ * registers (lhbr).
*
* The legacy high-byte registers cannot be addressed if the REX
* prefix is present. In this case the values 4, 5, 6 and 7 of the
@@ -198,17 +229,56 @@ vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
*/
if (!vie->rex_present) {
if (vie->reg & 0x4) {
- /*
- * Obtain the value of %ah by reading %rax and shifting
- * right by 8 bits (same for %bh, %ch and %dh).
- */
- rshift = 8;
- reg = gpr_map[vie->reg & 0x3];
+ *lhbr = 1;
+ *reg = gpr_map[vie->reg & 0x3];
}
}
+}
+
+static int
+vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
+{
+ uint64_t val;
+ int error, lhbr;
+ enum vm_reg_name reg;
+ vie_calc_bytereg(vie, &reg, &lhbr);
error = vm_get_register(vm, vcpuid, reg, &val);
- *rval = val >> rshift;
+
+ /*
+ * To obtain the value of a legacy high byte register shift the
+ * base register right by 8 bits (%ah = %rax >> 8).
+ */
+ if (lhbr)
+ *rval = val >> 8;
+ else
+ *rval = val;
+ return (error);
+}
+
+static int
+vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
+{
+ uint64_t origval, val, mask;
+ int error, lhbr;
+ enum vm_reg_name reg;
+
+ vie_calc_bytereg(vie, &reg, &lhbr);
+ error = vm_get_register(vm, vcpuid, reg, &origval);
+ if (error == 0) {
+ val = byte;
+ mask = 0xff;
+ if (lhbr) {
+ /*
+ * Shift left by 8 to store 'byte' in a legacy high
+ * byte register.
+ */
+ val <<= 8;
+ mask <<= 8;
+ }
+ val |= origval & ~mask;
+ error = vm_set_register(vm, vcpuid, reg, val);
+ }
return (error);
}
@@ -242,16 +312,52 @@ vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
}
/*
- * The following simplifying assumptions are made during emulation:
- *
- * - guest is in 64-bit mode
- * - default address size is 64-bits
- * - default operand size is 32-bits
- *
- * - operand size override is not supported
- *
- * - address size override is not supported
+ * Return the status flags that would result from doing (x - y).
*/
+static u_long
+getcc16(uint16_t x, uint16_t y)
+{
+ u_long rflags;
+
+ __asm __volatile("sub %1,%2; pushfq; popq %0" :
+ "=r" (rflags) : "m" (y), "r" (x));
+ return (rflags);
+}
+
+static u_long
+getcc32(uint32_t x, uint32_t y)
+{
+ u_long rflags;
+
+ __asm __volatile("sub %1,%2; pushfq; popq %0" :
+ "=r" (rflags) : "m" (y), "r" (x));
+ return (rflags);
+}
+
+static u_long
+getcc64(uint64_t x, uint64_t y)
+{
+ u_long rflags;
+
+ __asm __volatile("sub %1,%2; pushfq; popq %0" :
+ "=r" (rflags) : "m" (y), "r" (x));
+ return (rflags);
+}
+
+static u_long
+getcc(int opsize, uint64_t x, uint64_t y)
+{
+ KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
+ ("getcc: invalid operand size %d", opsize));
+
+ if (opsize == 2)
+ return (getcc16(x, y));
+ else if (opsize == 4)
+ return (getcc32(x, y));
+ else
+ return (getcc64(x, y));
+}
+
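The getcc*() helpers above compute the guest-visible status flags by actually performing the subtraction on the host and capturing RFLAGS afterwards. The following self-contained userland program demonstrates the same technique on amd64; it is illustrative only, cc_sub32() is a made-up name, and the PSL_* constants are written out rather than taken from <machine/psl.h>.

#include <stdint.h>
#include <stdio.h>

#define	PSL_C	0x00000001	/* carry/borrow flag (CF) */
#define	PSL_Z	0x00000040	/* zero flag (ZF) */

/* Same SUB + PUSHFQ/POPQ sequence and operand constraints as getcc32(). */
static unsigned long
cc_sub32(uint32_t x, uint32_t y)
{
	unsigned long rflags;

	__asm__ __volatile__("sub %1,%2; pushfq; popq %0" :
	    "=r" (rflags) : "m" (y), "r" (x));
	return (rflags);
}

int
main(void)
{
	/* 5 - 5 = 0: ZF set, CF clear. */
	printf("ZF=%d CF=%d\n", !!(cc_sub32(5, 5) & PSL_Z),
	    !!(cc_sub32(5, 5) & PSL_C));
	/* 3 - 7 borrows: CF set, ZF clear. */
	printf("ZF=%d CF=%d\n", !!(cc_sub32(3, 7) & PSL_Z),
	    !!(cc_sub32(3, 7) & PSL_C));
	return (0);
}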
static int
emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
@@ -261,7 +367,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
uint8_t byte;
uint64_t val;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -271,7 +377,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* 88/r: mov r/m8, r8
* REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
*/
- size = 1;
+ size = 1; /* override for byte operation */
error = vie_read_bytereg(vm, vcpuid, vie, &byte);
if (error == 0)
error = memwrite(vm, vcpuid, gpa, byte, size, arg);
@@ -279,11 +385,10 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
case 0x89:
/*
* MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 89/r: mov r/m16, r16
* 89/r: mov r/m32, r32
* REX.W + 89/r mov r/m64, r64
*/
- if (vie->rex_w)
- size = 8;
reg = gpr_map[vie->reg];
error = vie_read_register(vm, vcpuid, reg, &val);
if (error == 0) {
@@ -292,38 +397,72 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
}
break;
case 0x8A:
+ /*
+ * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8A/r: mov r8, r/m8
+ * REX + 8A/r: mov r8, r/m8
+ */
+ size = 1; /* override for byte operation */
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0)
+ error = vie_write_bytereg(vm, vcpuid, vie, val);
+ break;
case 0x8B:
/*
* MOV from mem (ModRM:r/m) to reg (ModRM:reg)
- * 8A/r: mov r/m8, r8
- * REX + 8A/r: mov r/m8, r8
+ * 8B/r: mov r16, r/m16
* 8B/r: mov r32, r/m32
* REX.W 8B/r: mov r64, r/m64
*/
- if (vie->op.op_byte == 0x8A)
- size = 1;
- else if (vie->rex_w)
- size = 8;
error = memread(vm, vcpuid, gpa, &val, size, arg);
if (error == 0) {
reg = gpr_map[vie->reg];
error = vie_update_register(vm, vcpuid, reg, val, size);
}
break;
+ case 0xA1:
+ /*
+ * MOV from seg:moffset to AX/EAX/RAX
+ * A1: mov AX, moffs16
+ * A1: mov EAX, moffs32
+ * REX.W + A1: mov RAX, moffs64
+ */
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = VM_REG_GUEST_RAX;
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xA3:
+ /*
+ * MOV from AX/EAX/RAX to seg:moffset
+ * A3: mov moffs16, AX
+ * A3: mov moffs32, EAX
+ * REX.W + A3: mov moffs64, RAX
+ */
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0xC6:
+ /*
+ * MOV from imm8 to mem (ModRM:r/m)
+ * C6/0 mov r/m8, imm8
+ * REX + C6/0 mov r/m8, imm8
+ */
+ size = 1; /* override for byte operation */
+ error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
+ break;
case 0xC7:
/*
- * MOV from imm32 to mem (ModRM:r/m)
+ * MOV from imm16/imm32 to mem (ModRM:r/m)
+ * C7/0 mov r/m16, imm16
* C7/0 mov r/m32, imm32
* REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
*/
- val = vie->immediate; /* already sign-extended */
-
- if (vie->rex_w)
- size = 8;
-
- if (size != 8)
- val &= size2mask[size];
-
+ val = vie->immediate & size2mask[size];
error = memwrite(vm, vcpuid, gpa, val, size, arg);
break;
default:
@@ -333,17 +472,6 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (error);
}
-/*
- * The following simplifying assumptions are made during emulation:
- *
- * - guest is in 64-bit mode
- * - default address size is 64-bits
- * - default operand size is 32-bits
- *
- * - operand size override is not supported
- *
- * - address size override is not supported
- */
static int
emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite,
@@ -353,7 +481,7 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
enum vm_reg_name reg;
uint64_t val;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -362,8 +490,9 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* MOV and zero extend byte from mem (ModRM:r/m) to
* reg (ModRM:reg).
*
- * 0F B6/r movzx r/m8, r32
- * REX.W + 0F B6/r movzx r/m8, r64
+ * 0F B6/r movzx r16, r/m8
+ * 0F B6/r movzx r32, r/m8
+ * REX.W + 0F B6/r movzx r64, r/m8
*/
/* get the first operand */
@@ -374,19 +503,39 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
/* get the second operand */
reg = gpr_map[vie->reg];
- if (vie->rex_w)
- size = 8;
+ /* zero-extend byte */
+ val = (uint8_t)val;
/* write the result */
error = vie_update_register(vm, vcpuid, reg, val, size);
break;
+ case 0xB7:
+ /*
+ * MOV and zero extend word from mem (ModRM:r/m) to
+ * reg (ModRM:reg).
+ *
+ * 0F B7/r movzx r32, r/m16
+ * REX.W + 0F B7/r movzx r64, r/m16
+ */
+ error = memread(vm, vcpuid, gpa, &val, 2, arg);
+ if (error)
+ return (error);
+
+ reg = gpr_map[vie->reg];
+
+ /* zero-extend word */
+ val = (uint16_t)val;
+
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ break;
case 0xBE:
/*
* MOV and sign extend byte from mem (ModRM:r/m) to
* reg (ModRM:reg).
*
- * 0F BE/r movsx r/m8, r32
- * REX.W + 0F BE/r movsx r/m8, r64
+ * 0F BE/r movsx r16, r/m8
+ * 0F BE/r movsx r32, r/m8
+ * REX.W + 0F BE/r movsx r64, r/m8
*/
/* get the first operand */
@@ -397,9 +546,6 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
/* get the second operand */
reg = gpr_map[vie->reg];
- if (vie->rex_w)
- size = 8;
-
/* sign extend byte */
val = (int8_t)val;
@@ -420,7 +566,7 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
enum vm_reg_name reg;
uint64_t val1, val2;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -429,11 +575,10 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
* result in reg.
*
+ * 23/r and r16, r/m16
* 23/r and r32, r/m32
* REX.W + 23/r and r64, r/m64
*/
- if (vie->rex_w)
- size = 8;
/* get the first operand */
reg = gpr_map[vie->reg];
@@ -455,8 +600,9 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* AND mem (ModRM:r/m) with immediate and store the
* result in mem.
*
- * 81/ and r/m32, imm32
- * REX.W + 81/ and r/m64, imm32 sign-extended to 64
+ * 81 /4 and r/m16, imm16
+ * 81 /4 and r/m32, imm32
+ * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64
*
* Currently, only the AND operation of the 0x81 opcode
* is implemented (ModRM:reg = b100).
@@ -464,9 +610,6 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
if ((vie->reg & 7) != 4)
break;
- if (vie->rex_w)
- size = 8;
-
/* get the first operand */
error = memread(vm, vcpuid, gpa, &val1, size, arg);
if (error)
@@ -492,7 +635,7 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
int error, size;
uint64_t val1;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -501,8 +644,9 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* OR mem (ModRM:r/m) with immediate and store the
* result in mem.
*
- * 83/ OR r/m32, imm8 sign-extended to 32
- * REX.W + 83/ OR r/m64, imm8 sign-extended to 64
+ * 83 /1 OR r/m16, imm8 sign-extended to 16
+ * 83 /1 OR r/m32, imm8 sign-extended to 32
+ * REX.W + 83/1 OR r/m64, imm8 sign-extended to 64
*
* Currently, only the OR operation of the 0x83 opcode
* is implemented (ModRM:reg = b001).
@@ -510,9 +654,6 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
if ((vie->reg & 7) != 1)
break;
- if (vie->rex_w)
- size = 8;
-
/* get the first operand */
error = memread(vm, vcpuid, gpa, &val1, size, arg);
if (error)
@@ -531,10 +672,167 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (error);
}
+#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
+
+static int
+emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ uint64_t op1, op2, rflags, rflags2;
+ enum vm_reg_name reg;
+
+ size = vie->opsize;
+ switch (vie->op.op_byte) {
+ case 0x3B:
+ /*
+ * 3B/r CMP r16, r/m16
+ * 3B/r CMP r32, r/m32
+ * REX.W + 3B/r CMP r64, r/m64
+ *
+ * Compare first operand (reg) with second operand (r/m) and
+ * set status flags in EFLAGS register. The comparison is
+ * performed by subtracting the second operand from the first
+ * operand and then setting the status flags.
+ */
+
+ /* Get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &op1);
+ if (error)
+ return (error);
+
+ /* Get the second operand */
+ error = memread(vm, vcpuid, gpa, &op2, size, arg);
+ if (error)
+ return (error);
+
+ break;
+ default:
+ return (EINVAL);
+ }
+ rflags2 = getcc(size, op1, op2);
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ if (error)
+ return (error);
+ rflags &= ~RFLAGS_STATUS_BITS;
+ rflags |= rflags2 & RFLAGS_STATUS_BITS;
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
+ return (error);
+}
+
+static int
+emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *arg)
+{
+#ifdef _KERNEL
+ struct vm_copyinfo copyinfo[2];
+#else
+ struct iovec copyinfo[2];
+#endif
+ struct seg_desc ss_desc;
+ uint64_t cr0, rflags, rsp, stack_gla, val;
+ int error, size, stackaddrsize;
+
+ /*
+ * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
+ *
+ * PUSH is part of the group 5 extended opcodes and is identified
+ * by ModRM:reg = b110.
+ */
+ if ((vie->reg & 7) != 6)
+ return (EINVAL);
+
+ size = vie->opsize;
+ /*
+	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
+ */
+ if (paging->cpu_mode == CPU_MODE_REAL) {
+ stackaddrsize = 2;
+ } else if (paging->cpu_mode == CPU_MODE_64BIT) {
+ /*
+ * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
+ * - Stack pointer size is always 64-bits.
+ * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
+ * - 16-bit PUSH/POP is supported by using the operand size
+ * override prefix (66H).
+ */
+ stackaddrsize = 8;
+ size = vie->opsize_override ? 2 : 8;
+ } else {
+ /*
+		 * In protected or compatibility mode the 'B' flag in the
+ * stack-segment descriptor determines the size of the
+ * stack pointer.
+ */
+ error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
+ KASSERT(error == 0, ("%s: error %d getting SS descriptor",
+ __func__, error));
+ if (SEG_DESC_DEF32(ss_desc.access))
+ stackaddrsize = 4;
+ else
+ stackaddrsize = 2;
+ }
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
+ KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
+ KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
+
+ rsp -= size;
+ if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
+ rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) {
+ vm_inject_ss(vm, vcpuid, 0);
+ return (0);
+ }
+
+ if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
+ vm_inject_ss(vm, vcpuid, 0);
+ return (0);
+ }
+
+ if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
+ vm_inject_ac(vm, vcpuid, 0);
+ return (0);
+ }
+
+ error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE,
+ copyinfo, nitems(copyinfo));
+ if (error == -1) {
+ /*
+ * XXX cannot return a negative error value here because it
+ * ends up being the return value of the VM_RUN() ioctl and
+ * is interpreted as a pseudo-error (for e.g. ERESTART).
+		 * is interpreted as a pseudo-error (e.g. ERESTART).
+ return (EFAULT);
+ } else if (error == 1) {
+ /* Resume guest execution to handle page fault */
+ return (0);
+ }
+
+ error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
+ if (error == 0) {
+ vm_copyout(vm, vcpuid, &val, copyinfo, size);
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
+ stackaddrsize);
+ KASSERT(error == 0, ("error %d updating rsp", error));
+ }
+#ifdef _KERNEL
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+#endif
+ return (error);
+}
+
int
vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
- mem_region_read_t memread, mem_region_write_t memwrite,
- void *memarg)
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *memarg)
{
int error;
@@ -542,6 +840,14 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (EINVAL);
switch (vie->op.op_type) {
+ case VIE_OP_TYPE_PUSH:
+ error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
+ memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_CMP:
+ error = emulate_cmp(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
case VIE_OP_TYPE_MOV:
error = emulate_mov(vm, vcpuid, gpa, vie,
memread, memwrite, memarg);
@@ -636,7 +942,7 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
* then the descriptor is unusable and attempting to use
* it results in a #GP(0).
*/
- if (SEG_DESC_UNUSABLE(desc))
+ if (SEG_DESC_UNUSABLE(desc->access))
return (-1);
/*
@@ -645,13 +951,13 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
* descriptor that is not present. If this was the case then
* it would have been checked before the VM-exit.
*/
- KASSERT(SEG_DESC_PRESENT(desc), ("segment %d not present: %#x",
- seg, desc->access));
+ KASSERT(SEG_DESC_PRESENT(desc->access),
+ ("segment %d not present: %#x", seg, desc->access));
/*
* The descriptor type must indicate a code/data segment.
*/
- type = SEG_DESC_TYPE(desc);
+ type = SEG_DESC_TYPE(desc->access);
KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
"descriptor type %#x", seg, type));
@@ -680,7 +986,8 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
if ((type & 0xC) == 0x4) {
/* expand-down data segment */
low_limit = desc->limit + 1;
- high_limit = SEG_DESC_DEF32(desc) ? 0xffffffff : 0xffff;
+ high_limit = SEG_DESC_DEF32(desc->access) ?
+ 0xffffffff : 0xffff;
} else {
/* code segment or expand-up data segment */
low_limit = 0;
@@ -947,45 +1254,24 @@ fault:
}
int
-vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *paging,
+vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint64_t rip, int inst_length, struct vie *vie)
{
- int n, error, prot;
- uint64_t gpa, off;
- void *hpa, *cookie;
-
- /*
- * XXX cache previously fetched instructions using 'rip' as the tag
- */
+ struct vm_copyinfo copyinfo[2];
+ int error, prot;
- prot = VM_PROT_READ | VM_PROT_EXECUTE;
if (inst_length > VIE_INST_SIZE)
panic("vmm_fetch_instruction: invalid length %d", inst_length);
- /* Copy the instruction into 'vie' */
- while (vie->num_valid < inst_length) {
- error = vmm_gla2gpa(vm, cpuid, paging, rip, prot, &gpa);
- if (error)
- return (error);
-
- off = gpa & PAGE_MASK;
- n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
-
- if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL)
- break;
-
- bcopy(hpa, &vie->inst[vie->num_valid], n);
-
- vm_gpa_release(cookie);
-
- rip += n;
- vie->num_valid += n;
+ prot = PROT_READ | PROT_EXEC;
+ error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
+ copyinfo, nitems(copyinfo));
+ if (error == 0) {
+ vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+ vie->num_valid = inst_length;
}
-
- if (vie->num_valid == inst_length)
- return (0);
- else
- return (-1);
+ return (error);
}
static int
@@ -1007,24 +1293,65 @@ vie_advance(struct vie *vie)
}
static int
-decode_rex(struct vie *vie)
+decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
uint8_t x;
- if (vie_peek(vie, &x))
- return (-1);
+ while (1) {
+ if (vie_peek(vie, &x))
+ return (-1);
- if (x >= 0x40 && x <= 0x4F) {
- vie->rex_present = 1;
+ if (x == 0x66)
+ vie->opsize_override = 1;
+ else if (x == 0x67)
+ vie->addrsize_override = 1;
+ else
+ break;
+
+ vie_advance(vie);
+ }
+ /*
+ * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
+ * - Only one REX prefix is allowed per instruction.
+ * - The REX prefix must immediately precede the opcode byte or the
+ * escape opcode byte.
+ * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
+ * the mandatory prefix must come before the REX prefix.
+ */
+ if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
+ vie->rex_present = 1;
vie->rex_w = x & 0x8 ? 1 : 0;
vie->rex_r = x & 0x4 ? 1 : 0;
vie->rex_x = x & 0x2 ? 1 : 0;
vie->rex_b = x & 0x1 ? 1 : 0;
-
vie_advance(vie);
}
+ /*
+ * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
+ */
+ if (cpu_mode == CPU_MODE_64BIT) {
+ /*
+ * Default address size is 64-bits and default operand size
+ * is 32-bits.
+ */
+ vie->addrsize = vie->addrsize_override ? 4 : 8;
+ if (vie->rex_w)
+ vie->opsize = 8;
+ else if (vie->opsize_override)
+ vie->opsize = 2;
+ else
+ vie->opsize = 4;
+ } else if (cs_d) {
+ /* Default address and operand sizes are 32-bits */
+ vie->addrsize = vie->addrsize_override ? 2 : 4;
+ vie->opsize = vie->opsize_override ? 2 : 4;
+ } else {
+ /* Default address and operand sizes are 16-bits */
+ vie->addrsize = vie->addrsize_override ? 4 : 2;
+ vie->opsize = vie->opsize_override ? 4 : 2;
+ }
return (0);
}
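The prefix decoding above boils down to a small operand-size rule: REX.W wins in 64-bit mode, otherwise the 0x66 override toggles between the default sizes implied by the mode and CS.D. A standalone sketch of that selection (illustrative only; effective_opsize() is a made-up name):

#include <stdio.h>

/* Mirror of the operand-size rules implemented in decode_prefixes() above. */
static int
effective_opsize(int mode64, int cs_d, int rex_w, int opsize_override)
{
	if (mode64)
		return (rex_w ? 8 : (opsize_override ? 2 : 4));
	if (cs_d)
		return (opsize_override ? 2 : 4);
	return (opsize_override ? 4 : 2);
}

int
main(void)
{
	/* 64-bit mode, 0x66 prefix plus REX.W: still a 64-bit operand. */
	printf("%d\n", effective_opsize(1, 0, 1, 1));	/* prints 8 */
	/* 16-bit code segment with a 0x66 prefix: 32-bit operand. */
	printf("%d\n", effective_opsize(0, 0, 0, 1));	/* prints 4 */
	return (0);
}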
@@ -1071,6 +1398,12 @@ decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
uint8_t x;
+ if (cpu_mode == CPU_MODE_REAL)
+ return (-1);
+
+ if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
+ return (0);
+
if (vie_peek(vie, &x))
return (-1);
@@ -1249,20 +1582,32 @@ decode_immediate(struct vie *vie)
union {
char buf[4];
int8_t signed8;
+ int16_t signed16;
int32_t signed32;
} u;
/* Figure out immediate operand size (if any) */
- if (vie->op.op_flags & VIE_OP_F_IMM)
- vie->imm_bytes = 4;
- else if (vie->op.op_flags & VIE_OP_F_IMM8)
+ if (vie->op.op_flags & VIE_OP_F_IMM) {
+ /*
+ * Section 2.2.1.5 "Immediates", Intel SDM:
+ * In 64-bit mode the typical size of immediate operands
+		 * remains 32-bits. When the operand size is 64-bits, the
+ * processor sign-extends all immediates to 64-bits prior
+ * to their use.
+ */
+ if (vie->opsize == 4 || vie->opsize == 8)
+ vie->imm_bytes = 4;
+ else
+ vie->imm_bytes = 2;
+ } else if (vie->op.op_flags & VIE_OP_F_IMM8) {
vie->imm_bytes = 1;
+ }
if ((n = vie->imm_bytes) == 0)
return (0);
- if (n != 1 && n != 4)
- panic("decode_immediate: invalid imm_bytes %d", n);
+ KASSERT(n == 1 || n == 2 || n == 4,
+ ("%s: invalid number of immediate bytes: %d", __func__, n));
for (i = 0; i < n; i++) {
if (vie_peek(vie, &x))
@@ -1271,12 +1616,47 @@ decode_immediate(struct vie *vie)
u.buf[i] = x;
vie_advance(vie);
}
-
+
+ /* sign-extend the immediate value before use */
if (n == 1)
- vie->immediate = u.signed8; /* sign-extended */
+ vie->immediate = u.signed8;
+ else if (n == 2)
+ vie->immediate = u.signed16;
else
- vie->immediate = u.signed32; /* sign-extended */
+ vie->immediate = u.signed32;
+
+ return (0);
+}
+
+static int
+decode_moffset(struct vie *vie)
+{
+ int i, n;
+ uint8_t x;
+ union {
+ char buf[8];
+ uint64_t u64;
+ } u;
+
+ if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
+ return (0);
+
+ /*
+ * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
+ * The memory offset size follows the address-size of the instruction.
+ */
+ n = vie->addrsize;
+ KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
+
+ u.u64 = 0;
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+ vie->displacement = u.u64;
return (0);
}
@@ -1301,7 +1681,7 @@ static int
verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
{
int error;
- uint64_t base, idx;
+ uint64_t base, idx, gla2;
/* Skip 'gla' verification */
if (gla == VIE_INVALID_GLA)
@@ -1334,11 +1714,14 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
}
}
- if (base + vie->scale * idx + vie->displacement != gla) {
+ /* XXX assuming that the base address of the segment is 0 */
+ gla2 = base + vie->scale * idx + vie->displacement;
+ gla2 &= size2mask[vie->addrsize];
+ if (gla != gla2) {
printf("verify_gla mismatch: "
"base(0x%0lx), scale(%d), index(0x%0lx), "
- "disp(0x%0lx), gla(0x%0lx)\n",
- base, vie->scale, idx, vie->displacement, gla);
+ "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
+ base, vie->scale, idx, vie->displacement, gla, gla2);
return (-1);
}
@@ -1347,13 +1730,11 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
int
vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
- enum vm_cpu_mode cpu_mode, struct vie *vie)
+ enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
{
- if (cpu_mode == CPU_MODE_64BIT) {
- if (decode_rex(vie))
- return (-1);
- }
+ if (decode_prefixes(vie, cpu_mode, cs_d))
+ return (-1);
if (decode_opcode(vie))
return (-1);
@@ -1366,10 +1747,13 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
if (decode_displacement(vie))
return (-1);
-
+
if (decode_immediate(vie))
return (-1);
+ if (decode_moffset(vie))
+ return (-1);
+
if (verify_inst_length(vie))
return (-1);
diff --git a/sys/x86/include/specialreg.h b/sys/x86/include/specialreg.h
index c0cef2c..8610684 100644
--- a/sys/x86/include/specialreg.h
+++ b/sys/x86/include/specialreg.h
@@ -436,6 +436,25 @@
#define MSR_MC4_MISC 0x413
/*
+ * VMX MSRs
+ */
+#define MSR_VMX_BASIC 0x480
+#define MSR_VMX_PINBASED_CTLS 0x481
+#define MSR_VMX_PROCBASED_CTLS 0x482
+#define MSR_VMX_EXIT_CTLS 0x483
+#define MSR_VMX_ENTRY_CTLS 0x484
+#define MSR_VMX_CR0_FIXED0 0x486
+#define MSR_VMX_CR0_FIXED1 0x487
+#define MSR_VMX_CR4_FIXED0 0x488
+#define MSR_VMX_CR4_FIXED1 0x489
+#define MSR_VMX_PROCBASED_CTLS2 0x48b
+#define MSR_VMX_EPT_VPID_CAP 0x48c
+#define MSR_VMX_TRUE_PINBASED_CTLS 0x48d
+#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48e
+#define MSR_VMX_TRUE_EXIT_CTLS 0x48f
+#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
+
+/*
* X2APIC MSRs
*/
#define MSR_APIC_ID 0x802
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
index 23e16cb..1c95f77 100644
--- a/usr.sbin/bhyve/Makefile
+++ b/usr.sbin/bhyve/Makefile
@@ -35,6 +35,7 @@ SRCS= \
post.c \
rtc.c \
smbiostbl.c \
+ task_switch.c \
uart_emul.c \
virtio.c \
xmsr.c \
diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c
index c4ec020..5dea300 100644
--- a/usr.sbin/bhyve/acpi.c
+++ b/usr.sbin/bhyve/acpi.c
@@ -40,12 +40,13 @@
* Layout
* ------
* RSDP -> 0xf2400 (36 bytes fixed)
- * RSDT -> 0xf2440 (36 bytes + 4*N table addrs, 2 used)
- * XSDT -> 0xf2480 (36 bytes + 8*N table addrs, 2 used)
+ * RSDT -> 0xf2440 (36 bytes + 4*7 table addrs, 4 used)
+ * XSDT -> 0xf2480 (36 bytes + 8*7 table addrs, 4 used)
* MADT -> 0xf2500 (depends on #CPUs)
* FADT -> 0xf2600 (268 bytes)
* HPET -> 0xf2740 (56 bytes)
- * FACS -> 0xf2780 (64 bytes)
+ * MCFG -> 0xf2780 (60 bytes)
+ * FACS -> 0xf27C0 (64 bytes)
* DSDT -> 0xf2800 (variable - can go up to 0x100000)
*/
@@ -80,7 +81,8 @@ __FBSDID("$FreeBSD$");
#define MADT_OFFSET 0x100
#define FADT_OFFSET 0x200
#define HPET_OFFSET 0x340
-#define FACS_OFFSET 0x380
+#define MCFG_OFFSET 0x380
+#define FACS_OFFSET 0x3C0
#define DSDT_OFFSET 0x400
#define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX"
@@ -178,6 +180,8 @@ basl_fwrite_rsdt(FILE *fp)
basl_acpi_base + FADT_OFFSET);
EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n",
basl_acpi_base + HPET_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n",
+ basl_acpi_base + MCFG_OFFSET);
EFFLUSH(fp);
@@ -216,6 +220,8 @@ basl_fwrite_xsdt(FILE *fp)
basl_acpi_base + FADT_OFFSET);
EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n",
basl_acpi_base + HPET_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n",
+ basl_acpi_base + MCFG_OFFSET);
EFFLUSH(fp);
@@ -583,6 +589,39 @@ err_exit:
}
static int
+basl_fwrite_mcfg(FILE *fp)
+{
+ int err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve MCFG template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "[0008]\t\tReserved : 0\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base());
+ EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n");
+ EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n");
+ EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n");
+ EFPRINTF(fp, "[0004]\t\tReserved : 0\n");
+ EFFLUSH(fp);
+ return (0);
+err_exit:
+ return (errno);
+}
+
+static int
basl_fwrite_facs(FILE *fp)
{
int err;
@@ -921,6 +960,7 @@ static struct {
{ basl_fwrite_madt, MADT_OFFSET },
{ basl_fwrite_fadt, FADT_OFFSET },
{ basl_fwrite_hpet, HPET_OFFSET },
+ { basl_fwrite_mcfg, MCFG_OFFSET },
{ basl_fwrite_facs, FACS_OFFSET },
{ basl_fwrite_dsdt, DSDT_OFFSET },
{ NULL }
diff --git a/usr.sbin/bhyve/atkbdc.c b/usr.sbin/bhyve/atkbdc.c
index 6e13c19..930b7af 100644
--- a/usr.sbin/bhyve/atkbdc.c
+++ b/usr.sbin/bhyve/atkbdc.c
@@ -31,6 +31,10 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <assert.h>
+#include <errno.h>
#include <stdio.h>
#include "inout.h"
@@ -48,29 +52,30 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (bytes != 1)
- return (INOUT_ERROR);
+ return (-1);
*eax = 0;
- return (INOUT_OK);
+ return (0);
}
static int
atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port,
int bytes, uint32_t *eax, void *arg)
{
- int retval;
+ int error, retval;
if (bytes != 1)
- return (INOUT_ERROR);
+ return (-1);
- retval = INOUT_OK;
+ retval = 0;
if (in) {
*eax = KBD_SYS_FLAG; /* system passed POST */
} else {
switch (*eax) {
case KBDC_RESET: /* Pulse "reset" line. */
- retval = INOUT_RESET;
+ error = vm_suspend(ctx, VM_SUSPEND_RESET);
+ assert(error == 0 || errno == EALREADY);
break;
}
}
diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8
index aad1aef..80b814a 100644
--- a/usr.sbin/bhyve/bhyve.8
+++ b/usr.sbin/bhyve/bhyve.8
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd April 2, 2014
+.Dd June 26, 2014
.Dt BHYVE 8
.Os
.Sh NAME
@@ -32,12 +32,14 @@
.Nd "run a guest operating system inside a virtual machine"
.Sh SYNOPSIS
.Nm
-.Op Fl aehwxACHPW
+.Op Fl abehwxACHPWY
.Op Fl c Ar numcpus
.Op Fl g Ar gdbport
+.Op Fl l Ar lpcdev Ns Op , Ns Ar conf
+.Op Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t
.Op Fl p Ar vcpu:hostcpu
.Op Fl s Ar slot,emulation Ns Op , Ns Ar conf
-.Op Fl l Ar lpcdev Ns Op , Ns Ar conf
+.Op Fl U Ar uuid
.Ar vmname
.Sh DESCRIPTION
.Nm
@@ -66,21 +68,49 @@ Generate ACPI tables.
Required for
.Fx Ns /amd64
guests.
+.It Fl b
+Enable a low-level console device supported by
+.Fx kernels compiled with
+.Cd "device bvmconsole" .
+This option will be deprecated in a future version.
.It Fl c Ar numcpus
Number of guest virtual CPUs.
The default is 1 and the maximum is 16.
.It Fl C
Include guest memory in core file.
-.It Fl H
-Yield the virtual CPU thread when a HLT instruction is detected.
-If this option is not specified, virtual CPUs will use 100% of a host CPU.
+.It Fl e
+Force
+.Nm
+to exit when a guest issues an access to an I/O port that is not emulated.
+This is intended for debug purposes.
.It Fl g Ar gdbport
For
-.Fx Ns /amd64 kernels compiled with
-.Cd "option bvmdebug" ,
+.Fx
+kernels compiled with
+.Cd "device bvmdebug" ,
allow a remote kernel kgdb to be relayed to the guest kernel gdb stub
via a local IPv4 address and this port.
This option will be deprecated in a future version.
+.It Fl h
+Print help message and exit.
+.It Fl H
+Yield the virtual CPU thread when a HLT instruction is detected.
+If this option is not specified, virtual CPUs will use 100% of a host CPU.
+.It Fl l Ar lpcdev Ns Op , Ns Ar conf
+Allow devices behind the LPC PCI-ISA bridge to be configured.
+The only supported devices are the TTY-class devices,
+.Li com1
+and
+.Li com2 .
+.It Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t
+Guest physical memory size in bytes.
+This must be the same size that was given to
+.Xr bhyveload 8 .
+.Pp
+The size argument may be suffixed with one of K, M, G or T (either upper
+or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes,
+or terabytes.
+If no suffix is given, the value is assumed to be in megabytes.
.It Fl p Ar vcpu:hostcpu
Pin guest's virtual CPU
.Em vcpu
@@ -88,9 +118,6 @@ to
.Em hostcpu .
.It Fl P
Force the guest virtual CPU to exit when a PAUSE instruction is detected.
-.It Fl W
-Force virtio PCI device emulations to use MSI interrupts instead of MSI-X
-interrupts.
.It Fl s Ar slot,emulation Ns Op , Ns Ar conf
Configure a virtual PCI slot and function.
.Pp
@@ -211,34 +238,21 @@ The host device must have been reserved at boot-time using the
loader variable as described in
.Xr vmm 4 .
.El
-.It Fl l Ar lpcdev Ns Op , Ns Ar conf
-Allow devices behind the LPC PCI-ISA bridge to be configured.
-The only supported devices are the TTY-class devices,
-.Li com1
-and
-.Li com2 .
-.It Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t
-Guest physical memory size in bytes.
-This must be the same size that was given to
-.Xr bhyveload 8 .
-.Pp
-The size argument may be suffixed with one of K, M, G or T (either upper
-or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes,
-or terabytes.
-If no suffix is given, the value is assumed to be in megabytes.
-.It Fl e
-Force
-.Nm
-to exit when a guest issues an access to an I/O port that is not emulated.
-This is intended for debug purposes.
+.It Fl U Ar uuid
+Set the universally unique identifier
+.Pq UUID
+in the guest's System Management BIOS System Information structure.
+By default a UUID is generated from the host's hostname and
+.Ar vmname .
.It Fl w
Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes.
+.It Fl W
+Force virtio PCI device emulations to use MSI interrupts instead of MSI-X
+interrupts.
.It Fl x
The guest's local APIC is configured in x2APIC mode.
.It Fl Y
Disable MPtable generation.
-.It Fl h
-Print help message and exit.
.It Ar vmname
Alphanumeric name of the guest.
This should be the same as that created by
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index 1e5d3b3..7dcf6d0 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -69,16 +69,11 @@ __FBSDID("$FreeBSD$");
#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
-#define VMEXIT_CONTINUE 1 /* continue from next instruction */
-#define VMEXIT_RESTART 2 /* restart current instruction */
-#define VMEXIT_ABORT 3 /* abort the vm run loop */
-#define VMEXIT_RESET 4 /* guest machine has reset */
-#define VMEXIT_POWEROFF 5 /* guest machine has powered off */
-
#define MB (1024UL * 1024)
#define GB (1024UL * MB)
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
+extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
char *vmname;
@@ -101,7 +96,7 @@ static cpuset_t cpumask;
static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
-struct vm_exit vmexit[VM_MAXCPU];
+static struct vm_exit vmexit[VM_MAXCPU];
struct bhyvestats {
uint64_t vmexit_bogus;
@@ -112,8 +107,6 @@ struct bhyvestats {
uint64_t vmexit_inst_emul;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
- int io_reset;
- int io_poweroff;
} stats;
struct mt_vmm_info {
@@ -129,26 +122,26 @@ usage(int code)
{
fprintf(stderr,
- "Usage: %s [-aehwAHIPW] [-g <gdb port>] [-s <pci>] [-c vcpus]\n"
- " %*s [-p vcpu:hostcpu] [-m mem] [-l <lpc>] <vm>\n"
+ "Usage: %s [-abehwxACHPWY] [-c vcpus] [-g <gdb port>] [-l <lpc>]\n"
+ " %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n"
" -a: local apic is in xAPIC mode (deprecated)\n"
- " -A: create an ACPI table\n"
- " -g: gdb port\n"
+ " -A: create ACPI tables\n"
" -c: # cpus (default 1)\n"
" -C: include guest memory in core file\n"
- " -p: pin 'vcpu' to 'hostcpu'\n"
- " -H: vmexit from the guest on hlt\n"
- " -P: vmexit from the guest on pause\n"
- " -W: force virtio to use single-vector MSI\n"
" -e: exit on unhandled I/O access\n"
+ " -g: gdb port\n"
" -h: help\n"
- " -s: <slot,driver,configinfo> PCI slot config\n"
+ " -H: vmexit from the guest on hlt\n"
" -l: LPC device configuration\n"
" -m: memory size in MB\n"
+ " -p: pin 'vcpu' to 'hostcpu'\n"
+ " -P: vmexit from the guest on pause\n"
+ " -s: <slot,driver,configinfo> PCI slot config\n"
+ " -U: uuid\n"
" -w: ignore unimplemented MSRs\n"
+ " -W: force virtio to use single-vector MSI\n"
" -x: local apic is in x2APIC mode\n"
- " -Y: disable MPtable generation\n"
- " -U: uuid\n",
+ " -Y: disable MPtable generation\n",
progname, (int)strlen(progname), "");
exit(code);
@@ -187,6 +180,27 @@ pincpu_parse(const char *opt)
return (0);
}
+void
+vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
+ int errcode)
+{
+ struct vmctx *ctx;
+ int error;
+
+ ctx = arg;
+ if (errcode_valid)
+ error = vm_inject_exception2(ctx, vcpu, vector, errcode);
+ else
+ error = vm_inject_exception(ctx, vcpu, vector);
+ assert(error == 0);
+
+ /*
+ * Set the instruction length to 0 to ensure that the instruction is
+ * restarted when the fault handler returns.
+ */
+ vmexit[vcpu].inst_length = 0;
+}
+
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
@@ -315,27 +329,18 @@ vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
}
error = emulate_inout(ctx, vcpu, vme, strictio);
- if (error == INOUT_OK && in && !string) {
+ if (!error && in && !string) {
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
vme->u.inout.eax);
+ assert(error == 0);
}
- switch (error) {
- case INOUT_OK:
- return (VMEXIT_CONTINUE);
- case INOUT_RESTART:
- return (VMEXIT_RESTART);
- case INOUT_RESET:
- stats.io_reset++;
- return (VMEXIT_RESET);
- case INOUT_POWEROFF:
- stats.io_poweroff++;
- return (VMEXIT_POWEROFF);
- default:
- fprintf(stderr, "Unhandled %s%c 0x%04x\n",
- in ? "in" : "out",
- bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
+ if (error) {
+ fprintf(stderr, "Unhandled %s%c 0x%04x\n", in ? "in" : "out",
+ bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
return (VMEXIT_ABORT);
+ } else {
+ return (VMEXIT_CONTINUE);
}
}
@@ -352,8 +357,7 @@ vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
vme->u.msr.code, *pvcpu);
if (strictmsr) {
- error = vm_inject_exception2(ctx, *pvcpu, IDT_GP, 0);
- assert(error == 0);
+ vm_inject_gp(ctx, *pvcpu);
return (VMEXIT_RESTART);
}
}
@@ -379,8 +383,7 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
vme->u.msr.code, vme->u.msr.wval, *pvcpu);
if (strictmsr) {
- error = vm_inject_exception2(ctx, *pvcpu, IDT_GP, 0);
- assert(error == 0);
+ vm_inject_gp(ctx, *pvcpu);
return (VMEXIT_RESTART);
}
}
@@ -399,6 +402,16 @@ vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
return (retval);
}
+#define DEBUG_EPT_MISCONFIG
+#ifdef DEBUG_EPT_MISCONFIG
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
+#define VMCS_IDENT(x) ((x) | 0x80000000)
+
+static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
+static int ept_misconfig_ptenum;
+#endif
+
static int
vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
@@ -413,7 +426,21 @@ vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
vmexit->u.vmx.exit_qualification);
fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
-
+#ifdef DEBUG_EPT_MISCONFIG
+ if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
+ vm_get_register(ctx, *pvcpu,
+ VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
+ &ept_misconfig_gpa);
+ vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
+ &ept_misconfig_ptenum);
+ fprintf(stderr, "\tEPT misconfiguration:\n");
+ fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
+ fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
+ ept_misconfig_ptenum, ept_misconfig_pte[0],
+ ept_misconfig_pte[1], ept_misconfig_pte[2],
+ ept_misconfig_pte[3]);
+ }
+#endif /* DEBUG_EPT_MISCONFIG */
return (VMEXIT_ABORT);
}
@@ -465,7 +492,7 @@ vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
stats.vmexit_inst_emul++;
err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
- &vmexit->u.inst_emul.vie);
+ &vmexit->u.inst_emul.vie, &vmexit->u.inst_emul.paging);
if (err) {
if (err == EINVAL) {
@@ -515,6 +542,8 @@ vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
exit(1);
case VM_SUSPEND_HALT:
exit(2);
+ case VM_SUSPEND_TRIPLEFAULT:
+ exit(3);
default:
fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
exit(100);
@@ -532,7 +561,8 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
- [VM_EXITCODE_SUSPENDED] = vmexit_suspend
+ [VM_EXITCODE_SUSPENDED] = vmexit_suspend,
+ [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
};
static void
@@ -540,7 +570,6 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
{
int error, rc, prevcpu;
enum vm_exitcode exitcode;
- enum vm_suspend_how how;
cpuset_t active_cpus;
if (vcpumap[vcpu] != NULL) {
@@ -575,16 +604,6 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
case VMEXIT_RESTART:
rip = vmexit[vcpu].rip;
break;
- case VMEXIT_RESET:
- case VMEXIT_POWEROFF:
- if (rc == VMEXIT_RESET)
- how = VM_SUSPEND_RESET;
- else
- how = VM_SUSPEND_POWEROFF;
- error = vm_suspend(ctx, how);
- assert(error == 0 || errno == EALREADY);
- rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
- break;
case VMEXIT_ABORT:
abort();
default:
diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h
index f18d42f..87824ef 100644
--- a/usr.sbin/bhyve/bhyverun.h
+++ b/usr.sbin/bhyve/bhyverun.h
@@ -35,6 +35,10 @@
#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
#endif
+#define VMEXIT_CONTINUE 1 /* continue from next instruction */
+#define VMEXIT_RESTART 2 /* restart current instruction */
+#define VMEXIT_ABORT 3 /* abort the vm run loop */
+
struct vmctx;
extern int guest_ncpus;
extern char *guest_uuid_str;
diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c
index b29bc78..1ec0344 100644
--- a/usr.sbin/bhyve/block_if.c
+++ b/usr.sbin/bhyve/block_if.c
@@ -390,6 +390,55 @@ blockif_close(struct blockif_ctxt *bc)
}
/*
+ * Return virtual C/H/S values for a given block. Use the algorithm
+ * outlined in the VHD specification to calculate values.
+ */
+void
+blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
+{
+ off_t sectors; /* total sectors of the block dev */
+ off_t hcyl; /* cylinders times heads */
+ uint16_t secpt; /* sectors per track */
+ uint8_t heads;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ sectors = bc->bc_size / bc->bc_sectsz;
+
+ /* Clamp the size to the largest possible with CHS */
+ if (sectors > 65535UL*16*255)
+ sectors = 65535UL*16*255;
+
+ if (sectors >= 65536UL*16*63) {
+ secpt = 255;
+ heads = 16;
+ hcyl = sectors / secpt;
+ } else {
+ secpt = 17;
+ hcyl = sectors / secpt;
+ heads = (hcyl + 1023) / 1024;
+
+ if (heads < 4)
+ heads = 4;
+
+ if (hcyl >= (heads * 1024) || heads > 16) {
+ secpt = 31;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ if (hcyl >= (heads * 1024)) {
+ secpt = 63;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ }
+
+ *c = hcyl / heads;
+ *h = heads;
+ *s = secpt;
+}
+
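blockif_chs() clamps large disks to 65535x16x255 and otherwise searches for a geometry per the VHD specification. The standalone copy below can be used to check the geometry reported for a given image size; it is illustrative only, vhd_chs() is a made-up name, and the expected outputs are noted in comments.

#include <sys/types.h>

#include <stdint.h>
#include <stdio.h>

/* Same calculation as blockif_chs() above, parameterized by size and sector size. */
static void
vhd_chs(off_t size, int sectsz, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors, hcyl;
	uint16_t secpt;
	uint8_t heads;

	sectors = size / sectsz;
	if (sectors > 65535UL*16*255)
		sectors = 65535UL*16*255;

	if (sectors >= 65536UL*16*63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;
		if (heads < 4)
			heads = 4;
		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}
	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

int
main(void)
{
	uint16_t c;
	uint8_t h, s;

	vhd_chs(2199023255552LL, 512, &c, &h, &s);	/* 2 TiB image */
	printf("%u/%u/%u\n", (unsigned)c, (unsigned)h, (unsigned)s);	/* 65535/16/255 */
	vhd_chs(64 * 1024 * 1024LL, 512, &c, &h, &s);	/* 64 MiB image */
	printf("%u/%u/%u\n", (unsigned)c, (unsigned)h, (unsigned)s);	/* 963/8/17 */
	return (0);
}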
+/*
* Accessors
*/
off_t
diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h
index e0c0bb1..c2c21f6 100644
--- a/usr.sbin/bhyve/block_if.h
+++ b/usr.sbin/bhyve/block_if.h
@@ -52,6 +52,8 @@ struct blockif_req {
struct blockif_ctxt;
struct blockif_ctxt *blockif_open(const char *optstr, const char *ident);
off_t blockif_size(struct blockif_ctxt *bc);
+void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h,
+ uint8_t *s);
int blockif_sectsz(struct blockif_ctxt *bc);
int blockif_queuesz(struct blockif_ctxt *bc);
int blockif_is_ro(struct blockif_ctxt *bc);
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
index fe9e0d8..1041a59 100644
--- a/usr.sbin/bhyve/inout.c
+++ b/usr.sbin/bhyve/inout.c
@@ -154,31 +154,28 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
/* Limit number of back-to-back in/out emulations to 16 */
iterations = MIN(count, 16);
while (iterations > 0) {
+ assert(retval == 0);
if (vie_calculate_gla(vis->paging.cpu_mode,
vis->seg_name, &vis->seg_desc, index, bytes,
addrsize, prot, &gla)) {
- error = vm_inject_exception2(ctx, vcpu,
- IDT_GP, 0);
- assert(error == 0);
- retval = INOUT_RESTART;
+ vm_inject_gp(ctx, vcpu);
break;
}
- error = vm_gla2gpa(ctx, vcpu, &vis->paging, gla, bytes,
- prot, iov, nitems(iov));
- assert(error == 0 || error == 1 || error == -1);
- if (error) {
- retval = (error == 1) ? INOUT_RESTART :
- INOUT_ERROR;
+ error = vm_copy_setup(ctx, vcpu, &vis->paging, gla,
+ bytes, prot, iov, nitems(iov));
+ if (error == -1) {
+ retval = -1; /* Unrecoverable error */
+ break;
+ } else if (error == 1) {
+ retval = 0; /* Resume guest to handle fault */
break;
}
if (vie_alignment_check(vis->paging.cpl, bytes,
vis->cr0, vis->rflags, gla)) {
- error = vm_inject_exception2(ctx, vcpu,
- IDT_AC, 0);
- assert(error == 0);
- return (INOUT_RESTART);
+ vm_inject_ac(ctx, vcpu, 0);
+ break;
}
val = 0;
@@ -217,8 +214,8 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
}
/* Restart the instruction if more iterations remain */
- if (retval == INOUT_OK && count != 0)
- retval = INOUT_RESTART;
+ if (retval == 0 && count != 0)
+ vmexit->inst_length = 0;
} else {
if (!in) {
val = vmexit->u.inout.eax & vie_size2mask(bytes);
diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h
index f15a2c8..7f39095 100644
--- a/usr.sbin/bhyve/inout.h
+++ b/usr.sbin/bhyve/inout.h
@@ -34,13 +34,9 @@
struct vmctx;
struct vm_exit;
-/* Handler return values. */
-#define INOUT_ERROR -1
-#define INOUT_OK 0
-#define INOUT_RESTART 1
-#define INOUT_RESET 2
-#define INOUT_POWEROFF 3
-
+/*
+ * inout emulation handlers return 0 on success and -1 on failure.
+ */
typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
int bytes, uint32_t *eax, void *arg);
diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c
index 7ea630f..2a9f430 100644
--- a/usr.sbin/bhyve/mem.c
+++ b/usr.sbin/bhyve/mem.c
@@ -157,10 +157,12 @@ mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
}
int
-emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie)
+emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie,
+ struct vm_guest_paging *paging)
+
{
struct mmio_rb_range *entry;
- int err;
+ int err, immutable;
pthread_rwlock_rdlock(&mmio_rwlock);
/*
@@ -184,10 +186,28 @@ emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie)
}
assert(entry != NULL);
- err = vmm_emulate_instruction(ctx, vcpu, paddr, vie,
+
+ /*
+ * An 'immutable' memory range is guaranteed to be never removed
+ * so there is no need to hold 'mmio_rwlock' while calling the
+ * handler.
+ *
+ * XXX writes to the PCIR_COMMAND register can cause register_mem()
+ * to be called. If the guest is using PCI extended config space
+ * to modify the PCIR_COMMAND register then register_mem() can
+ * deadlock on 'mmio_rwlock'. However by registering the extended
+ * config space window as 'immutable' the deadlock can be avoided.
+ */
+ immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE);
+ if (immutable)
+ pthread_rwlock_unlock(&mmio_rwlock);
+
+ err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging,
mem_read, mem_write, &entry->mr_param);
- pthread_rwlock_unlock(&mmio_rwlock);
-
+
+ if (!immutable)
+ pthread_rwlock_unlock(&mmio_rwlock);
+
return (err);
}
@@ -244,6 +264,7 @@ unregister_mem(struct mem_range *memp)
mr = &entry->mr_param;
assert(mr->name == memp->name);
assert(mr->base == memp->base && mr->size == memp->size);
+ assert((mr->flags & MEM_F_IMMUTABLE) == 0);
RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry);
/* flush Per-vCPU cache */
diff --git a/usr.sbin/bhyve/mem.h b/usr.sbin/bhyve/mem.h
index 264bff9..f671eae 100644
--- a/usr.sbin/bhyve/mem.h
+++ b/usr.sbin/bhyve/mem.h
@@ -48,9 +48,11 @@ struct mem_range {
#define MEM_F_READ 0x1
#define MEM_F_WRITE 0x2
#define MEM_F_RW 0x3
+#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */
void init_mem(void);
-int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie);
+int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie,
+ struct vm_guest_paging *paging);
int register_mem(struct mem_range *memp);
int register_mem_fallback(struct mem_range *memp);
diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c
index 9f61107..214237d 100644
--- a/usr.sbin/bhyve/pci_ahci.c
+++ b/usr.sbin/bhyve/pci_ahci.c
@@ -336,8 +336,9 @@ ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd)
fis[13] = cfis[13];
if (fis[2] & ATA_S_ERROR)
p->is |= AHCI_P_IX_TFE;
+ else
+ p->ci &= ~(1 << slot);
p->tfd = tfd;
- p->ci &= ~(1 << slot);
ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
}
@@ -598,10 +599,16 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)
} else {
uint16_t buf[256];
uint64_t sectors;
+ uint16_t cyl;
+ uint8_t sech, heads;
sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
+ blockif_chs(p->bctx, &cyl, &heads, &sech);
memset(buf, 0, sizeof(buf));
buf[0] = 0x0040;
+ buf[1] = cyl;
+ buf[3] = heads;
+ buf[6] = sech;
/* TODO emulate different serial? */
ata_string((uint8_t *)(buf+10), "123456", 20);
ata_string((uint8_t *)(buf+23), "001", 8);
@@ -645,8 +652,8 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)
write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
p->tfd = ATA_S_DSC | ATA_S_READY;
p->is |= AHCI_P_IX_DP;
+ p->ci &= ~(1 << slot);
}
- p->ci &= ~(1 << slot);
ahci_generate_intr(p->pr_sc);
}
@@ -688,8 +695,8 @@ handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis)
write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
p->tfd = ATA_S_DSC | ATA_S_READY;
p->is |= AHCI_P_IX_DHR;
+ p->ci &= ~(1 << slot);
}
- p->ci &= ~(1 << slot);
ahci_generate_intr(p->pr_sc);
}
@@ -1292,7 +1299,6 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
if (!p->atapi) {
p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
p->is |= AHCI_P_IX_TFE;
- p->ci &= ~(1 << slot);
ahci_generate_intr(p->pr_sc);
} else
handle_packet_cmd(p, slot, cfis);
@@ -1301,7 +1307,6 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
WPRINTF("Unsupported cmd:%02x\n", cfis[2]);
p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
p->is |= AHCI_P_IX_TFE;
- p->ci &= ~(1 << slot);
ahci_generate_intr(p->pr_sc);
break;
}
@@ -1369,8 +1374,11 @@ ahci_handle_port(struct ahci_port *p)
* are already in-flight.
*/
for (i = 0; (i < 32) && p->ci; i++) {
- if ((p->ci & (1 << i)) && !(p->pending & (1 << i)))
+ if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) {
+ p->cmd &= ~AHCI_P_CMD_CCS_MASK;
+ p->cmd |= i << AHCI_P_CMD_CCS_SHIFT;
ahci_handle_slot(p, i);
+ }
}
}
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index 458ba76..6b906ed 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -109,16 +109,20 @@ static uint64_t pci_emul_membase64;
#define PCI_EMUL_IOBASE 0x2000
#define PCI_EMUL_IOLIMIT 0x10000
-#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */
+#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */
+#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */
+SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE);
+
+#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE
#define PCI_EMUL_MEMBASE64 0xD000000000UL
#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL
static struct pci_devemu *pci_emul_finddev(char *name);
-static void pci_lintr_route(struct pci_devinst *pi);
-static void pci_lintr_update(struct pci_devinst *pi);
-
-static struct mem_range pci_mem_hole;
+static void pci_lintr_route(struct pci_devinst *pi);
+static void pci_lintr_update(struct pci_devinst *pi);
+static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot,
+ int func, int coff, int bytes, uint32_t *val);
/*
* I/O access
@@ -1023,12 +1027,37 @@ pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
return (0);
}
+static int
+pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int bytes, uint64_t *val, void *arg1, long arg2)
+{
+ int bus, slot, func, coff, in;
+
+ coff = addr & 0xfff;
+ func = (addr >> 12) & 0x7;
+ slot = (addr >> 15) & 0x1f;
+ bus = (addr >> 20) & 0xff;
+ in = (dir == MEM_F_READ);
+ if (in)
+ *val = ~0UL;
+ pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val);
+ return (0);
+}
+
+uint64_t
+pci_ecfg_base(void)
+{
+
+ return (PCI_EMUL_ECFG_BASE);
+}
+
#define BUSIO_ROUNDUP 32
#define BUSMEM_ROUNDUP (1024 * 1024)
int
init_pci(struct vmctx *ctx)
{
+ struct mem_range mr;
struct pci_devemu *pde;
struct businfo *bi;
struct slotinfo *si;
@@ -1112,22 +1141,34 @@ init_pci(struct vmctx *ctx)
* The guest physical memory map looks like the following:
* [0, lowmem) guest system memory
* [lowmem, lowmem_limit) memory hole (may be absent)
- * [lowmem_limit, 4GB) PCI hole (32-bit BAR allocation)
+ * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation)
+ * [0xE0000000, 0xF0000000) PCI extended config window
+ * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware
* [4GB, 4GB + highmem)
- *
+ */
+
+ /*
* Accesses to memory addresses that are not allocated to system
* memory or PCI devices return 0xff's.
*/
lowmem = vm_get_lowmem_size(ctx);
+ bzero(&mr, sizeof(struct mem_range));
+ mr.name = "PCI hole";
+ mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
+ mr.base = lowmem;
+ mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem;
+ mr.handler = pci_emul_fallback_handler;
+ error = register_mem_fallback(&mr);
+ assert(error == 0);
- memset(&pci_mem_hole, 0, sizeof(struct mem_range));
- pci_mem_hole.name = "PCI hole";
- pci_mem_hole.flags = MEM_F_RW;
- pci_mem_hole.base = lowmem;
- pci_mem_hole.size = (4ULL * 1024 * 1024 * 1024) - lowmem;
- pci_mem_hole.handler = pci_emul_fallback_handler;
-
- error = register_mem_fallback(&pci_mem_hole);
+ /* PCI extended config space */
+ bzero(&mr, sizeof(struct mem_range));
+ mr.name = "PCI ECFG";
+ mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
+ mr.base = PCI_EMUL_ECFG_BASE;
+ mr.size = PCI_EMUL_ECFG_SIZE;
+ mr.handler = pci_emul_ecfg_handler;
+ error = register_mem(&mr);
assert(error == 0);
return (0);
@@ -1612,41 +1653,6 @@ pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv)
}
}
-static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff;
-
-static int
-pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
- uint32_t *eax, void *arg)
-{
- uint32_t x;
-
- if (bytes != 4) {
- if (in)
- *eax = (bytes == 2) ? 0xffff : 0xff;
- return (0);
- }
-
- if (in) {
- x = (cfgbus << 16) |
- (cfgslot << 11) |
- (cfgfunc << 8) |
- cfgoff;
- if (cfgenable)
- x |= CONF1_ENABLE;
- *eax = x;
- } else {
- x = *eax;
- cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE;
- cfgoff = x & PCI_REGMAX;
- cfgfunc = (x >> 8) & PCI_FUNCMAX;
- cfgslot = (x >> 11) & PCI_SLOTMAX;
- cfgbus = (x >> 16) & PCI_BUSMAX;
- }
-
- return (0);
-}
-INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);
-
static uint32_t
bits_changed(uint32_t old, uint32_t new, uint32_t mask)
{
@@ -1709,41 +1715,51 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes)
pci_lintr_update(pi);
}
-static int
-pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
- uint32_t *eax, void *arg)
+static void
+pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func,
+ int coff, int bytes, uint32_t *eax)
{
struct businfo *bi;
struct slotinfo *si;
struct pci_devinst *pi;
struct pci_devemu *pe;
- int coff, idx, needcfg;
+ int idx, needcfg;
uint64_t addr, bar, mask;
- assert(bytes == 1 || bytes == 2 || bytes == 4);
-
- if ((bi = pci_businfo[cfgbus]) != NULL) {
- si = &bi->slotinfo[cfgslot];
- pi = si->si_funcs[cfgfunc].fi_devi;
+ if ((bi = pci_businfo[bus]) != NULL) {
+ si = &bi->slotinfo[slot];
+ pi = si->si_funcs[func].fi_devi;
} else
pi = NULL;
- coff = cfgoff + (port - CONF1_DATA_PORT);
-
-#if 0
- printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r",
- in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc);
-#endif
-
/*
- * Just return if there is no device at this cfgslot:cfgfunc,
- * if the guest is doing an un-aligned access, or if the config
- * address word isn't enabled.
+ * Just return if there is no device at this slot:func or if the
+ * guest is doing an un-aligned access.
*/
- if (!cfgenable || pi == NULL || (coff & (bytes - 1)) != 0) {
+ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) ||
+ (coff & (bytes - 1)) != 0) {
if (in)
*eax = 0xffffffff;
- return (0);
+ return;
+ }
+
+ /*
+ * Ignore all writes beyond the standard config space and return all
+ * ones on reads.
+ */
+ if (coff >= PCI_REGMAX + 1) {
+ if (in) {
+ *eax = 0xffffffff;
+ /*
+ * Extended capabilities begin at offset 256 in config
+ * space. Absence of extended capabilities is signaled
+ * with all 0s in the extended capability header at
+ * offset 256.
+ */
+ if (coff <= PCI_REGMAX + 4)
+ *eax = 0x00000000;
+ }
+ return;
}
pe = pi->pi_d;
@@ -1754,8 +1770,8 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
if (in) {
/* Let the device emulation override the default handler */
if (pe->pe_cfgread != NULL) {
- needcfg = pe->pe_cfgread(ctx, vcpu, pi,
- coff, bytes, eax);
+ needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes,
+ eax);
} else {
needcfg = 1;
}
@@ -1769,12 +1785,12 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
*eax = pci_get_cfgdata32(pi, coff);
}
- pci_emul_hdrtype_fixup(cfgbus, cfgslot, coff, bytes, eax);
+ pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax);
} else {
/* Let the device emulation override the default handler */
if (pe->pe_cfgwrite != NULL &&
(*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
- return (0);
+ return;
/*
* Special handling for write to BAR registers
@@ -1785,7 +1801,7 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
* 4-byte aligned.
*/
if (bytes != 4 || (coff & 0x3) != 0)
- return (0);
+ return;
idx = (coff - PCIR_BAR(0)) / 4;
mask = ~(pi->pi_bar[idx].size - 1);
switch (pi->pi_bar[idx].type) {
@@ -1843,7 +1859,57 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
CFGWRITE(pi, coff, *eax, bytes);
}
}
+}
+
+static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff;
+
+static int
+pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ uint32_t x;
+
+ if (bytes != 4) {
+ if (in)
+ *eax = (bytes == 2) ? 0xffff : 0xff;
+ return (0);
+ }
+
+ if (in) {
+ x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff;
+ if (cfgenable)
+ x |= CONF1_ENABLE;
+ *eax = x;
+ } else {
+ x = *eax;
+ cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE;
+ cfgoff = x & PCI_REGMAX;
+ cfgfunc = (x >> 8) & PCI_FUNCMAX;
+ cfgslot = (x >> 11) & PCI_SLOTMAX;
+ cfgbus = (x >> 16) & PCI_BUSMAX;
+ }
+
+ return (0);
+}
+INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);
+
+static int
+pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int coff;
+ assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+ coff = cfgoff + (port - CONF1_DATA_PORT);
+ if (cfgenable) {
+ pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes,
+ eax);
+ } else {
+ /* Ignore accesses to cfgdata if not enabled by cfgaddr */
+ if (in)
+ *eax = 0xffffffff;
+ }
return (0);
}
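
pci_emul_ecfg_handler() above decodes a memory-mapped extended-config (ECAM-style) access into bus/slot/function/register with fixed field widths, while the retained pci_emul_cfgaddr() handler keeps the legacy 0xCF8 address word that can only reach the first 256 bytes of config space. A standalone sketch of both encodings, using the same shifts and masks as the code above (the example values are arbitrary):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* ECAM-style offset within the extended config window. */
        uint64_t off = (2ULL << 20) | (3ULL << 15) | (1ULL << 12) | 0x100;
        int bus  = (off >> 20) & 0xff;   /* 8-bit bus */
        int slot = (off >> 15) & 0x1f;   /* 5-bit slot */
        int func = (off >> 12) & 0x7;    /* 3-bit function */
        int coff = off & 0xfff;          /* 12-bit register offset */

        /*
         * Legacy CONF1 address word written to port 0xCF8; bit 31 is the
         * enable bit and the register offset is limited to 8 bits, which
         * is why extended offsets such as 0x100 are unreachable this way.
         */
        uint32_t cf8 = (1U << 31) | (bus << 16) | (slot << 11) |
            (func << 8) | (coff & 0xff);

        printf("ecfg %d/%d/%d reg 0x%x, cf8 word 0x%08x\n",
            bus, slot, func, coff, (unsigned)cf8);
        return (0);
    }
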
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
index 866ffc5..6b8c4e0 100644
--- a/usr.sbin/bhyve/pci_emul.h
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -235,6 +235,7 @@ uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);
int pci_count_lintr(int bus);
void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);
void pci_write_dsdt(void);
+uint64_t pci_ecfg_base(void);
int pci_bus_configured(int bus);
static __inline void
diff --git a/usr.sbin/bhyve/pci_irq.c b/usr.sbin/bhyve/pci_irq.c
index 653aeb0..20e033f 100644
--- a/usr.sbin/bhyve/pci_irq.c
+++ b/usr.sbin/bhyve/pci_irq.c
@@ -115,7 +115,7 @@ void
pci_irq_reserve(int irq)
{
- assert(irq < nitems(irq_counts));
+ assert(irq >= 0 && irq < nitems(irq_counts));
assert(pirq_cold);
assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED);
irq_counts[irq] = IRQ_DISABLED;
@@ -125,10 +125,10 @@ void
pci_irq_use(int irq)
{
- assert(irq < nitems(irq_counts));
+ assert(irq >= 0 && irq < nitems(irq_counts));
assert(pirq_cold);
- if (irq_counts[irq] != IRQ_DISABLED)
- irq_counts[irq]++;
+ assert(irq_counts[irq] != IRQ_DISABLED);
+ irq_counts[irq]++;
}
void
@@ -197,7 +197,7 @@ pirq_alloc_pin(struct vmctx *ctx)
{
int best_count, best_irq, best_pin, irq, pin;
- pirq_cold = 1;
+ pirq_cold = 0;
/* First, find the least-used PIRQ pin. */
best_pin = 0;
@@ -222,7 +222,7 @@ pirq_alloc_pin(struct vmctx *ctx)
best_count = irq_counts[irq];
}
}
- assert(best_irq != 0);
+ assert(best_irq >= 0);
irq_counts[best_irq]++;
pirqs[best_pin].reg = best_irq;
vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER);
@@ -234,9 +234,6 @@ pirq_alloc_pin(struct vmctx *ctx)
int
pirq_irq(int pin)
{
-
- if (pin == -1)
- return (255);
assert(pin > 0 && pin <= nitems(pirqs));
return (pirqs[pin - 1].reg & PIRQ_IRQ);
}
diff --git a/usr.sbin/bhyve/pm.c b/usr.sbin/bhyve/pm.c
index 67126d8..f5a2d43 100644
--- a/usr.sbin/bhyve/pm.c
+++ b/usr.sbin/bhyve/pm.c
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
#include <assert.h>
+#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <vmmapi.h>
@@ -56,6 +57,8 @@ static int
reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
+ int error;
+
static uint8_t reset_control;
if (bytes != 1)
@@ -66,8 +69,10 @@ reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
reset_control = *eax;
/* Treat hard and soft resets the same. */
- if (reset_control & 0x4)
- return (INOUT_RESET);
+ if (reset_control & 0x4) {
+ error = vm_suspend(ctx, VM_SUSPEND_RESET);
+ assert(error == 0 || errno == EALREADY);
+ }
}
return (0);
}
@@ -224,6 +229,7 @@ static int
pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
+ int error;
if (bytes != 2)
return (-1);
@@ -243,8 +249,10 @@ pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
* says that '5' should be stored in SLP_TYP for S5.
*/
if (*eax & PM1_SLP_EN) {
- if ((pm1_control & PM1_SLP_TYP) >> 10 == 5)
- return (INOUT_POWEROFF);
+ if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) {
+ error = vm_suspend(ctx, VM_SUSPEND_POWEROFF);
+ assert(error == 0 || errno == EALREADY);
+ }
}
}
return (0);
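
The PM1 control handler above now calls vm_suspend(VM_SUSPEND_POWEROFF) when the guest requests S5. A worked example of the register value involved, assuming the usual ACPI PM1_CNT layout implied by the shift in the handler (SLP_TYP in bits 12:10, SLP_EN in bit 13):

    #include <assert.h>
    #include <stdint.h>

    #define PM1_SLP_TYP 0x1c00  /* SLP_TYP field, bits 12:10 */
    #define PM1_SLP_EN  0x2000  /* SLP_EN, bit 13 */

    int
    main(void)
    {
        /* A guest requesting S5 writes SLP_TYP = 5 with SLP_EN set: 0x3400. */
        uint16_t val = (5 << 10) | PM1_SLP_EN;

        assert((val & PM1_SLP_EN) != 0);
        assert(((val & PM1_SLP_TYP) >> 10) == 5);  /* the check made above */
        return (0);
    }
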
diff --git a/usr.sbin/bhyve/smbiostbl.c b/usr.sbin/bhyve/smbiostbl.c
index d560f02..28c7eb2 100644
--- a/usr.sbin/bhyve/smbiostbl.c
+++ b/usr.sbin/bhyve/smbiostbl.c
@@ -321,8 +321,8 @@ struct smbios_table_type0 smbios_type0_template = {
const char *smbios_type0_strings[] = {
"BHYVE", /* vendor string */
- __TIME__, /* bios version string */
- __DATE__, /* bios release date string */
+ "1.00", /* bios version string */
+ "03/14/2014", /* bios release date string */
NULL
};
diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c
new file mode 100644
index 0000000..0002da8
--- /dev/null
+++ b/usr.sbin/bhyve/task_switch.c
@@ -0,0 +1,932 @@
+/*-
+ * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/_iovec.h>
+#include <sys/mman.h>
+
+#include <x86/psl.h>
+#include <x86/segments.h>
+#include <x86/specialreg.h>
+#include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <errno.h>
+
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+
+/*
+ * Using 'struct i386tss' is tempting but causes myriad sign extension
+ * issues because all of its fields are defined as signed integers.
+ */
+struct tss32 {
+ uint16_t tss_link;
+ uint16_t rsvd1;
+ uint32_t tss_esp0;
+ uint16_t tss_ss0;
+ uint16_t rsvd2;
+ uint32_t tss_esp1;
+ uint16_t tss_ss1;
+ uint16_t rsvd3;
+ uint32_t tss_esp2;
+ uint16_t tss_ss2;
+ uint16_t rsvd4;
+ uint32_t tss_cr3;
+ uint32_t tss_eip;
+ uint32_t tss_eflags;
+ uint32_t tss_eax;
+ uint32_t tss_ecx;
+ uint32_t tss_edx;
+ uint32_t tss_ebx;
+ uint32_t tss_esp;
+ uint32_t tss_ebp;
+ uint32_t tss_esi;
+ uint32_t tss_edi;
+ uint16_t tss_es;
+ uint16_t rsvd5;
+ uint16_t tss_cs;
+ uint16_t rsvd6;
+ uint16_t tss_ss;
+ uint16_t rsvd7;
+ uint16_t tss_ds;
+ uint16_t rsvd8;
+ uint16_t tss_fs;
+ uint16_t rsvd9;
+ uint16_t tss_gs;
+ uint16_t rsvd10;
+ uint16_t tss_ldt;
+ uint16_t rsvd11;
+ uint16_t tss_trap;
+ uint16_t tss_iomap;
+};
+CTASSERT(sizeof(struct tss32) == 104);
+
+#define SEL_START(sel) (((sel) & ~0x7))
+#define SEL_LIMIT(sel) (((sel) | 0x7))
+#define TSS_BUSY(type) (((type) & 0x2) != 0)
+
+static uint64_t
+GETREG(struct vmctx *ctx, int vcpu, int reg)
+{
+ uint64_t val;
+ int error;
+
+ error = vm_get_register(ctx, vcpu, reg, &val);
+ assert(error == 0);
+ return (val);
+}
+
+static void
+SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
+{
+ int error;
+
+ error = vm_set_register(ctx, vcpu, reg, val);
+ assert(error == 0);
+}
+
+static struct seg_desc
+usd_to_seg_desc(struct user_segment_descriptor *usd)
+{
+ struct seg_desc seg_desc;
+
+ seg_desc.base = (u_int)USD_GETBASE(usd);
+ if (usd->sd_gran)
+ seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
+ else
+ seg_desc.limit = (u_int)USD_GETLIMIT(usd);
+ seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
+ seg_desc.access |= usd->sd_xx << 12;
+ seg_desc.access |= usd->sd_def32 << 14;
+ seg_desc.access |= usd->sd_gran << 15;
+
+ return (seg_desc);
+}
+
+/*
+ * Inject an exception with an error code that is a segment selector.
+ * The format of the error code is described in section 6.13, "Error Code",
+ * Intel SDM volume 3.
+ *
+ * Bit 0 (EXT) denotes whether the exception occurred during delivery
+ * of an external event like an interrupt.
+ *
+ * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
+ * in the IDT.
+ *
+ * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
+ */
+static void
+sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
+{
+ /*
+ * Bit 2 from the selector is retained as-is in the error code.
+ *
+ * Bit 1 can be safely cleared because none of the selectors
+ * encountered during task switch emulation refer to a task
+ * gate in the IDT.
+ *
+ * Bit 0 is set depending on the value of 'ext'.
+ */
+ sel &= ~0x3;
+ if (ext)
+ sel |= 0x1;
+ vm_inject_fault(ctx, vcpu, vector, 1, sel);
+}
+
+/*
+ * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
+ * and non-zero otherwise.
+ */
+static int
+desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
+{
+ uint64_t base;
+ uint32_t limit, access;
+ int error, reg;
+
+ reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
+ error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
+ assert(error == 0);
+
+ if (reg == VM_REG_GUEST_LDTR) {
+ if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
+ return (-1);
+ }
+
+ if (limit < SEL_LIMIT(sel))
+ return (-1);
+ else
+ return (0);
+}
+
+/*
+ * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
+ * by the selector 'sel'.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ */
+static int
+desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ uint16_t sel, struct user_segment_descriptor *desc, bool doread)
+{
+ struct iovec iov[2];
+ uint64_t base;
+ uint32_t limit, access;
+ int error, reg;
+
+ reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
+ error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
+ assert(error == 0);
+ assert(limit >= SEL_LIMIT(sel));
+
+ error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
+ sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov));
+ if (error == 0) {
+ if (doread)
+ vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
+ else
+ vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
+ }
+ return (error);
+}
+
+static int
+desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ uint16_t sel, struct user_segment_descriptor *desc)
+{
+ return (desc_table_rw(ctx, vcpu, paging, sel, desc, true));
+}
+
+static int
+desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ uint16_t sel, struct user_segment_descriptor *desc)
+{
+ return (desc_table_rw(ctx, vcpu, paging, sel, desc, false));
+}
+
+/*
+ * Read the TSS descriptor referenced by 'sel' into 'desc'.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ */
+static int
+read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
+ uint16_t sel, struct user_segment_descriptor *desc)
+{
+ struct vm_guest_paging sup_paging;
+ int error;
+
+ assert(!ISLDT(sel));
+ assert(IDXSEL(sel) != 0);
+
+ /* Fetch the new TSS descriptor */
+ if (desc_table_limit_check(ctx, vcpu, sel)) {
+ if (ts->reason == TSR_IRET)
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ else
+ sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
+ return (1);
+ }
+
+ sup_paging = ts->paging;
+ sup_paging.cpl = 0; /* implicit supervisor mode */
+ error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc);
+ return (error);
+}
+
+static bool
+code_desc(int sd_type)
+{
+ /* code descriptor */
+ return ((sd_type & 0x18) == 0x18);
+}
+
+static bool
+stack_desc(int sd_type)
+{
+ /* writable data descriptor */
+ return ((sd_type & 0x1A) == 0x12);
+}
+
+static bool
+data_desc(int sd_type)
+{
+ /* data descriptor or a readable code descriptor */
+ return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
+}
+
+static bool
+ldt_desc(int sd_type)
+{
+
+ return (sd_type == SDT_SYSLDT);
+}
+
+/*
+ * Validate the descriptor 'seg_desc' associated with 'segment'.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ */
+static int
+validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
+ int segment, struct seg_desc *seg_desc)
+{
+ struct vm_guest_paging sup_paging;
+ struct user_segment_descriptor usd;
+ int error, idtvec;
+ int cpl, dpl, rpl;
+ uint16_t sel, cs;
+ bool ldtseg, codeseg, stackseg, dataseg, conforming;
+
+ ldtseg = codeseg = stackseg = dataseg = false;
+ switch (segment) {
+ case VM_REG_GUEST_LDTR:
+ ldtseg = true;
+ break;
+ case VM_REG_GUEST_CS:
+ codeseg = true;
+ break;
+ case VM_REG_GUEST_SS:
+ stackseg = true;
+ break;
+ case VM_REG_GUEST_DS:
+ case VM_REG_GUEST_ES:
+ case VM_REG_GUEST_FS:
+ case VM_REG_GUEST_GS:
+ dataseg = true;
+ break;
+ default:
+ assert(0);
+ }
+
+ /* Get the segment selector */
+ sel = GETREG(ctx, vcpu, segment);
+
+ /* LDT selector must point into the GDT */
+ if (ldtseg && ISLDT(sel)) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ /* Descriptor table limit check */
+ if (desc_table_limit_check(ctx, vcpu, sel)) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ /* NULL selector */
+ if (IDXSEL(sel) == 0) {
+ /* Code and stack segment selectors cannot be NULL */
+ if (codeseg || stackseg) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+ seg_desc->base = 0;
+ seg_desc->limit = 0;
+ seg_desc->access = 0x10000; /* unusable */
+ return (0);
+ }
+
+ /* Read the descriptor from the GDT/LDT */
+ sup_paging = ts->paging;
+ sup_paging.cpl = 0; /* implicit supervisor mode */
+ error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd);
+ if (error)
+ return (error);
+
+ /* Verify that the descriptor type is compatible with the segment */
+ if ((ldtseg && !ldt_desc(usd.sd_type)) ||
+ (codeseg && !code_desc(usd.sd_type)) ||
+ (dataseg && !data_desc(usd.sd_type)) ||
+ (stackseg && !stack_desc(usd.sd_type))) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ /* Segment must be marked present */
+ if (!usd.sd_p) {
+ if (ldtseg)
+ idtvec = IDT_TS;
+ else if (stackseg)
+ idtvec = IDT_SS;
+ else
+ idtvec = IDT_NP;
+ sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
+ return (1);
+ }
+
+ cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
+ cpl = cs & SEL_RPL_MASK;
+ rpl = sel & SEL_RPL_MASK;
+ dpl = usd.sd_dpl;
+
+ if (stackseg && (rpl != cpl || dpl != cpl)) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ if (codeseg) {
+ conforming = (usd.sd_type & 0x4) ? true : false;
+ if ((conforming && (cpl < dpl)) ||
+ (!conforming && (cpl != dpl))) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+ }
+
+ if (dataseg) {
+ /*
+ * A data segment is always non-conforming except when its
+ * descriptor is a readable, conforming code segment.
+ */
+ if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
+ conforming = true;
+ else
+ conforming = false;
+
+ if (!conforming && (rpl > dpl || cpl > dpl)) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+ }
+ *seg_desc = usd_to_seg_desc(&usd);
+ return (0);
+}
+
+static void
+tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
+ uint32_t eip, struct tss32 *tss, struct iovec *iov)
+{
+
+ /* General purpose registers */
+ tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
+ tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
+ tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
+ tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
+ tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
+ tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
+ tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
+ tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);
+
+ /* Segment selectors */
+ tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
+ tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
+ tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
+ tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
+ tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
+ tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);
+
+ /* eflags and eip */
+ tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
+ if (task_switch->reason == TSR_IRET)
+ tss->tss_eflags &= ~PSL_NT;
+ tss->tss_eip = eip;
+
+ /* Copy updated old TSS into guest memory */
+ vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
+}
+
+static void
+update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
+{
+ int error;
+
+ error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
+ assert(error == 0);
+}
+
+/*
+ * Update the vcpu registers to reflect the state of the new task.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ */
+static int
+tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
+ uint16_t ot_sel, struct tss32 *tss, struct iovec *iov)
+{
+ struct seg_desc seg_desc, seg_desc2;
+ uint64_t *pdpte, maxphyaddr, reserved;
+ uint32_t eflags;
+ int error, i;
+ bool nested;
+
+ nested = false;
+ if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
+ tss->tss_link = ot_sel;
+ nested = true;
+ }
+
+ eflags = tss->tss_eflags;
+ if (nested)
+ eflags |= PSL_NT;
+
+ /* LDTR */
+ SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
+
+ /* PDBR */
+ if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
+ if (ts->paging.paging_mode == PAGING_MODE_PAE) {
+ /*
+ * XXX Assuming 36-bit MAXPHYADDR.
+ */
+ maxphyaddr = (1UL << 36) - 1;
+ pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
+ for (i = 0; i < 4; i++) {
+ /* Check reserved bits if the PDPTE is valid */
+ if (!(pdpte[i] & 0x1))
+ continue;
+ /*
+ * Bits 2:1, 8:5 and bits above the processor's
+ * maximum physical address are reserved.
+ */
+ reserved = ~maxphyaddr | 0x1E6;
+ if (pdpte[i] & reserved) {
+ vm_inject_gp(ctx, vcpu);
+ return (1);
+ }
+ }
+ SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
+ SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
+ SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
+ SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
+ }
+ SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
+ ts->paging.cr3 = tss->tss_cr3;
+ }
+
+ /* eflags and eip */
+ SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);
+
+ /* General purpose registers */
+ SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);
+
+ /* Segment selectors */
+ SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
+ SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
+ SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
+ SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
+ SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
+ SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);
+
+ /*
+ * If this is a nested task then write out the new TSS to update
+ * the previous link field.
+ */
+ if (nested)
+ vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));
+
+ /* Validate segment descriptors */
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc);
+ if (error)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);
+
+ /*
+ * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
+ *
+ * The SS and CS attribute checks on VM-entry are inter-dependent so
+ * we need to make sure that both segments are valid before updating
+ * either of them. This ensures that the VMCS state can pass the
+ * VM-entry checks so the guest can handle any exception injected
+ * during task switch emulation.
+ */
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc);
+ if (error)
+ return (error);
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2);
+ if (error)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
+ ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc);
+ if (error)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc);
+ if (error)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc);
+ if (error)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc);
+ if (error)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);
+
+ return (0);
+}
+
+/*
+ * Push an error code on the stack of the new task. This is needed if the
+ * task switch was triggered by a hardware exception that causes an error
+ * code to be saved (e.g. #PF).
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ */
+static int
+push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ int task_type, uint32_t errcode)
+{
+ struct iovec iov[2];
+ struct seg_desc seg_desc;
+ int stacksize, bytes, error;
+ uint64_t gla, cr0, rflags;
+ uint32_t esp;
+ uint16_t stacksel;
+
+ cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
+ rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
+ stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
+
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
+ &seg_desc.limit, &seg_desc.access);
+ assert(error == 0);
+
+ /*
+ * Section "Error Code" in the Intel SDM vol 3: the error code is
+ * pushed on the stack as a doubleword or word (depending on the
+ * default interrupt, trap or task gate size).
+ */
+ if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
+ bytes = 4;
+ else
+ bytes = 2;
+
+ /*
+ * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
+ * stack-segment descriptor determines the size of the stack
+ * pointer outside of 64-bit mode.
+ */
+ if (SEG_DESC_DEF32(seg_desc.access))
+ stacksize = 4;
+ else
+ stacksize = 2;
+
+ esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
+ esp -= bytes;
+
+ if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
+ &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
+ sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
+ return (1);
+ }
+
+ if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
+ vm_inject_ac(ctx, vcpu, 1);
+ return (1);
+ }
+
+ error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
+ iov, nitems(iov));
+ if (error)
+ return (error);
+
+ vm_copyout(ctx, vcpu, &errcode, iov, bytes);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
+ return (0);
+}
+
+/*
+ * Evaluate return value from helper functions and potentially return to
+ * the VM run loop.
+ * 0: success
+ * +1: an exception was injected into the guest vcpu
+ * -1: unrecoverable/programming error
+ */
+#define CHKERR(x) \
+ do { \
+ assert(((x) == 0) || ((x) == 1) || ((x) == -1)); \
+ if ((x) == -1) \
+ return (VMEXIT_ABORT); \
+ else if ((x) == 1) \
+ return (VMEXIT_CONTINUE); \
+ } while (0)
+
+int
+vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ struct seg_desc nt;
+ struct tss32 oldtss, newtss;
+ struct vm_task_switch *task_switch;
+ struct vm_guest_paging *paging, sup_paging;
+ struct user_segment_descriptor nt_desc, ot_desc;
+ struct iovec nt_iov[2], ot_iov[2];
+ uint64_t cr0, ot_base;
+ uint32_t eip, ot_lim, access;
+ int error, ext, minlimit, nt_type, ot_type, vcpu;
+ enum task_switch_reason reason;
+ uint16_t nt_sel, ot_sel;
+
+ task_switch = &vmexit->u.task_switch;
+ nt_sel = task_switch->tsssel;
+ ext = vmexit->u.task_switch.ext;
+ reason = vmexit->u.task_switch.reason;
+ paging = &vmexit->u.task_switch.paging;
+ vcpu = *pvcpu;
+
+ assert(paging->cpu_mode == CPU_MODE_PROTECTED);
+
+ /*
+ * Section 4.6, "Access Rights" in Intel SDM Vol 3.
+ * The following page table accesses are implicitly supervisor mode:
+ * - accesses to GDT or LDT to load segment descriptors
+ * - accesses to the task state segment during task switch
+ */
+ sup_paging = *paging;
+ sup_paging.cpl = 0; /* implicit supervisor mode */
+
+ /* Fetch the new TSS descriptor */
+ error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc);
+ CHKERR(error);
+
+ nt = usd_to_seg_desc(&nt_desc);
+
+ /* Verify the type of the new TSS */
+ nt_type = SEG_DESC_TYPE(nt.access);
+ if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
+ nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
+ sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
+ goto done;
+ }
+
+ /* TSS descriptor must have present bit set */
+ if (!SEG_DESC_PRESENT(nt.access)) {
+ sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
+ goto done;
+ }
+
+ /*
+ * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
+ * 44 bytes for a 16-bit TSS.
+ */
+ if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
+ minlimit = 104 - 1;
+ else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
+ minlimit = 44 - 1;
+ else
+ minlimit = 0;
+
+ assert(minlimit > 0);
+ if (nt.limit < minlimit) {
+ sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
+ goto done;
+ }
+
+ /* TSS must be busy if task switch is due to IRET */
+ if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
+ sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
+ goto done;
+ }
+
+ /*
+ * TSS must be available (not busy) if task switch reason is
+ * CALL, JMP, exception or interrupt.
+ */
+ if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
+ sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
+ goto done;
+ }
+
+ /* Fetch the new TSS */
+ error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
+ PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov));
+ CHKERR(error);
+ vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);
+
+ /* Get the old TSS selector from the guest's task register */
+ ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
+ if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
+ /*
+ * This might happen if a task switch was attempted without
+ * ever loading the task register with LTR. In this case the
+ * TR would contain the values from power-on:
+ * (sel = 0, base = 0, limit = 0xffff).
+ */
+ sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
+ goto done;
+ }
+
+ /* Get the old TSS base and limit from the guest's task register */
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
+ &access);
+ assert(error == 0);
+ assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
+ ot_type = SEG_DESC_TYPE(access);
+ assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);
+
+ /* Fetch the old TSS descriptor */
+ error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc);
+ CHKERR(error);
+
+ /* Get the old TSS */
+ error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
+ PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov));
+ CHKERR(error);
+ vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);
+
+ /*
+ * Clear the busy bit in the old TSS descriptor if the task switch
+ * is due to an IRET or JMP instruction.
+ */
+ if (reason == TSR_IRET || reason == TSR_JMP) {
+ ot_desc.sd_type &= ~0x2;
+ error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
+ &ot_desc);
+ CHKERR(error);
+ }
+
+ if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
+ fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
+ return (VMEXIT_ABORT);
+ }
+
+ /* Save processor state in old TSS */
+ eip = vmexit->rip + vmexit->inst_length;
+ tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);
+
+ /*
+ * If the task switch was triggered for any reason other than IRET
+ * then set the busy bit in the new TSS descriptor.
+ */
+ if (reason != TSR_IRET) {
+ nt_desc.sd_type |= 0x2;
+ error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
+ &nt_desc);
+ CHKERR(error);
+ }
+
+ /* Update task register to point at the new TSS */
+ SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);
+
+ /* Update the hidden descriptor state of the task register */
+ nt = usd_to_seg_desc(&nt_desc);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);
+
+ /* Set CR0.TS */
+ cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
+ SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);
+
+ /*
+ * We are now committed to the task switch. Any exceptions encountered
+ * after this point will be handled in the context of the new task and
+ * the saved instruction pointer will belong to the new task.
+ */
+ vmexit->rip = newtss.tss_eip;
+ vmexit->inst_length = 0;
+
+ /* Load processor state from new TSS */
+ error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov);
+ CHKERR(error);
+
+ /*
+ * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
+ * caused an error code to be generated, this error code is copied
+ * to the stack of the new task.
+ */
+ if (task_switch->errcode_valid) {
+ assert(task_switch->ext);
+ assert(task_switch->reason == TSR_IDT_GATE);
+ error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
+ task_switch->errcode);
+ CHKERR(error);
+ }
+
+ /*
+ * Treatment of virtual-NMI blocking if NMI is delivered through
+ * a task gate.
+ *
+ * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
+ * If the virtual NMIs VM-execution control is 1, VM entry injects
+ * an NMI, and delivery of the NMI causes a task switch that causes
+ * a VM exit, virtual-NMI blocking is in effect before the VM exit
+ * commences.
+ *
+ * Thus, virtual-NMI blocking is in effect at the time of the task
+ * switch VM exit.
+ */
+
+ /*
+ * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
+ *
+ * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
+ * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
+ * This unblocking of virtual-NMI occurs even if IRET causes a fault.
+ *
+ * Thus, virtual-NMI blocking is cleared at the time of the task switch
+ * VM exit.
+ */
+
+ /*
+ * If the task switch was triggered by an event delivered through
+ * the IDT then extinguish the pending event from the vcpu's
+ * exitintinfo.
+ */
+ if (task_switch->reason == TSR_IDT_GATE) {
+ error = vm_set_intinfo(ctx, vcpu, 0);
+ assert(error == 0);
+ }
+
+ /*
+ * XXX should inject debug exception if 'T' bit is 1
+ */
+done:
+ return (VMEXIT_CONTINUE);
+}
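
Two of the bit manipulations in task_switch.c, worked out as a standalone sketch: the selector error code constructed by sel_exception() (TI kept, IDT bit cleared, EXT bit set when requested) and the PAE PDPTE reserved-bit mask used by tss32_restore() (bits 2:1 and 8:5 plus everything above the assumed 36-bit MAXPHYADDR). The selector value is arbitrary:

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint16_t sel = 0x002b;          /* GDT index 5, RPL 3 */
        uint16_t errcode;
        uint64_t maxphyaddr, reserved;

        /* sel_exception(): clear bits 1:0 of the selector, set bit 0 for EXT. */
        errcode = (sel & ~0x3) | 0x1;
        assert(errcode == 0x0029);

        /* tss32_restore(): reserved PDPTE bits, 36-bit MAXPHYADDR assumed. */
        maxphyaddr = (1ULL << 36) - 1;
        reserved = ~maxphyaddr | 0x1E6;  /* bits 2:1 (0x006) | bits 8:5 (0x1E0) */
        assert((reserved & 0xfff) == 0x1E6);
        return (0);
    }
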
diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c
index 4e58dd6..1f27300 100644
--- a/usr.sbin/bhyve/virtio.c
+++ b/usr.sbin/bhyve/virtio.c
@@ -437,7 +437,7 @@ vq_endchains(struct vqueue_info *vq, int used_all_avail)
if (used_all_avail &&
(vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))
intr = 1;
- else if (vs->vs_flags & VIRTIO_EVENT_IDX) {
+ else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {
event_idx = VQ_USED_EVENT_IDX(vq);
/*
* This calculation is per docs and the kernel
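
The calculation mentioned in the comment above ("per docs and the kernel") is the virtio event-index test: with VIRTIO_RING_F_EVENT_IDX negotiated, an interrupt is raised only if the used index has moved past the guest's used_event value since the last notification. A minimal sketch of that test as given in the virtio specification (the function name follows the spec, not a bhyve identifier):

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Event-index suppression test from the virtio spec: true when new_idx
     * has stepped past event_idx since old_idx, with 16-bit wraparound
     * handled by the unsigned subtractions.
     */
    static bool
    vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old_idx)
    {
        return ((uint16_t)(new_idx - event_idx - 1) <
            (uint16_t)(new_idx - old_idx));
    }
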
diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h
index 01b5f7b..1f29dfa 100644
--- a/usr.sbin/bhyve/virtio.h
+++ b/usr.sbin/bhyve/virtio.h
@@ -352,7 +352,7 @@ struct virtio_consts {
/* called to read config regs */
int (*vc_cfgwrite)(void *, int, int, uint32_t);
/* called to write config regs */
- uint32_t vc_hv_caps; /* hypervisor-provided capabilities */
+ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */
};
/*
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
index e77f0d7..b6006b7 100644
--- a/usr.sbin/bhyvectl/bhyvectl.c
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -195,7 +195,8 @@ usage(void)
" [--force-reset]\n"
" [--force-poweroff]\n"
" [--get-active-cpus]\n"
- " [--get-suspended-cpus]\n",
+ " [--get-suspended-cpus]\n"
+ " [--get-intinfo]\n",
progname);
exit(1);
}
@@ -205,6 +206,7 @@ static int inject_nmi, assert_lapic_lvt;
static int force_reset, force_poweroff;
static const char *capname;
static int create, destroy, get_lowmem, get_highmem;
+static int get_intinfo;
static int get_active_cpus, get_suspended_cpus;
static uint64_t memsize;
static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4;
@@ -412,6 +414,37 @@ print_cpus(const char *banner, const cpuset_t *cpus)
printf("\n");
}
+static void
+print_intinfo(const char *banner, uint64_t info)
+{
+ int type;
+
+ printf("%s:\t", banner);
+ if (info & VM_INTINFO_VALID) {
+ type = info & VM_INTINFO_TYPE;
+ switch (type) {
+ case VM_INTINFO_HWINTR:
+ printf("extint");
+ break;
+ case VM_INTINFO_NMI:
+ printf("nmi");
+ break;
+ case VM_INTINFO_SWINTR:
+ printf("swint");
+ break;
+ default:
+ printf("exception");
+ break;
+ }
+ printf(" vector %d", (int)VM_INTINFO_VECTOR(info));
+ if (info & VM_INTINFO_DEL_ERRCODE)
+ printf(" errcode %#x", (u_int)(info >> 32));
+ } else {
+ printf("n/a");
+ }
+ printf("\n");
+}
+
int
main(int argc, char *argv[])
{
@@ -420,7 +453,7 @@ main(int argc, char *argv[])
vm_paddr_t gpa, gpa_pmap;
size_t len;
struct vm_exit vmexit;
- uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte;
+ uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte, info[2];
struct vmctx *ctx;
int wired;
cpuset_t cpus;
@@ -595,6 +628,7 @@ main(int argc, char *argv[])
{ "force-poweroff", NO_ARG, &force_poweroff, 1 },
{ "get-active-cpus", NO_ARG, &get_active_cpus, 1 },
{ "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 },
+ { "get-intinfo", NO_ARG, &get_intinfo, 1 },
{ NULL, 0, NULL, 0 }
};
@@ -1566,6 +1600,14 @@ main(int argc, char *argv[])
print_cpus("suspended cpus", &cpus);
}
+ if (!error && (get_intinfo || get_all)) {
+ error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]);
+ if (!error) {
+ print_intinfo("pending", info[0]);
+ print_intinfo("current", info[1]);
+ }
+ }
+
if (!error && run) {
error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
assert(error == 0);
diff --git a/usr.sbin/bhyveload/bhyveload.8 b/usr.sbin/bhyveload/bhyveload.8
index 3a300cc..0bdf151 100644
--- a/usr.sbin/bhyveload/bhyveload.8
+++ b/usr.sbin/bhyveload/bhyveload.8
@@ -35,11 +35,11 @@
guest inside a bhyve virtual machine
.Sh SYNOPSIS
.Nm
-.Op Fl m Ar mem-size
+.Op Fl c Ar cons-dev
.Op Fl d Ar disk-path
-.Op Fl h Ar host-path
.Op Fl e Ar name=value
-.Op Fl c Ar cons-dev
+.Op Fl h Ar host-path
+.Op Fl m Ar mem-size
.Ar vmname
.Sh DESCRIPTION
.Nm
@@ -62,6 +62,32 @@ and will be created if it does not already exist.
.Sh OPTIONS
The following options are available:
.Bl -tag -width indent
+.It Fl c Ar cons-dev
+.Ar cons-dev
+is a
+.Xr tty 4
+device to use for
+.Nm
+terminal I/O.
+.Pp
+The text string "stdio" is also accepted and selects the use of
+unbuffered standard I/O. This is the default value.
+.It Fl d Ar disk-path
+The
+.Ar disk-path
+is the pathname of the guest's boot disk image.
+.It Fl e Ar name=value
+Set the FreeBSD loader environment variable
+.Ar name
+to
+.Ar value .
+.Pp
+The option may be used more than once to set more than one environment
+variable.
+.It Fl h Ar host-path
+The
+.Ar host-path
+is the directory at the top of the guest's boot filesystem.
.It Fl m Ar mem-size Xo
.Sm off
.Op Cm K | k | M | m | G | g | T | t
@@ -85,32 +111,6 @@ respectively.
The default value of
.Ar mem-size
is 256M.
-.It Fl d Ar disk-path
-The
-.Ar disk-path
-is the pathname of the guest's boot disk image.
-.It Fl h Ar host-path
-The
-.Ar host-path
-is the directory at the top of the guest's boot filesystem.
-.It Fl e Ar name=value
-Set the FreeBSD loader environment variable
-.Ar name
-to
-.Ar value .
-.Pp
-The option may be used more than once to set more than one environment
-variable.
-.It Fl c Ar cons-dev
-.Ar cons-dev
-is a
-.Xr tty 4
-device to use for
-.Nm
-terminal I/O.
-.Pp
-The text string "stdio" is also accepted and selects the use of
-unbuffered standard I/O. This is the default value.
.El
.Sh EXAMPLES
To create a virtual machine named
diff --git a/usr.sbin/bhyveload/bhyveload.c b/usr.sbin/bhyveload/bhyveload.c
index ff6b269..eaf71a8 100644
--- a/usr.sbin/bhyveload/bhyveload.c
+++ b/usr.sbin/bhyveload/bhyveload.c
@@ -629,8 +629,8 @@ usage(void)
{
fprintf(stderr,
- "usage: %s [-m mem-size] [-d <disk-path>] [-h <host-path>]\n"
- " %*s [-e <name=value>] [-c <console-device>] <vmname>\n",
+ "usage: %s [-c <console-device>] [-d <disk-path>] [-e <name=value>]\n"
+ " %*s [-h <host-path>] [-m mem-size] <vmname>\n",
progname,
(int)strlen(progname), "");
exit(1);