author     grehan <grehan@FreeBSD.org>  2014-08-19 01:20:24 +0000
committer  grehan <grehan@FreeBSD.org>  2014-08-19 01:20:24 +0000
commit     5d455a50f5226eff9d453943eb0b714689989802 (patch)
tree       6aaf5296bf1aa8632ab9beb5fe4a61727ef7ab68 /sys/amd64
parent     4791bac9b41a259cdb9821e381df8837926220e2 (diff)
MFC r267921, r267934, r267949, r267959, r267966, r268202, r268276,
    r268427, r268428, r268521, r268638, r268639, r268701, r268777,
    r268889, r268922, r269008, r269042, r269043, r269080, r269094,
    r269108, r269109, r269281, r269317, r269700, r269896, r269962,
    r269989.

Catch bhyve up to CURRENT.

Lightly tested with FreeBSD i386/amd64, Linux i386/amd64, and
OpenBSD/amd64. Still resolving an issue with OpenBSD/i386.

Many thanks to jhb@ for all the hard work on the prior MFCs !

r267921 - support the "mov r/m8, imm8" instruction
r267934 - document options
r267949 - set DMI vers/date to fixed values
r267959 - doc: sort cmd flags
r267966 - EPT misconf post-mortem info
r268202 - use correct flag for event index
r268276 - 64-bit virtio capability api
r268427 - invalidate guest TLB when cr3 is updated, needed for TSS
r268428 - identify vcpu's operating mode
r268521 - use correct offset in guest logical-to-linear translation
r268638 - chs value
r268639 - chs fake values
r268701 - instr emul operand/address size override prefix support
r268777 - emulation for legacy x86 task switching
r268889 - nested exception support
r268922 - fix INVARIANTS build
r269008 - emulate instructions found in the OpenBSD/i386 5.5 kernel
r269042 - fix fault injection
r269043 - Reduce VMEXIT_RESTARTs in task_switch.c
r269080 - fix issues in PUSH emulation
r269094 - simplify return values from the inout handlers
r269108 - don't return -1 from the push emulation handler
r269109 - avoid permanent sleep in vm_handle_hlt()
r269281 - list VT-x features in base kernel dmesg
r269317 - Mark AHCI fatal errors as not completed
r269700 - Support PCI extended config space in bhyve
r269896 - Minor cleanup
r269962 - use max guest memory when creating IOMMU domain
r269989 - fix interrupt mode names
Diffstat (limited to 'sys/amd64')
-rw-r--r--  sys/amd64/amd64/identcpu.c                | 199
-rw-r--r--  sys/amd64/include/vmm.h                   | 153
-rw-r--r--  sys/amd64/include/vmm_dev.h               |  12
-rw-r--r--  sys/amd64/include/vmm_instruction_emul.h  |   6
-rw-r--r--  sys/amd64/vmm/intel/vmcs.c                |   8
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h                |   3
-rw-r--r--  sys/amd64/vmm/intel/vmx.c                 | 350
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.c             |   1
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.h             |  23
-rw-r--r--  sys/amd64/vmm/intel/vtd.c                 |   5
-rw-r--r--  sys/amd64/vmm/io/vatpic.c                 |  17
-rw-r--r--  sys/amd64/vmm/vmm.c                       | 402
-rw-r--r--  sys/amd64/vmm/vmm_dev.c                   |  12
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c      | 672
14 files changed, 1529 insertions, 334 deletions
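
As a point of reference for the new VM_GET_INTINFO/VM_SET_INTINFO interface introduced in this change (see the vmm_dev.h and vmm_dev.c hunks below), the following is a minimal, hypothetical userland sketch that is not part of the commit; it shows how the pending event state of vcpu 0 could be read through the new ioctl. The VM name "testvm" and the direct open()/ioctl() usage are illustrative assumptions, and a real consumer would typically go through libvmmapi instead.

/*
 * Hypothetical example (not part of this commit): query the exit/entry
 * interrupt info of vcpu 0 using the VM_GET_INTINFO ioctl added below.
 * Assumes a VM named "testvm" already exists under /dev/vmm/.
 */
#include <sys/types.h>
#include <sys/ioctl.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	struct vm_intinfo ii;
	int fd;

	fd = open("/dev/vmm/testvm", O_RDWR);
	if (fd < 0) {
		perror("open");
		return (1);
	}

	ii.vcpuid = 0;	/* vmmdev_ioctl() expects the vcpu id first */
	if (ioctl(fd, VM_GET_INTINFO, &ii) < 0) {
		perror("VM_GET_INTINFO");
		return (1);
	}

	/* info1 is the exitintinfo, info2 any pending exception (amd64). */
	printf("exitintinfo %#lx, pending exception %#lx\n",
	    ii.info1, ii.info2);
	return (0);
}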
diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c
index 957120c..74be82c 100644
--- a/sys/amd64/amd64/identcpu.c
+++ b/sys/amd64/amd64/identcpu.c
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <machine/specialreg.h>
#include <machine/md_var.h>
+#include <amd64/vmm/intel/vmx_controls.h>
#include <x86/isa/icu.h>
/* XXX - should be in header file: */
@@ -73,6 +74,7 @@ static u_int find_cpu_vendor_id(void);
static void print_AMD_info(void);
static void print_AMD_assoc(int i);
static void print_via_padlock_info(void);
+static void print_vmx_info(void);
int cpu_class;
char machine[] = "amd64";
@@ -428,6 +430,9 @@ printcpuinfo(void)
if (via_feature_rng != 0 || via_feature_xcrypt != 0)
print_via_padlock_info();
+ if (cpu_feature2 & CPUID2_VMX)
+ print_vmx_info();
+
if ((cpu_feature & CPUID_HTT) &&
cpu_vendor_id == CPU_VENDOR_AMD)
cpu_feature &= ~CPUID_HTT;
@@ -722,3 +727,197 @@ print_via_padlock_info(void)
"\015RSA" /* PMM */
);
}
+
+static uint32_t
+vmx_settable(uint64_t basic, int msr, int true_msr)
+{
+ uint64_t val;
+
+ if (basic & (1UL << 55))
+ val = rdmsr(true_msr);
+ else
+ val = rdmsr(msr);
+
+ /* Just report the controls that can be set to 1. */
+ return (val >> 32);
+}
+
+static void
+print_vmx_info(void)
+{
+ uint64_t basic, msr;
+ uint32_t entry, exit, mask, pin, proc, proc2;
+ int comma;
+
+ printf("\n VT-x: ");
+ msr = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if (!(msr & IA32_FEATURE_CONTROL_VMX_EN))
+ printf("(disabled in BIOS) ");
+ basic = rdmsr(MSR_VMX_BASIC);
+ pin = vmx_settable(basic, MSR_VMX_PINBASED_CTLS,
+ MSR_VMX_TRUE_PINBASED_CTLS);
+ proc = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS);
+ if (proc & PROCBASED_SECONDARY_CONTROLS)
+ proc2 = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2);
+ else
+ proc2 = 0;
+ exit = vmx_settable(basic, MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS);
+ entry = vmx_settable(basic, MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS);
+
+ if (!bootverbose) {
+ comma = 0;
+ if (exit & VM_EXIT_SAVE_PAT && exit & VM_EXIT_LOAD_PAT &&
+ entry & VM_ENTRY_LOAD_PAT) {
+ printf("%sPAT", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc & PROCBASED_HLT_EXITING) {
+ printf("%sHLT", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc & PROCBASED_MTF) {
+ printf("%sMTF", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc & PROCBASED_PAUSE_EXITING) {
+ printf("%sPAUSE", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc2 & PROCBASED2_ENABLE_EPT) {
+ printf("%sEPT", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc2 & PROCBASED2_UNRESTRICTED_GUEST) {
+ printf("%sUG", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc2 & PROCBASED2_ENABLE_VPID) {
+ printf("%sVPID", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc & PROCBASED_USE_TPR_SHADOW &&
+ proc2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES &&
+ proc2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE &&
+ proc2 & PROCBASED2_APIC_REGISTER_VIRTUALIZATION &&
+ proc2 & PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY) {
+ printf("%sVID", comma ? "," : "");
+ comma = 1;
+ if (pin & PINBASED_POSTED_INTERRUPT)
+ printf(",PostIntr");
+ }
+ return;
+ }
+
+ mask = basic >> 32;
+ printf("Basic Features=0x%b", mask,
+ "\020"
+ "\02132PA" /* 32-bit physical addresses */
+ "\022SMM" /* SMM dual-monitor */
+ "\027INS/OUTS" /* VM-exit info for INS and OUTS */
+ "\030TRUE" /* TRUE_CTLS MSRs */
+ );
+ printf("\n Pin-Based Controls=0x%b", pin,
+ "\020"
+ "\001ExtINT" /* External-interrupt exiting */
+ "\004NMI" /* NMI exiting */
+ "\006VNMI" /* Virtual NMIs */
+ "\007PreTmr" /* Activate VMX-preemption timer */
+ "\010PostIntr" /* Process posted interrupts */
+ );
+ printf("\n Primary Processor Controls=0x%b", proc,
+ "\020"
+ "\003INTWIN" /* Interrupt-window exiting */
+ "\004TSCOff" /* Use TSC offsetting */
+ "\010HLT" /* HLT exiting */
+ "\012INVLPG" /* INVLPG exiting */
+ "\013MWAIT" /* MWAIT exiting */
+ "\014RDPMC" /* RDPMC exiting */
+ "\015RDTSC" /* RDTSC exiting */
+ "\020CR3-LD" /* CR3-load exiting */
+ "\021CR3-ST" /* CR3-store exiting */
+ "\024CR8-LD" /* CR8-load exiting */
+ "\025CR8-ST" /* CR8-store exiting */
+ "\026TPR" /* Use TPR shadow */
+ "\027NMIWIN" /* NMI-window exiting */
+ "\030MOV-DR" /* MOV-DR exiting */
+ "\031IO" /* Unconditional I/O exiting */
+ "\032IOmap" /* Use I/O bitmaps */
+ "\034MTF" /* Monitor trap flag */
+ "\035MSRmap" /* Use MSR bitmaps */
+ "\036MONITOR" /* MONITOR exiting */
+ "\037PAUSE" /* PAUSE exiting */
+ );
+ if (proc & PROCBASED_SECONDARY_CONTROLS)
+ printf("\n Secondary Processor Controls=0x%b", proc2,
+ "\020"
+ "\001APIC" /* Virtualize APIC accesses */
+ "\002EPT" /* Enable EPT */
+ "\003DT" /* Descriptor-table exiting */
+ "\004RDTSCP" /* Enable RDTSCP */
+ "\005x2APIC" /* Virtualize x2APIC mode */
+ "\006VPID" /* Enable VPID */
+ "\007WBINVD" /* WBINVD exiting */
+ "\010UG" /* Unrestricted guest */
+ "\011APIC-reg" /* APIC-register virtualization */
+ "\012VID" /* Virtual-interrupt delivery */
+ "\013PAUSE-loop" /* PAUSE-loop exiting */
+ "\014RDRAND" /* RDRAND exiting */
+ "\015INVPCID" /* Enable INVPCID */
+ "\016VMFUNC" /* Enable VM functions */
+ "\017VMCS" /* VMCS shadowing */
+ "\020EPT#VE" /* EPT-violation #VE */
+ "\021XSAVES" /* Enable XSAVES/XRSTORS */
+ );
+ printf("\n Exit Controls=0x%b", mask,
+ "\020"
+ "\003DR" /* Save debug controls */
+ /* Ignore Host address-space size */
+ "\015PERF" /* Load MSR_PERF_GLOBAL_CTRL */
+ "\020AckInt" /* Acknowledge interrupt on exit */
+ "\023PAT-SV" /* Save MSR_PAT */
+ "\024PAT-LD" /* Load MSR_PAT */
+ "\025EFER-SV" /* Save MSR_EFER */
+ "\026EFER-LD" /* Load MSR_EFER */
+ "\027PTMR-SV" /* Save VMX-preemption timer value */
+ );
+ printf("\n Entry Controls=0x%b", mask,
+ "\020"
+ "\003DR" /* Save debug controls */
+ /* Ignore IA-32e mode guest */
+ /* Ignore Entry to SMM */
+ /* Ignore Deactivate dual-monitor treatment */
+ "\016PERF" /* Load MSR_PERF_GLOBAL_CTRL */
+ "\017PAT" /* Load MSR_PAT */
+ "\020EFER" /* Load MSR_EFER */
+ );
+ if (proc & PROCBASED_SECONDARY_CONTROLS &&
+ (proc2 & (PROCBASED2_ENABLE_EPT | PROCBASED2_ENABLE_VPID)) != 0) {
+ msr = rdmsr(MSR_VMX_EPT_VPID_CAP);
+ mask = msr;
+ printf("\n EPT Features=0x%b", mask,
+ "\020"
+ "\001XO" /* Execute-only translations */
+ "\007PW4" /* Page-walk length of 4 */
+ "\011UC" /* EPT paging-structure mem can be UC */
+ "\017WB" /* EPT paging-structure mem can be WB */
+ "\0212M" /* EPT PDE can map a 2-Mbyte page */
+ "\0221G" /* EPT PDPTE can map a 1-Gbyte page */
+ "\025INVEPT" /* INVEPT is supported */
+ "\026AD" /* Accessed and dirty flags for EPT */
+ "\032single" /* INVEPT single-context type */
+ "\033all" /* INVEPT all-context type */
+ );
+ mask = msr >> 32;
+ printf("\n VPID Features=0x%b", mask,
+ "\020"
+ "\001INVVPID" /* INVVPID is supported */
+ "\011individual" /* INVVPID individual-address type */
+ "\012single" /* INVVPID single-context type */
+ "\013all" /* INVVPID all-context type */
+ /* INVVPID single-context-retaining-globals type */
+ "\014single-globals"
+ );
+ }
+}
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 5a359e9..63a9b3f 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -29,11 +29,14 @@
#ifndef _VMM_H_
#define _VMM_H_
+#include <x86/segments.h>
+
enum vm_suspend_how {
VM_SUSPEND_NONE,
VM_SUSPEND_RESET,
VM_SUSPEND_POWEROFF,
VM_SUSPEND_HALT,
+ VM_SUSPEND_TRIPLEFAULT,
VM_SUSPEND_LAST
};
@@ -75,6 +78,10 @@ enum vm_reg_name {
VM_REG_GUEST_GDTR,
VM_REG_GUEST_EFER,
VM_REG_GUEST_CR2,
+ VM_REG_GUEST_PDPTE0,
+ VM_REG_GUEST_PDPTE1,
+ VM_REG_GUEST_PDPTE2,
+ VM_REG_GUEST_PDPTE3,
VM_REG_LAST
};
@@ -84,6 +91,16 @@ enum x2apic_state {
X2APIC_STATE_LAST
};
+#define VM_INTINFO_VECTOR(info) ((info) & 0xff)
+#define VM_INTINFO_DEL_ERRCODE 0x800
+#define VM_INTINFO_RSVD 0x7ffff000
+#define VM_INTINFO_VALID 0x80000000
+#define VM_INTINFO_TYPE 0x700
+#define VM_INTINFO_HWINTR (0 << 8)
+#define VM_INTINFO_NMI (2 << 8)
+#define VM_INTINFO_HWEXCEPTION (3 << 8)
+#define VM_INTINFO_SWINTR (4 << 8)
+
#ifdef _KERNEL
#define VM_MAX_NAMELEN 32
@@ -99,6 +116,7 @@ struct vioapic;
struct vlapic;
struct vmspace;
struct vm_object;
+struct vm_guest_paging;
struct pmap;
typedef int (*vmm_init_func_t)(int ipinum);
@@ -252,6 +270,14 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
}
+#ifdef _SYS_PROC_H_
+static int __inline
+vcpu_should_yield(struct vm *vm, int vcpu)
+{
+ return (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED));
+}
+#endif
+
void *vcpu_stats(struct vm *vm, int vcpu);
void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
struct vmspace *vm_get_vmspace(struct vm *vm);
@@ -274,21 +300,63 @@ struct vatpit *vm_atpit(struct vm *vm);
int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);
/*
- * Returns 0 if there is no exception pending for this vcpu. Returns 1 if an
- * exception is pending and also updates 'vme'. The pending exception is
- * cleared when this function returns.
+ * This function is called after a VM-exit that occurred during exception or
+ * interrupt delivery through the IDT. The format of 'intinfo' is described
+ * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.
*
- * This function should only be called in the context of the thread that is
- * executing this vcpu.
+ * If a VM-exit handler completes the event delivery successfully then it
+ * should call vm_exit_intinfo() to extinguish the pending event. For e.g.,
+ * if the task switch emulation is triggered via a task gate then it should
+ * call this function with 'intinfo=0' to indicate that the external event
+ * is not pending anymore.
+ *
+ * Return value is 0 on success and non-zero on failure.
*/
-int vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *vme);
+int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo);
-void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */
-void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */
-void vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2);
+/*
+ * This function is called before every VM-entry to retrieve a pending
+ * event that should be injected into the guest. This function combines
+ * nested events into a double or triple fault.
+ *
+ * Returns 0 if there are no events that need to be injected into the guest
+ * and non-zero otherwise.
+ */
+int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
+
+int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
enum vm_reg_name vm_segment_name(int seg_encoding);
+struct vm_copyinfo {
+ uint64_t gpa;
+ size_t len;
+ void *hva;
+ void *cookie;
+};
+
+/*
+ * Set up 'copyinfo[]' to copy to/from guest linear address space starting
+ * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
+ * a copyin or PROT_WRITE for a copyout.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ *
+ * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
+ * the return value is 0. The 'copyinfo[]' resources should be freed by calling
+ * 'vm_copy_teardown()' after the copy is done.
+ */
+int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
+ int num_copyinfo);
+void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ int num_copyinfo);
+void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ void *kaddr, size_t len);
+void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
+ struct vm_copyinfo *copyinfo, size_t len);
#endif /* KERNEL */
#define VM_MAXCPU 16 /* maximum virtual cpus */
@@ -322,13 +390,16 @@ struct seg_desc {
uint32_t limit;
uint32_t access;
};
-#define SEG_DESC_TYPE(desc) ((desc)->access & 0x001f)
-#define SEG_DESC_PRESENT(desc) ((desc)->access & 0x0080)
-#define SEG_DESC_DEF32(desc) ((desc)->access & 0x4000)
-#define SEG_DESC_GRANULARITY(desc) ((desc)->access & 0x8000)
-#define SEG_DESC_UNUSABLE(desc) ((desc)->access & 0x10000)
+#define SEG_DESC_TYPE(access) ((access) & 0x001f)
+#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3)
+#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0)
+#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0)
+#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0)
+#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0)
enum vm_cpu_mode {
+ CPU_MODE_REAL,
+ CPU_MODE_PROTECTED,
CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */
CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */
};
@@ -364,11 +435,14 @@ struct vie {
uint8_t num_valid; /* size of the instruction */
uint8_t num_processed;
+ uint8_t addrsize:4, opsize:4; /* address and operand sizes */
uint8_t rex_w:1, /* REX prefix */
rex_r:1,
rex_x:1,
rex_b:1,
- rex_present:1;
+ rex_present:1,
+ opsize_override:1, /* Operand size override */
+ addrsize_override:1; /* Address size override */
uint8_t mod:2, /* ModRM byte */
reg:4,
@@ -410,6 +484,7 @@ enum vm_exitcode {
VM_EXITCODE_IOAPIC_EOI,
VM_EXITCODE_SUSPENDED,
VM_EXITCODE_INOUT_STR,
+ VM_EXITCODE_TASK_SWITCH,
VM_EXITCODE_MAX
};
@@ -434,6 +509,22 @@ struct vm_inout_str {
struct seg_desc seg_desc;
};
+enum task_switch_reason {
+ TSR_CALL,
+ TSR_IRET,
+ TSR_JMP,
+ TSR_IDT_GATE, /* task gate in IDT */
+};
+
+struct vm_task_switch {
+ uint16_t tsssel; /* new TSS selector */
+ int ext; /* task switch due to external event */
+ uint32_t errcode;
+ int errcode_valid; /* push 'errcode' on the new stack */
+ enum task_switch_reason reason;
+ struct vm_guest_paging paging;
+};
+
struct vm_exit {
enum vm_exitcode exitcode;
int inst_length; /* 0 means unknown */
@@ -448,6 +539,7 @@ struct vm_exit {
struct {
uint64_t gpa;
uint64_t gla;
+ int cs_d; /* CS.D */
struct vm_guest_paging paging;
struct vie vie;
} inst_emul;
@@ -487,7 +579,38 @@ struct vm_exit {
struct {
enum vm_suspend_how how;
} suspended;
+ struct vm_task_switch task_switch;
} u;
};
+/* APIs to inject faults into the guest */
+void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid,
+ int errcode);
+
+static void __inline
+vm_inject_ud(void *vm, int vcpuid)
+{
+ vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
+}
+
+static void __inline
+vm_inject_gp(void *vm, int vcpuid)
+{
+ vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
+}
+
+static void __inline
+vm_inject_ac(void *vm, int vcpuid, int errcode)
+{
+ vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
+}
+
+static void __inline
+vm_inject_ss(void *vm, int vcpuid, int errcode)
+{
+ vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
+}
+
+void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
+
#endif /* _VMM_H_ */
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index 9b3b00d..e4d839e 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -189,6 +189,12 @@ struct vm_cpuset {
#define VM_ACTIVE_CPUS 0
#define VM_SUSPENDED_CPUS 1
+struct vm_intinfo {
+ int vcpuid;
+ uint64_t info1;
+ uint64_t info2;
+};
+
enum {
/* general routines */
IOCNUM_ABIVERS = 0,
@@ -211,6 +217,8 @@ enum {
IOCNUM_GET_SEGMENT_DESCRIPTOR = 23,
/* interrupt injection */
+ IOCNUM_GET_INTINFO = 28,
+ IOCNUM_SET_INTINFO = 29,
IOCNUM_INJECT_EXCEPTION = 30,
IOCNUM_LAPIC_IRQ = 31,
IOCNUM_INJECT_NMI = 32,
@@ -324,4 +332,8 @@ enum {
_IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu)
#define VM_GET_CPUS \
_IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset)
+#define VM_SET_INTINFO \
+ _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo)
+#define VM_GET_INTINFO \
+ _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo)
#endif
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
index e4c408b..bbd3d88 100644
--- a/sys/amd64/include/vmm_instruction_emul.h
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -52,8 +52,8 @@ typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
* s
*/
int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
- mem_region_read_t mrr, mem_region_write_t mrw,
- void *mrarg);
+ struct vm_guest_paging *paging, mem_region_read_t mrr,
+ mem_region_write_t mrw, void *mrarg);
int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
uint64_t val, int size);
@@ -108,7 +108,7 @@ void vie_init(struct vie *vie);
*/
#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */
int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
- enum vm_cpu_mode cpu_mode, struct vie *vie);
+ enum vm_cpu_mode cpu_mode, int csd, struct vie *vie);
#endif /* _KERNEL */
#endif /* _VMM_INSTRUCTION_EMUL_H_ */
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
index cc97d95..51e5c2c 100644
--- a/sys/amd64/vmm/intel/vmcs.c
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -103,6 +103,14 @@ vmcs_field_encoding(int ident)
return (VMCS_GUEST_LDTR_SELECTOR);
case VM_REG_GUEST_EFER:
return (VMCS_GUEST_IA32_EFER);
+ case VM_REG_GUEST_PDPTE0:
+ return (VMCS_GUEST_PDPTE0);
+ case VM_REG_GUEST_PDPTE1:
+ return (VMCS_GUEST_PDPTE1);
+ case VM_REG_GUEST_PDPTE2:
+ return (VMCS_GUEST_PDPTE2);
+ case VM_REG_GUEST_PDPTE3:
+ return (VMCS_GUEST_PDPTE3);
default:
return (-1);
}
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index 657d5b0..4e9557c 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -346,6 +346,9 @@ vmcs_write(uint32_t encoding, uint64_t val)
#define VMCS_INTR_T_HWINTR (0 << 8)
#define VMCS_INTR_T_NMI (2 << 8)
#define VMCS_INTR_T_HWEXCEPTION (3 << 8)
+#define VMCS_INTR_T_SWINTR (4 << 8)
+#define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8)
+#define VMCS_INTR_T_SWEXCEPTION (6 << 8)
#define VMCS_INTR_DEL_ERRCODE (1 << 11)
/*
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 2cbb159..b2c5702 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -149,8 +149,6 @@ SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
&cr4_zeros_mask, 0, NULL);
-static int vmx_no_patmsr;
-
static int vmx_initialized;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
&vmx_initialized, 0, "Intel VMX initialized");
@@ -158,18 +156,38 @@ SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
/*
* Optional capabilities
*/
+static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
+
+static int vmx_patmsr;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, patmsr, CTLFLAG_RD, &vmx_patmsr, 0,
+ "PAT MSR saved and restored in VCMS");
+
static int cap_halt_exit;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
+ "HLT triggers a VM-exit");
+
static int cap_pause_exit;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
+ 0, "PAUSE triggers a VM-exit");
+
static int cap_unrestricted_guest;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
+ &cap_unrestricted_guest, 0, "Unrestricted guests");
+
static int cap_monitor_trap;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
+ &cap_monitor_trap, 0, "Monitor trap flag");
+
static int cap_invpcid;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
+ 0, "Guests are allowed to use INVPCID");
static int virtual_interrupt_delivery;
-SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
&virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
static int posted_interrupts;
-SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
&posted_interrupts, 0, "APICv posted interrupt support");
static int pirvec;
@@ -618,6 +636,7 @@ vmx_init(int ipinum)
}
/* Check support for VM-exit controls */
+ vmx_patmsr = 1;
error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
VM_EXIT_CTLS_ONE_SETTING,
VM_EXIT_CTLS_ZERO_SETTING,
@@ -637,12 +656,12 @@ vmx_init(int ipinum)
if (bootverbose)
printf("vmm: PAT MSR access not supported\n");
guest_msr_valid(MSR_PAT);
- vmx_no_patmsr = 1;
+ vmx_patmsr = 0;
}
}
/* Check support for VM-entry controls */
- if (!vmx_no_patmsr) {
+ if (vmx_patmsr) {
error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
MSR_VMX_TRUE_ENTRY_CTLS,
VM_ENTRY_CTLS_ONE_SETTING,
@@ -918,7 +937,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
* MSR_PAT save/restore support, leave access disabled so accesses
* will be trapped.
*/
- if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
+ if (vmx_patmsr && guest_msr_rw(vmx, MSR_PAT))
panic("vmx_vminit: error setting guest pat msr access");
vpid_alloc(vpid, VM_MAXCPU);
@@ -974,7 +993,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
vmx->cap[i].proc_ctls = procbased_ctls;
vmx->cap[i].proc_ctls2 = procbased_ctls2;
- vmx->state[i].lastcpu = -1;
+ vmx->state[i].lastcpu = NOCPU;
vmx->state[i].vpid = vpid[i];
msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
@@ -1047,27 +1066,37 @@ vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
}
static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
+static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
-static void
-vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
+/*
+ * Invalidate guest mappings identified by its vpid from the TLB.
+ */
+static __inline void
+vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
{
struct vmxstate *vmxstate;
struct invvpid_desc invvpid_desc;
vmxstate = &vmx->state[vcpu];
- if (vmxstate->lastcpu == curcpu)
+ if (vmxstate->vpid == 0)
return;
- vmxstate->lastcpu = curcpu;
-
- vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+ if (!running) {
+ /*
+ * Set the 'lastcpu' to an invalid host cpu.
+ *
+ * This will invalidate TLB entries tagged with the vcpu's
+ * vpid the next time it runs via vmx_set_pcpu_defaults().
+ */
+ vmxstate->lastcpu = NOCPU;
+ return;
+ }
- vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
- vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
- vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+ KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
+ "critical section", __func__, vcpu));
/*
- * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
+ * Invalidate all mappings tagged with 'vpid'
*
* We do this because this vcpu was executing on a different host
* cpu when it last ran. We do not track whether it invalidated
@@ -1081,25 +1110,43 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
* Note also that this will invalidate mappings tagged with 'vpid'
* for "all" EP4TAs.
*/
- if (vmxstate->vpid != 0) {
- if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
- invvpid_desc._res1 = 0;
- invvpid_desc._res2 = 0;
- invvpid_desc.vpid = vmxstate->vpid;
- invvpid_desc.linear_addr = 0;
- invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
- } else {
- /*
- * The invvpid can be skipped if an invept is going to
- * be performed before entering the guest. The invept
- * will invalidate combined mappings tagged with
- * 'vmx->eptp' for all vpids.
- */
- vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
- }
+ if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
+ invvpid_desc._res1 = 0;
+ invvpid_desc._res2 = 0;
+ invvpid_desc.vpid = vmxstate->vpid;
+ invvpid_desc.linear_addr = 0;
+ invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
+ } else {
+ /*
+ * The invvpid can be skipped if an invept is going to
+ * be performed before entering the guest. The invept
+ * will invalidate combined mappings tagged with
+ * 'vmx->eptp' for all vpids.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
}
}
+static void
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
+{
+ struct vmxstate *vmxstate;
+
+ vmxstate = &vmx->state[vcpu];
+ if (vmxstate->lastcpu == curcpu)
+ return;
+
+ vmxstate->lastcpu = curcpu;
+
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+
+ vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
+ vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
+ vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+ vmx_invvpid(vmx, vcpu, pmap, 1);
+}
+
/*
* We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
*/
@@ -1183,24 +1230,32 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu)
static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
{
- struct vm_exception exc;
int vector, need_nmi_exiting, extint_pending;
- uint64_t rflags;
+ uint64_t rflags, entryinfo;
uint32_t gi, info;
- if (vm_exception_pending(vmx->vm, vcpu, &exc)) {
- KASSERT(exc.vector >= 0 && exc.vector < 32,
- ("%s: invalid exception vector %d", __func__, exc.vector));
+ if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
+ KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
+ "intinfo is not valid: %#lx", __func__, entryinfo));
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
- "pending exception %d: %#x", __func__, exc.vector, info));
+ "pending exception: %#lx/%#x", __func__, entryinfo, info));
- info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID;
- if (exc.error_code_valid) {
- info |= VMCS_INTR_DEL_ERRCODE;
- vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code);
+ info = entryinfo;
+ vector = info & 0xff;
+ if (vector == IDT_BP || vector == IDT_OF) {
+ /*
+ * VT-x requires #BP and #OF to be injected as software
+ * exceptions.
+ */
+ info &= ~VMCS_INTR_T_MASK;
+ info |= VMCS_INTR_T_SWEXCEPTION;
}
+
+ if (info & VMCS_INTR_DEL_ERRCODE)
+ vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
+
vmcs_write(VMCS_ENTRY_INTR_INFO, info);
}
@@ -1379,6 +1434,16 @@ vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
}
+static void
+vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
+{
+ uint32_t gi;
+
+ gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
+ KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
+ ("NMI blocking is not in effect %#x", gi));
+}
+
static int
vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
@@ -1659,11 +1724,19 @@ vmx_cpl(void)
static enum vm_cpu_mode
vmx_cpu_mode(void)
{
+ uint32_t csar;
- if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA)
- return (CPU_MODE_64BIT);
- else
- return (CPU_MODE_COMPATIBILITY);
+ if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
+ csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
+ if (csar & 0x2000)
+ return (CPU_MODE_64BIT); /* CS.L = 1 */
+ else
+ return (CPU_MODE_COMPATIBILITY);
+ } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
+ return (CPU_MODE_PROTECTED);
+ } else {
+ return (CPU_MODE_REAL);
+ }
}
static enum vm_paging_mode
@@ -1757,10 +1830,25 @@ vmx_paging_info(struct vm_guest_paging *paging)
static void
vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
{
+ struct vm_guest_paging *paging;
+ uint32_t csar;
+
+ paging = &vmexit->u.inst_emul.paging;
+
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
vmexit->u.inst_emul.gpa = gpa;
vmexit->u.inst_emul.gla = gla;
- vmx_paging_info(&vmexit->u.inst_emul.paging);
+ vmx_paging_info(paging);
+ switch (paging->cpu_mode) {
+ case CPU_MODE_PROTECTED:
+ case CPU_MODE_COMPATIBILITY:
+ csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
+ vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
+ break;
+ default:
+ vmexit->u.inst_emul.cs_d = 0;
+ break;
+ }
}
static int
@@ -1969,6 +2057,26 @@ vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
return (UNHANDLED);
}
+static enum task_switch_reason
+vmx_task_switch_reason(uint64_t qual)
+{
+ int reason;
+
+ reason = (qual >> 30) & 0x3;
+ switch (reason) {
+ case 0:
+ return (TSR_CALL);
+ case 1:
+ return (TSR_IRET);
+ case 2:
+ return (TSR_JMP);
+ case 3:
+ return (TSR_IDT_GATE);
+ default:
+ panic("%s: invalid reason %d", __func__, reason);
+ }
+}
+
static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
@@ -1976,9 +2084,10 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
struct vmxctx *vmxctx;
struct vlapic *vlapic;
struct vm_inout_str *vis;
+ struct vm_task_switch *ts;
uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
- uint32_t reason;
- uint64_t qual, gpa;
+ uint32_t intr_type, reason;
+ uint64_t exitintinfo, qual, gpa;
bool retu;
CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
@@ -1994,46 +2103,99 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
/*
- * VM exits that could be triggered during event injection on the
- * previous VM entry need to be handled specially by re-injecting
- * the event.
+ * VM exits that can be triggered during event delivery need to
+ * be handled specially by re-injecting the event if the IDT
+ * vectoring information field's valid bit is set.
*
* See "Information for VM Exits During Event Delivery" in Intel SDM
* for details.
*/
- switch (reason) {
- case EXIT_REASON_EPT_FAULT:
- case EXIT_REASON_EPT_MISCONFIG:
- case EXIT_REASON_APIC_ACCESS:
- case EXIT_REASON_TASK_SWITCH:
- case EXIT_REASON_EXCEPTION:
- idtvec_info = vmcs_idt_vectoring_info();
- if (idtvec_info & VMCS_IDT_VEC_VALID) {
- idtvec_info &= ~(1 << 12); /* clear undefined bit */
- vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
- if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
- idtvec_err = vmcs_idt_vectoring_err();
- vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
- idtvec_err);
- }
- /*
- * If 'virtual NMIs' are being used and the VM-exit
- * happened while injecting an NMI during the previous
- * VM-entry, then clear "blocking by NMI" in the Guest
- * Interruptibility-state.
- */
- if ((idtvec_info & VMCS_INTR_T_MASK) ==
- VMCS_INTR_T_NMI) {
- vmx_clear_nmi_blocking(vmx, vcpu);
- }
+ idtvec_info = vmcs_idt_vectoring_info();
+ if (idtvec_info & VMCS_IDT_VEC_VALID) {
+ idtvec_info &= ~(1 << 12); /* clear undefined bit */
+ exitintinfo = idtvec_info;
+ if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
+ idtvec_err = vmcs_idt_vectoring_err();
+ exitintinfo |= (uint64_t)idtvec_err << 32;
+ }
+ error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
+ KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
+ __func__, error));
+
+ /*
+ * If 'virtual NMIs' are being used and the VM-exit
+ * happened while injecting an NMI during the previous
+ * VM-entry, then clear "blocking by NMI" in the
+ * Guest Interruptibility-State so the NMI can be
+ * reinjected on the subsequent VM-entry.
+ *
+ * However, if the NMI was being delivered through a task
+ * gate, then the new task must start execution with NMIs
+ * blocked so don't clear NMI blocking in this case.
+ */
+ intr_type = idtvec_info & VMCS_INTR_T_MASK;
+ if (intr_type == VMCS_INTR_T_NMI) {
+ if (reason != EXIT_REASON_TASK_SWITCH)
+ vmx_clear_nmi_blocking(vmx, vcpu);
+ else
+ vmx_assert_nmi_blocking(vmx, vcpu);
+ }
+
+ /*
+ * Update VM-entry instruction length if the event being
+ * delivered was a software interrupt or software exception.
+ */
+ if (intr_type == VMCS_INTR_T_SWINTR ||
+ intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
+ intr_type == VMCS_INTR_T_SWEXCEPTION) {
vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
}
- default:
- idtvec_info = 0;
- break;
}
switch (reason) {
+ case EXIT_REASON_TASK_SWITCH:
+ ts = &vmexit->u.task_switch;
+ ts->tsssel = qual & 0xffff;
+ ts->reason = vmx_task_switch_reason(qual);
+ ts->ext = 0;
+ ts->errcode_valid = 0;
+ vmx_paging_info(&ts->paging);
+ /*
+ * If the task switch was due to a CALL, JMP, IRET, software
+ * interrupt (INT n) or software exception (INT3, INTO),
+ * then the saved %rip references the instruction that caused
+ * the task switch. The instruction length field in the VMCS
+ * is valid in this case.
+ *
+ * In all other cases (e.g., NMI, hardware exception) the
+ * saved %rip is one that would have been saved in the old TSS
+ * had the task switch completed normally so the instruction
+ * length field is not needed in this case and is explicitly
+ * set to 0.
+ */
+ if (ts->reason == TSR_IDT_GATE) {
+ KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
+ ("invalid idtvec_info %#x for IDT task switch",
+ idtvec_info));
+ intr_type = idtvec_info & VMCS_INTR_T_MASK;
+ if (intr_type != VMCS_INTR_T_SWINTR &&
+ intr_type != VMCS_INTR_T_SWEXCEPTION &&
+ intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
+ /* Task switch triggered by external event */
+ ts->ext = 1;
+ vmexit->inst_length = 0;
+ if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
+ ts->errcode_valid = 1;
+ ts->errcode = vmcs_idt_vectoring_err();
+ }
+ }
+ }
+ vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
+ VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
+ "%s errcode 0x%016lx", ts->reason, ts->tsssel,
+ ts->ext ? "external" : "internal",
+ ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
+ break;
case EXIT_REASON_CR_ACCESS:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
switch (qual & 0xf) {
@@ -2179,6 +2341,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
* the guest.
*
* See "Resuming Guest Software after Handling an Exception".
+ * See "Information for VM Exits Due to Vectored Events".
*/
if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
(intr_info & 0xff) != IDT_DF &&
@@ -2396,6 +2559,13 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
* pmap_invalidate_ept().
*/
disable_intr();
+ vmx_inject_interrupts(vmx, vcpu, vlapic);
+
+ /*
+ * Check for vcpu suspension after injecting events because
+ * vmx_inject_interrupts() can suspend the vcpu due to a
+ * triple fault.
+ */
if (vcpu_suspended(suspend_cookie)) {
enable_intr();
vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip());
@@ -2408,7 +2578,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
break;
}
- if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
+ if (vcpu_should_yield(vm, vcpu)) {
enable_intr();
vm_exit_astpending(vmx->vm, vcpu, vmcs_guest_rip());
vmx_astpending_trace(vmx, vcpu, vmexit->rip);
@@ -2416,7 +2586,6 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
break;
}
- vmx_inject_interrupts(vmx, vcpu, vlapic);
vmx_run_trace(vmx, vcpu);
rc = vmx_enter_guest(vmxctx, vmx, launched);
@@ -2584,6 +2753,7 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
int error, hostcpu, running, shadow;
uint64_t ctls;
+ pmap_t pmap;
struct vmx *vmx = arg;
running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
@@ -2621,6 +2791,18 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
error = vmcs_setreg(&vmx->vmcs[vcpu], running,
VMCS_IDENT(shadow), val);
}
+
+ if (reg == VM_REG_GUEST_CR3) {
+ /*
+ * Invalidate the guest vcpu's TLB mappings to emulate
+ * the behavior of updating %cr3.
+ *
+ * XXX the processor retains global mappings when %cr3
+ * is updated but vmx_invvpid() does not.
+ */
+ pmap = vmx->ctx[vcpu].pmap;
+ vmx_invvpid(vmx, vcpu, pmap, running);
+ }
}
return (error);
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
index 2aba63c..a3428db 100644
--- a/sys/amd64/vmm/intel/vmx_msr.c
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
#include "vmx_msr.h"
diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h
index e6379a9..340b0f7 100644
--- a/sys/amd64/vmm/intel/vmx_msr.h
+++ b/sys/amd64/vmm/intel/vmx_msr.h
@@ -29,29 +29,6 @@
#ifndef _VMX_MSR_H_
#define _VMX_MSR_H_
-#define MSR_VMX_BASIC 0x480
-#define MSR_VMX_EPT_VPID_CAP 0x48C
-
-#define MSR_VMX_PROCBASED_CTLS 0x482
-#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E
-
-#define MSR_VMX_PINBASED_CTLS 0x481
-#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D
-
-#define MSR_VMX_PROCBASED_CTLS2 0x48B
-
-#define MSR_VMX_EXIT_CTLS 0x483
-#define MSR_VMX_TRUE_EXIT_CTLS 0x48f
-
-#define MSR_VMX_ENTRY_CTLS 0x484
-#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
-
-#define MSR_VMX_CR0_FIXED0 0x486
-#define MSR_VMX_CR0_FIXED1 0x487
-
-#define MSR_VMX_CR4_FIXED0 0x488
-#define MSR_VMX_CR4_FIXED1 0x489
-
uint32_t vmx_revision(void);
int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c
index ca76ea8..f5ef71b 100644
--- a/sys/amd64/vmm/intel/vtd.c
+++ b/sys/amd64/vmm/intel/vtd.c
@@ -452,6 +452,11 @@ vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
ptpindex = 0;
ptpshift = 0;
+ KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__,
+ gpa, len));
+ KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond "
+ "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr));
+
if (gpa & PAGE_MASK)
panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c
index ee6fc84..38fc458 100644
--- a/sys/amd64/vmm/io/vatpic.c
+++ b/sys/amd64/vmm/io/vatpic.c
@@ -195,26 +195,29 @@ vatpic_notify_intr(struct vatpic *vatpic)
atpic->mask, atpic->request, atpic->service);
/*
+ * From Section 3.6.2, "Interrupt Modes", in the
+ * MPtable Specification, Version 1.4
+ *
* PIC interrupts are routed to both the Local APIC
* and the I/O APIC to support operation in 1 of 3
* modes.
*
* 1. Legacy PIC Mode: the PIC effectively bypasses
- * all APIC components. In mode '1' the local APIC is
+ * all APIC components. In this mode the local APIC is
* disabled and LINT0 is reconfigured as INTR to
* deliver the PIC interrupt directly to the CPU.
*
* 2. Virtual Wire Mode: the APIC is treated as a
* virtual wire which delivers interrupts from the PIC
- * to the CPU. In mode '2' LINT0 is programmed as
+ * to the CPU. In this mode LINT0 is programmed as
* ExtINT to indicate that the PIC is the source of
* the interrupt.
*
- * 3. Symmetric I/O Mode: PIC interrupts are fielded
- * by the I/O APIC and delivered to the appropriate
- * CPU. In mode '3' the I/O APIC input 0 is
- * programmed as ExtINT to indicate that the PIC is
- * the source of the interrupt.
+ * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are
+ * fielded by the I/O APIC and delivered to the appropriate
+ * CPU. In this mode the I/O APIC input 0 is programmed
+ * as ExtINT to indicate that the PIC is the source of the
+ * interrupt.
*/
atpic->intr_raised = true;
lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0);
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index c2a9fd1..fa0200e 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -97,6 +97,7 @@ struct vcpu {
int hostcpu; /* (o) vcpu's host cpu */
struct vlapic *vlapic; /* (i) APIC device model */
enum x2apic_state x2apic_state; /* (i) APIC mode */
+ uint64_t exitintinfo; /* (i) events pending at VM exit */
int nmi_pending; /* (i) NMI pending */
int extint_pending; /* (i) INTR pending */
struct vm_exception exception; /* (x) exception collateral */
@@ -242,6 +243,7 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create)
vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
+ vcpu->exitintinfo = 0;
vcpu->nmi_pending = 0;
vcpu->extint_pending = 0;
vcpu->exception_pending = 0;
@@ -571,6 +573,21 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
return (0);
}
+static vm_paddr_t
+vm_maxmem(struct vm *vm)
+{
+ int i;
+ vm_paddr_t gpa, maxmem;
+
+ maxmem = 0;
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len;
+ if (gpa > maxmem)
+ maxmem = gpa;
+ }
+ return (maxmem);
+}
+
static void
vm_gpa_unwire(struct vm *vm)
{
@@ -708,7 +725,7 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
if (ppt_assigned_devices(vm) == 0) {
KASSERT(vm->iommu == NULL,
("vm_assign_pptdev: iommu must be NULL"));
- maxaddr = vmm_mem_maxaddr();
+ maxaddr = vm_maxmem(vm);
vm->iommu = iommu_create_domain(maxaddr);
error = vm_gpa_wire(vm);
@@ -1104,6 +1121,10 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
}
}
+ /* Don't go to sleep if the vcpu thread needs to yield */
+ if (vcpu_should_yield(vm, vcpuid))
+ break;
+
/*
* Some Linux guests implement "halt" by having all vcpus
* execute HLT with interrupts disabled. 'halted_cpus' keeps
@@ -1127,7 +1148,11 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
t = ticks;
vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
- msleep_spin(vcpu, &vcpu->mtx, wmesg, 0);
+ /*
+ * XXX msleep_spin() cannot be interrupted by signals so
+ * wake up periodically to check pending signals.
+ */
+ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
vcpu_require_state_locked(vcpu, VCPU_FROZEN);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
@@ -1191,15 +1216,18 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
struct vm_guest_paging *paging;
mem_region_read_t mread;
mem_region_write_t mwrite;
- int error;
+ enum vm_cpu_mode cpu_mode;
+ int cs_d, error;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
gla = vme->u.inst_emul.gla;
gpa = vme->u.inst_emul.gpa;
+ cs_d = vme->u.inst_emul.cs_d;
vie = &vme->u.inst_emul.vie;
paging = &vme->u.inst_emul.paging;
+ cpu_mode = paging->cpu_mode;
vie_init(vie);
@@ -1213,7 +1241,7 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
else if (error != 0)
panic("%s: vmm_fetch_instruction error %d", __func__, error);
- if (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode, vie) != 0)
+ if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
return (EFAULT);
/* return to userland unless this is an in-kernel emulated device */
@@ -1231,8 +1259,8 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
return (0);
}
- error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
- retu);
+ error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
+ mread, mwrite, retu);
return (error);
}
@@ -1456,6 +1484,202 @@ restart:
}
int
+vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
+{
+ struct vcpu *vcpu;
+ int type, vector;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (info & VM_INTINFO_VALID) {
+ type = info & VM_INTINFO_TYPE;
+ vector = info & 0xff;
+ if (type == VM_INTINFO_NMI && vector != IDT_NMI)
+ return (EINVAL);
+ if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
+ return (EINVAL);
+ if (info & VM_INTINFO_RSVD)
+ return (EINVAL);
+ } else {
+ info = 0;
+ }
+ VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
+ vcpu->exitintinfo = info;
+ return (0);
+}
+
+enum exc_class {
+ EXC_BENIGN,
+ EXC_CONTRIBUTORY,
+ EXC_PAGEFAULT
+};
+
+#define IDT_VE 20 /* Virtualization Exception (Intel specific) */
+
+static enum exc_class
+exception_class(uint64_t info)
+{
+ int type, vector;
+
+ KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
+ type = info & VM_INTINFO_TYPE;
+ vector = info & 0xff;
+
+ /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
+ switch (type) {
+ case VM_INTINFO_HWINTR:
+ case VM_INTINFO_SWINTR:
+ case VM_INTINFO_NMI:
+ return (EXC_BENIGN);
+ default:
+ /*
+ * Hardware exception.
+ *
+ * SVM and VT-x use identical type values to represent NMI,
+ * hardware interrupt and software interrupt.
+ *
+ * SVM uses type '3' for all exceptions. VT-x uses type '3'
+ * for exceptions except #BP and #OF. #BP and #OF use a type
+ * value of '5' or '6'. Therefore we don't check for explicit
+ * values of 'type' to classify 'intinfo' into a hardware
+ * exception.
+ */
+ break;
+ }
+
+ switch (vector) {
+ case IDT_PF:
+ case IDT_VE:
+ return (EXC_PAGEFAULT);
+ case IDT_DE:
+ case IDT_TS:
+ case IDT_NP:
+ case IDT_SS:
+ case IDT_GP:
+ return (EXC_CONTRIBUTORY);
+ default:
+ return (EXC_BENIGN);
+ }
+}
+
+static int
+nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
+ uint64_t *retinfo)
+{
+ enum exc_class exc1, exc2;
+ int type1, vector1;
+
+ KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
+ KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
+
+ /*
+ * If an exception occurs while attempting to call the double-fault
+ * handler the processor enters shutdown mode (aka triple fault).
+ */
+ type1 = info1 & VM_INTINFO_TYPE;
+ vector1 = info1 & 0xff;
+ if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
+ VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
+ info1, info2);
+ vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
+ *retinfo = 0;
+ return (0);
+ }
+
+ /*
+ * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
+ */
+ exc1 = exception_class(info1);
+ exc2 = exception_class(info2);
+ if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
+ (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
+ /* Convert nested fault into a double fault. */
+ *retinfo = IDT_DF;
+ *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
+ *retinfo |= VM_INTINFO_DEL_ERRCODE;
+ } else {
+ /* Handle exceptions serially */
+ *retinfo = info2;
+ }
+ return (1);
+}
+
+static uint64_t
+vcpu_exception_intinfo(struct vcpu *vcpu)
+{
+ uint64_t info = 0;
+
+ if (vcpu->exception_pending) {
+ info = vcpu->exception.vector & 0xff;
+ info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
+ if (vcpu->exception.error_code_valid) {
+ info |= VM_INTINFO_DEL_ERRCODE;
+ info |= (uint64_t)vcpu->exception.error_code << 32;
+ }
+ }
+ return (info);
+}
+
+int
+vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
+{
+ struct vcpu *vcpu;
+ uint64_t info1, info2;
+ int valid;
+
+ KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ info1 = vcpu->exitintinfo;
+ vcpu->exitintinfo = 0;
+
+ info2 = 0;
+ if (vcpu->exception_pending) {
+ info2 = vcpu_exception_intinfo(vcpu);
+ vcpu->exception_pending = 0;
+ VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
+ vcpu->exception.vector, info2);
+ }
+
+ if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
+ valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
+ } else if (info1 & VM_INTINFO_VALID) {
+ *retinfo = info1;
+ valid = 1;
+ } else if (info2 & VM_INTINFO_VALID) {
+ *retinfo = info2;
+ valid = 1;
+ } else {
+ valid = 0;
+ }
+
+ if (valid) {
+ VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
+ "retinfo(%#lx)", __func__, info1, info2, *retinfo);
+ }
+
+ return (valid);
+}
+
+int
+vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ *info1 = vcpu->exitintinfo;
+ *info2 = vcpu_exception_intinfo(vcpu);
+ return (0);
+}
+
+int
vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
struct vcpu *vcpu;
@@ -1466,6 +1690,14 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
if (exception->vector < 0 || exception->vector >= 32)
return (EINVAL);
+ /*
+ * A double fault exception should never be injected directly into
+ * the guest. It is a derived exception that results from specific
+ * combinations of nested faults.
+ */
+ if (exception->vector == IDT_DF)
+ return (EINVAL);
+
vcpu = &vm->vcpu[vcpuid];
if (vcpu->exception_pending) {
@@ -1481,32 +1713,21 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
return (0);
}
-int
-vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
-{
- struct vcpu *vcpu;
- int pending;
-
- KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
-
- vcpu = &vm->vcpu[vcpuid];
- pending = vcpu->exception_pending;
- if (pending) {
- vcpu->exception_pending = 0;
- *exception = vcpu->exception;
- VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
- exception->vector);
- }
- return (pending);
-}
-
-static void
-vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
+void
+vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
+ int errcode)
{
+ struct vm_exception exception;
struct vm_exit *vmexit;
+ struct vm *vm;
int error;
- error = vm_inject_exception(vm, vcpuid, exception);
+ vm = vmarg;
+
+ exception.vector = vector;
+ exception.error_code = errcode;
+ exception.error_code_valid = errcode_valid;
+ error = vm_inject_exception(vm, vcpuid, &exception);
KASSERT(error == 0, ("vm_inject_exception error %d", error));
/*
@@ -1521,45 +1742,19 @@ vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
}
void
-vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
+vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
{
- struct vm_exception pf = {
- .vector = IDT_PF,
- .error_code_valid = 1,
- .error_code = error_code
- };
+ struct vm *vm;
int error;
+ vm = vmarg;
VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
error_code, cr2);
error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
- vm_inject_fault(vm, vcpuid, &pf);
-}
-
-void
-vm_inject_gp(struct vm *vm, int vcpuid)
-{
- struct vm_exception gpf = {
- .vector = IDT_GP,
- .error_code_valid = 1,
- .error_code = 0
- };
-
- vm_inject_fault(vm, vcpuid, &gpf);
-}
-
-void
-vm_inject_ud(struct vm *vm, int vcpuid)
-{
- struct vm_exception udf = {
- .vector = IDT_UD,
- .error_code_valid = 0
- };
-
- vm_inject_fault(vm, vcpuid, &udf);
+ vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
}
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
@@ -1993,6 +2188,97 @@ vm_segment_name(int seg)
return (seg_names[seg]);
}
+void
+vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ int num_copyinfo)
+{
+ int idx;
+
+ for (idx = 0; idx < num_copyinfo; idx++) {
+ if (copyinfo[idx].cookie != NULL)
+ vm_gpa_release(copyinfo[idx].cookie);
+ }
+ bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
+}
+
+int
+vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
+ int num_copyinfo)
+{
+ int error, idx, nused;
+ size_t n, off, remaining;
+ void *hva, *cookie;
+ uint64_t gpa;
+
+ bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
+
+ nused = 0;
+ remaining = len;
+ while (remaining > 0) {
+ KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
+ error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
+ if (error)
+ return (error);
+ off = gpa & PAGE_MASK;
+ n = min(remaining, PAGE_SIZE - off);
+ copyinfo[nused].gpa = gpa;
+ copyinfo[nused].len = n;
+ remaining -= n;
+ gla += n;
+ nused++;
+ }
+
+ for (idx = 0; idx < nused; idx++) {
+ hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
+ prot, &cookie);
+ if (hva == NULL)
+ break;
+ copyinfo[idx].hva = hva;
+ copyinfo[idx].cookie = cookie;
+ }
+
+ if (idx != nused) {
+ vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
+ return (-1);
+ } else {
+ return (0);
+ }
+}
+
+void
+vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
+ size_t len)
+{
+ char *dst;
+ int idx;
+
+ dst = kaddr;
+ idx = 0;
+ while (len > 0) {
+ bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
+ len -= copyinfo[idx].len;
+ dst += copyinfo[idx].len;
+ idx++;
+ }
+}
+
+void
+vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
+ struct vm_copyinfo *copyinfo, size_t len)
+{
+ const char *src;
+ int idx;
+
+ src = kaddr;
+ idx = 0;
+ while (len > 0) {
+ bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
+ len -= copyinfo[idx].len;
+ src += copyinfo[idx].len;
+ idx++;
+ }
+}
/*
* Return the amount of in-use and wired memory for the VM. Since
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index f3e31a3..a85109e 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -173,6 +173,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_gla2gpa *gg;
struct vm_activate_cpu *vac;
struct vm_cpuset *vm_cpuset;
+ struct vm_intinfo *vmii;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
@@ -199,6 +200,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_SET_X2APIC_STATE:
case VM_GLA2GPA:
case VM_ACTIVATE_CPU:
+ case VM_SET_INTINFO:
+ case VM_GET_INTINFO:
/*
* XXX fragile, handle with care
* Assumes that the first field of the ioctl data is the vcpu.
@@ -470,6 +473,15 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = copyout(cpuset, vm_cpuset->cpus, size);
free(cpuset, M_TEMP);
break;
+ case VM_SET_INTINFO:
+ vmii = (struct vm_intinfo *)data;
+ error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
+ break;
+ case VM_GET_INTINFO:
+ vmii = (struct vm_intinfo *)data;
+ error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
+ &vmii->info2);
+ break;
default:
error = ENOTTY;
break;
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index 921deb5..a65b125 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#else /* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
+#include <sys/_iovec.h>
#include <machine/vmm.h>
@@ -65,18 +66,26 @@ enum {
VIE_OP_TYPE_AND,
VIE_OP_TYPE_OR,
VIE_OP_TYPE_TWO_BYTE,
+ VIE_OP_TYPE_PUSH,
+ VIE_OP_TYPE_CMP,
VIE_OP_TYPE_LAST
};
/* struct vie_op.op_flags */
-#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */
-#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */
+#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */
+#define VIE_OP_F_NO_MODRM (1 << 3)
static const struct vie_op two_byte_opcodes[256] = {
[0xB6] = {
.op_byte = 0xB6,
.op_type = VIE_OP_TYPE_MOVZX,
},
+ [0xB7] = {
+ .op_byte = 0xB7,
+ .op_type = VIE_OP_TYPE_MOVZX,
+ },
[0xBE] = {
.op_byte = 0xBE,
.op_type = VIE_OP_TYPE_MOVSX,
@@ -88,6 +97,10 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_byte = 0x0F,
.op_type = VIE_OP_TYPE_TWO_BYTE
},
+ [0x3B] = {
+ .op_byte = 0x3B,
+ .op_type = VIE_OP_TYPE_CMP,
+ },
[0x88] = {
.op_byte = 0x88,
.op_type = VIE_OP_TYPE_MOV,
@@ -104,6 +117,22 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_byte = 0x8B,
.op_type = VIE_OP_TYPE_MOV,
},
+ [0xA1] = {
+ .op_byte = 0xA1,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+ },
+ [0xA3] = {
+ .op_byte = 0xA3,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+ },
+ [0xC6] = {
+ /* XXX Group 11 extended opcode - not just MOV */
+ .op_byte = 0xC6,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM8,
+ },
[0xC7] = {
.op_byte = 0xC7,
.op_type = VIE_OP_TYPE_MOV,
@@ -125,6 +154,11 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_type = VIE_OP_TYPE_OR,
.op_flags = VIE_OP_F_IMM8,
},
+ [0xFF] = {
+ /* XXX Group 5 extended opcode - not just PUSH */
+ .op_byte = 0xFF,
+ .op_type = VIE_OP_TYPE_PUSH,
+ }
};
/* struct vie.mod */
@@ -175,18 +209,15 @@ vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
return (error);
}
-static int
-vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
+static void
+vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
- uint64_t val;
- int error, rshift;
- enum vm_reg_name reg;
-
- rshift = 0;
- reg = gpr_map[vie->reg];
+ *lhbr = 0;
+ *reg = gpr_map[vie->reg];
/*
- * 64-bit mode imposes limitations on accessing legacy byte registers.
+ * 64-bit mode imposes limitations on accessing legacy high byte
+ * registers (lhbr).
*
* The legacy high-byte registers cannot be addressed if the REX
* prefix is present. In this case the values 4, 5, 6 and 7 of the
@@ -198,17 +229,56 @@ vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
*/
if (!vie->rex_present) {
if (vie->reg & 0x4) {
- /*
- * Obtain the value of %ah by reading %rax and shifting
- * right by 8 bits (same for %bh, %ch and %dh).
- */
- rshift = 8;
- reg = gpr_map[vie->reg & 0x3];
+ *lhbr = 1;
+ *reg = gpr_map[vie->reg & 0x3];
}
}
+}
+
+static int
+vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
+{
+ uint64_t val;
+ int error, lhbr;
+ enum vm_reg_name reg;
+ vie_calc_bytereg(vie, &reg, &lhbr);
error = vm_get_register(vm, vcpuid, reg, &val);
- *rval = val >> rshift;
+
+ /*
+ * To obtain the value of a legacy high byte register shift the
+ * base register right by 8 bits (%ah = %rax >> 8).
+ */
+ if (lhbr)
+ *rval = val >> 8;
+ else
+ *rval = val;
+ return (error);
+}
+
+static int
+vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
+{
+ uint64_t origval, val, mask;
+ int error, lhbr;
+ enum vm_reg_name reg;
+
+ vie_calc_bytereg(vie, &reg, &lhbr);
+ error = vm_get_register(vm, vcpuid, reg, &origval);
+ if (error == 0) {
+ val = byte;
+ mask = 0xff;
+ if (lhbr) {
+ /*
+ * Shift left by 8 to store 'byte' in a legacy high
+ * byte register.
+ */
+ val <<= 8;
+ mask <<= 8;
+ }
+ val |= origval & ~mask;
+ error = vm_set_register(vm, vcpuid, reg, val);
+ }
return (error);
}
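vie_write_bytereg() merges only the addressed byte into the base register; for a legacy high byte register the value and the mask are both shifted up by 8 first. A standalone illustration of that merge (not part of the commit), assuming %rax initially holds 0x1122334455667788 and 0x9a is written to %ah:

	/* Illustrative only: the merge done by vie_write_bytereg() for %ah. */
	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t origval = 0x1122334455667788UL;
		uint64_t val = 0x9a, mask = 0xff;

		val <<= 8;		/* %ah occupies bits 15:8 of %rax */
		mask <<= 8;
		val |= origval & ~mask;
		assert(val == 0x1122334455669a88UL);
		return (0);
	}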
@@ -242,16 +312,52 @@ vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
}
/*
- * The following simplifying assumptions are made during emulation:
- *
- * - guest is in 64-bit mode
- * - default address size is 64-bits
- * - default operand size is 32-bits
- *
- * - operand size override is not supported
- *
- * - address size override is not supported
+ * Return the status flags that would result from doing (x - y).
*/
+static u_long
+getcc16(uint16_t x, uint16_t y)
+{
+ u_long rflags;
+
+ __asm __volatile("sub %1,%2; pushfq; popq %0" :
+ "=r" (rflags) : "m" (y), "r" (x));
+ return (rflags);
+}
+
+static u_long
+getcc32(uint32_t x, uint32_t y)
+{
+ u_long rflags;
+
+ __asm __volatile("sub %1,%2; pushfq; popq %0" :
+ "=r" (rflags) : "m" (y), "r" (x));
+ return (rflags);
+}
+
+static u_long
+getcc64(uint64_t x, uint64_t y)
+{
+ u_long rflags;
+
+ __asm __volatile("sub %1,%2; pushfq; popq %0" :
+ "=r" (rflags) : "m" (y), "r" (x));
+ return (rflags);
+}
+
+static u_long
+getcc(int opsize, uint64_t x, uint64_t y)
+{
+ KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
+ ("getcc: invalid operand size %d", opsize));
+
+ if (opsize == 2)
+ return (getcc16(x, y));
+ else if (opsize == 4)
+ return (getcc32(x, y));
+ else
+ return (getcc64(x, y));
+}
+
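getcc() lets the host CPU compute the guest's status flags: the subtraction is done natively at the same operand width and RFLAGS is captured with pushfq/popq. A small user-space sanity check of the expected results (illustrative only, amd64 host assumed), with getcc32() reproduced from above and u_long spelled out as unsigned long for a stand-alone build; the PSL_* constants come from <machine/psl.h>:

	#include <machine/psl.h>
	#include <assert.h>
	#include <stdint.h>

	static unsigned long
	getcc32(uint32_t x, uint32_t y)
	{
		unsigned long rflags;

		__asm __volatile("sub %1,%2; pushfq; popq %0" :
		    "=r" (rflags) : "m" (y), "r" (x));
		return (rflags);
	}

	int
	main(void)
	{
		assert(getcc32(5, 5) & PSL_Z);		/* equal: ZF set */
		assert(getcc32(5, 7) & PSL_C);		/* 5 - 7 borrows: CF set */
		assert((getcc32(7, 5) & (PSL_Z | PSL_C)) == 0);
		return (0);
	}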
static int
emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
@@ -261,7 +367,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
uint8_t byte;
uint64_t val;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -271,7 +377,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* 88/r: mov r/m8, r8
* REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
*/
- size = 1;
+ size = 1; /* override for byte operation */
error = vie_read_bytereg(vm, vcpuid, vie, &byte);
if (error == 0)
error = memwrite(vm, vcpuid, gpa, byte, size, arg);
@@ -279,11 +385,10 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
case 0x89:
/*
* MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 89/r: mov r/m16, r16
* 89/r: mov r/m32, r32
* REX.W + 89/r mov r/m64, r64
*/
- if (vie->rex_w)
- size = 8;
reg = gpr_map[vie->reg];
error = vie_read_register(vm, vcpuid, reg, &val);
if (error == 0) {
@@ -292,38 +397,72 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
}
break;
case 0x8A:
+ /*
+ * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8A/r: mov r8, r/m8
+ * REX + 8A/r: mov r8, r/m8
+ */
+ size = 1; /* override for byte operation */
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0)
+ error = vie_write_bytereg(vm, vcpuid, vie, val);
+ break;
case 0x8B:
/*
* MOV from mem (ModRM:r/m) to reg (ModRM:reg)
- * 8A/r: mov r/m8, r8
- * REX + 8A/r: mov r/m8, r8
+ * 8B/r: mov r16, r/m16
* 8B/r: mov r32, r/m32
* REX.W 8B/r: mov r64, r/m64
*/
- if (vie->op.op_byte == 0x8A)
- size = 1;
- else if (vie->rex_w)
- size = 8;
error = memread(vm, vcpuid, gpa, &val, size, arg);
if (error == 0) {
reg = gpr_map[vie->reg];
error = vie_update_register(vm, vcpuid, reg, val, size);
}
break;
+ case 0xA1:
+ /*
+ * MOV from seg:moffset to AX/EAX/RAX
+ * A1: mov AX, moffs16
+ * A1: mov EAX, moffs32
+ * REX.W + A1: mov RAX, moffs64
+ */
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = VM_REG_GUEST_RAX;
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xA3:
+ /*
+ * MOV from AX/EAX/RAX to seg:moffset
+ * A3: mov moffs16, AX
+ * A3: mov moffs32, EAX
+ * REX.W + A3: mov moffs64, RAX
+ */
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0xC6:
+ /*
+ * MOV from imm8 to mem (ModRM:r/m)
+ * C6/0 mov r/m8, imm8
+ * REX + C6/0 mov r/m8, imm8
+ */
+ size = 1; /* override for byte operation */
+ error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
+ break;
case 0xC7:
/*
- * MOV from imm32 to mem (ModRM:r/m)
+ * MOV from imm16/imm32 to mem (ModRM:r/m)
+ * C7/0 mov r/m16, imm16
* C7/0 mov r/m32, imm32
* REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
*/
- val = vie->immediate; /* already sign-extended */
-
- if (vie->rex_w)
- size = 8;
-
- if (size != 8)
- val &= size2mask[size];
-
+ val = vie->immediate & size2mask[size];
error = memwrite(vm, vcpuid, gpa, val, size, arg);
break;
default:
@@ -333,17 +472,6 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (error);
}
-/*
- * The following simplifying assumptions are made during emulation:
- *
- * - guest is in 64-bit mode
- * - default address size is 64-bits
- * - default operand size is 32-bits
- *
- * - operand size override is not supported
- *
- * - address size override is not supported
- */
static int
emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite,
@@ -353,7 +481,7 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
enum vm_reg_name reg;
uint64_t val;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -362,8 +490,9 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* MOV and zero extend byte from mem (ModRM:r/m) to
* reg (ModRM:reg).
*
- * 0F B6/r movzx r/m8, r32
- * REX.W + 0F B6/r movzx r/m8, r64
+ * 0F B6/r movzx r16, r/m8
+ * 0F B6/r movzx r32, r/m8
+ * REX.W + 0F B6/r movzx r64, r/m8
*/
/* get the first operand */
@@ -374,19 +503,39 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
/* get the second operand */
reg = gpr_map[vie->reg];
- if (vie->rex_w)
- size = 8;
+ /* zero-extend byte */
+ val = (uint8_t)val;
/* write the result */
error = vie_update_register(vm, vcpuid, reg, val, size);
break;
+ case 0xB7:
+ /*
+ * MOV and zero extend word from mem (ModRM:r/m) to
+ * reg (ModRM:reg).
+ *
+ * 0F B7/r movzx r32, r/m16
+ * REX.W + 0F B7/r movzx r64, r/m16
+ */
+ error = memread(vm, vcpuid, gpa, &val, 2, arg);
+ if (error)
+ return (error);
+
+ reg = gpr_map[vie->reg];
+
+ /* zero-extend word */
+ val = (uint16_t)val;
+
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ break;
case 0xBE:
/*
* MOV and sign extend byte from mem (ModRM:r/m) to
* reg (ModRM:reg).
*
- * 0F BE/r movsx r/m8, r32
- * REX.W + 0F BE/r movsx r/m8, r64
+ * 0F BE/r movsx r16, r/m8
+ * 0F BE/r movsx r32, r/m8
+ * REX.W + 0F BE/r movsx r64, r/m8
*/
/* get the first operand */
@@ -397,9 +546,6 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
/* get the second operand */
reg = gpr_map[vie->reg];
- if (vie->rex_w)
- size = 8;
-
/* sign extend byte */
val = (int8_t)val;
@@ -420,7 +566,7 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
enum vm_reg_name reg;
uint64_t val1, val2;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -429,11 +575,10 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
* result in reg.
*
+ * 23/r and r16, r/m16
* 23/r and r32, r/m32
* REX.W + 23/r and r64, r/m64
*/
- if (vie->rex_w)
- size = 8;
/* get the first operand */
reg = gpr_map[vie->reg];
@@ -455,8 +600,9 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* AND mem (ModRM:r/m) with immediate and store the
* result in mem.
*
- * 81/ and r/m32, imm32
- * REX.W + 81/ and r/m64, imm32 sign-extended to 64
+ * 81 /4 and r/m16, imm16
+ * 81 /4 and r/m32, imm32
+ * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64
*
* Currently, only the AND operation of the 0x81 opcode
* is implemented (ModRM:reg = b100).
@@ -464,9 +610,6 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
if ((vie->reg & 7) != 4)
break;
- if (vie->rex_w)
- size = 8;
-
/* get the first operand */
error = memread(vm, vcpuid, gpa, &val1, size, arg);
if (error)
@@ -492,7 +635,7 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
int error, size;
uint64_t val1;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -501,8 +644,9 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* OR mem (ModRM:r/m) with immediate and store the
* result in mem.
*
- * 83/ OR r/m32, imm8 sign-extended to 32
- * REX.W + 83/ OR r/m64, imm8 sign-extended to 64
+ * 83 /1 OR r/m16, imm8 sign-extended to 16
+ * 83 /1 OR r/m32, imm8 sign-extended to 32
+ * REX.W + 83/1 OR r/m64, imm8 sign-extended to 64
*
* Currently, only the OR operation of the 0x83 opcode
* is implemented (ModRM:reg = b001).
@@ -510,9 +654,6 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
if ((vie->reg & 7) != 1)
break;
- if (vie->rex_w)
- size = 8;
-
/* get the first operand */
error = memread(vm, vcpuid, gpa, &val1, size, arg);
if (error)
@@ -531,10 +672,167 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (error);
}
+#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
+
+static int
+emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ uint64_t op1, op2, rflags, rflags2;
+ enum vm_reg_name reg;
+
+ size = vie->opsize;
+ switch (vie->op.op_byte) {
+ case 0x3B:
+ /*
+ * 3B/r CMP r16, r/m16
+ * 3B/r CMP r32, r/m32
+ * REX.W + 3B/r CMP r64, r/m64
+ *
+ * Compare first operand (reg) with second operand (r/m) and
+ * set status flags in EFLAGS register. The comparison is
+ * performed by subtracting the second operand from the first
+ * operand and then setting the status flags.
+ */
+
+ /* Get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &op1);
+ if (error)
+ return (error);
+
+ /* Get the second operand */
+ error = memread(vm, vcpuid, gpa, &op2, size, arg);
+ if (error)
+ return (error);
+
+ break;
+ default:
+ return (EINVAL);
+ }
+ rflags2 = getcc(size, op1, op2);
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ if (error)
+ return (error);
+ rflags &= ~RFLAGS_STATUS_BITS;
+ rflags |= rflags2 & RFLAGS_STATUS_BITS;
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
+ return (error);
+}
+
+static int
+emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *arg)
+{
+#ifdef _KERNEL
+ struct vm_copyinfo copyinfo[2];
+#else
+ struct iovec copyinfo[2];
+#endif
+ struct seg_desc ss_desc;
+ uint64_t cr0, rflags, rsp, stack_gla, val;
+ int error, size, stackaddrsize;
+
+ /*
+ * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
+ *
+ * PUSH is part of the group 5 extended opcodes and is identified
+ * by ModRM:reg = b110.
+ */
+ if ((vie->reg & 7) != 6)
+ return (EINVAL);
+
+ size = vie->opsize;
+ /*
+	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1

+ */
+ if (paging->cpu_mode == CPU_MODE_REAL) {
+ stackaddrsize = 2;
+ } else if (paging->cpu_mode == CPU_MODE_64BIT) {
+ /*
+ * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
+ * - Stack pointer size is always 64-bits.
+ * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
+ * - 16-bit PUSH/POP is supported by using the operand size
+ * override prefix (66H).
+ */
+ stackaddrsize = 8;
+ size = vie->opsize_override ? 2 : 8;
+ } else {
+ /*
+	 * In protected or compatibility mode the 'B' flag in the
+ * stack-segment descriptor determines the size of the
+ * stack pointer.
+ */
+ error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
+ KASSERT(error == 0, ("%s: error %d getting SS descriptor",
+ __func__, error));
+ if (SEG_DESC_DEF32(ss_desc.access))
+ stackaddrsize = 4;
+ else
+ stackaddrsize = 2;
+ }
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
+ KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
+ KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
+
+ rsp -= size;
+ if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
+ rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) {
+ vm_inject_ss(vm, vcpuid, 0);
+ return (0);
+ }
+
+ if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
+ vm_inject_ss(vm, vcpuid, 0);
+ return (0);
+ }
+
+ if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
+ vm_inject_ac(vm, vcpuid, 0);
+ return (0);
+ }
+
+ error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE,
+ copyinfo, nitems(copyinfo));
+ if (error == -1) {
+ /*
+ * XXX cannot return a negative error value here because it
+ * ends up being the return value of the VM_RUN() ioctl and
+ * is interpreted as a pseudo-error (for e.g. ERESTART).
+ */
+ return (EFAULT);
+ } else if (error == 1) {
+ /* Resume guest execution to handle page fault */
+ return (0);
+ }
+
+ error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
+ if (error == 0) {
+ vm_copyout(vm, vcpuid, &val, copyinfo, size);
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
+ stackaddrsize);
+ KASSERT(error == 0, ("error %d updating rsp", error));
+ }
+#ifdef _KERNEL
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+#endif
+ return (error);
+}
+
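Two details in emulate_push() are easy to miss: copyinfo[] has two entries because the pushed value may straddle a page boundary, and the number of bytes pushed is decoupled from the width of the stack pointer update. The outcomes of the size selection, summarized from the code above (illustrative; for real and protected mode the 0x66 prefix is already reflected in vie->opsize):

	guest mode		0x66 prefix	bytes pushed	stack pointer width
	real			-		vie->opsize	2
	protected, SS.B=1	-		vie->opsize	4
	protected, SS.B=0	-		vie->opsize	2
	64-bit			no		8		8
	64-bit			yes		2		8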
int
vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
- mem_region_read_t memread, mem_region_write_t memwrite,
- void *memarg)
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *memarg)
{
int error;
@@ -542,6 +840,14 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (EINVAL);
switch (vie->op.op_type) {
+ case VIE_OP_TYPE_PUSH:
+ error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
+ memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_CMP:
+ error = emulate_cmp(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
case VIE_OP_TYPE_MOV:
error = emulate_mov(vm, vcpuid, gpa, vie,
memread, memwrite, memarg);
@@ -636,7 +942,7 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
* then the descriptor is unusable and attempting to use
* it results in a #GP(0).
*/
- if (SEG_DESC_UNUSABLE(desc))
+ if (SEG_DESC_UNUSABLE(desc->access))
return (-1);
/*
@@ -645,13 +951,13 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
* descriptor that is not present. If this was the case then
* it would have been checked before the VM-exit.
*/
- KASSERT(SEG_DESC_PRESENT(desc), ("segment %d not present: %#x",
- seg, desc->access));
+ KASSERT(SEG_DESC_PRESENT(desc->access),
+ ("segment %d not present: %#x", seg, desc->access));
/*
* The descriptor type must indicate a code/data segment.
*/
- type = SEG_DESC_TYPE(desc);
+ type = SEG_DESC_TYPE(desc->access);
KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
"descriptor type %#x", seg, type));
@@ -680,7 +986,8 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
if ((type & 0xC) == 0x4) {
/* expand-down data segment */
low_limit = desc->limit + 1;
- high_limit = SEG_DESC_DEF32(desc) ? 0xffffffff : 0xffff;
+ high_limit = SEG_DESC_DEF32(desc->access) ?
+ 0xffffffff : 0xffff;
} else {
/* code segment or expand-up data segment */
low_limit = 0;
@@ -947,45 +1254,24 @@ fault:
}
int
-vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *paging,
+vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint64_t rip, int inst_length, struct vie *vie)
{
- int n, error, prot;
- uint64_t gpa, off;
- void *hpa, *cookie;
-
- /*
- * XXX cache previously fetched instructions using 'rip' as the tag
- */
+ struct vm_copyinfo copyinfo[2];
+ int error, prot;
- prot = VM_PROT_READ | VM_PROT_EXECUTE;
if (inst_length > VIE_INST_SIZE)
panic("vmm_fetch_instruction: invalid length %d", inst_length);
- /* Copy the instruction into 'vie' */
- while (vie->num_valid < inst_length) {
- error = vmm_gla2gpa(vm, cpuid, paging, rip, prot, &gpa);
- if (error)
- return (error);
-
- off = gpa & PAGE_MASK;
- n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
-
- if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL)
- break;
-
- bcopy(hpa, &vie->inst[vie->num_valid], n);
-
- vm_gpa_release(cookie);
-
- rip += n;
- vie->num_valid += n;
+ prot = PROT_READ | PROT_EXEC;
+ error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
+ copyinfo, nitems(copyinfo));
+ if (error == 0) {
+ vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+ vie->num_valid = inst_length;
}
-
- if (vie->num_valid == inst_length)
- return (0);
- else
- return (-1);
+ return (error);
}
static int
@@ -1007,24 +1293,65 @@ vie_advance(struct vie *vie)
}
static int
-decode_rex(struct vie *vie)
+decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
uint8_t x;
- if (vie_peek(vie, &x))
- return (-1);
+ while (1) {
+ if (vie_peek(vie, &x))
+ return (-1);
- if (x >= 0x40 && x <= 0x4F) {
- vie->rex_present = 1;
+ if (x == 0x66)
+ vie->opsize_override = 1;
+ else if (x == 0x67)
+ vie->addrsize_override = 1;
+ else
+ break;
+
+ vie_advance(vie);
+ }
+ /*
+ * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
+ * - Only one REX prefix is allowed per instruction.
+ * - The REX prefix must immediately precede the opcode byte or the
+ * escape opcode byte.
+ * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
+ * the mandatory prefix must come before the REX prefix.
+ */
+ if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
+ vie->rex_present = 1;
vie->rex_w = x & 0x8 ? 1 : 0;
vie->rex_r = x & 0x4 ? 1 : 0;
vie->rex_x = x & 0x2 ? 1 : 0;
vie->rex_b = x & 0x1 ? 1 : 0;
-
vie_advance(vie);
}
+ /*
+ * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
+ */
+ if (cpu_mode == CPU_MODE_64BIT) {
+ /*
+ * Default address size is 64-bits and default operand size
+ * is 32-bits.
+ */
+ vie->addrsize = vie->addrsize_override ? 4 : 8;
+ if (vie->rex_w)
+ vie->opsize = 8;
+ else if (vie->opsize_override)
+ vie->opsize = 2;
+ else
+ vie->opsize = 4;
+ } else if (cs_d) {
+ /* Default address and operand sizes are 32-bits */
+ vie->addrsize = vie->addrsize_override ? 2 : 4;
+ vie->opsize = vie->opsize_override ? 2 : 4;
+ } else {
+ /* Default address and operand sizes are 16-bits */
+ vie->addrsize = vie->addrsize_override ? 4 : 2;
+ vie->opsize = vie->opsize_override ? 4 : 2;
+ }
return (0);
}
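The net effect of decode_prefixes() on the default operand and address sizes, summarized from the code above (in 64-bit mode REX.W takes precedence over the 0x66 override):

	mode			default addr/op		0x67 addr	0x66 op		REX.W op
	64-bit			8 / 4			4		2		8
	32-bit (cs_d set)	4 / 4			2		2		n/a
	16-bit			2 / 2			4		4		n/a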
@@ -1071,6 +1398,12 @@ decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
uint8_t x;
+ if (cpu_mode == CPU_MODE_REAL)
+ return (-1);
+
+ if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
+ return (0);
+
if (vie_peek(vie, &x))
return (-1);
@@ -1249,20 +1582,32 @@ decode_immediate(struct vie *vie)
union {
char buf[4];
int8_t signed8;
+ int16_t signed16;
int32_t signed32;
} u;
/* Figure out immediate operand size (if any) */
- if (vie->op.op_flags & VIE_OP_F_IMM)
- vie->imm_bytes = 4;
- else if (vie->op.op_flags & VIE_OP_F_IMM8)
+ if (vie->op.op_flags & VIE_OP_F_IMM) {
+ /*
+ * Section 2.2.1.5 "Immediates", Intel SDM:
+ * In 64-bit mode the typical size of immediate operands
+	 * remains 32-bits. When the operand size is 64-bits, the
+ * processor sign-extends all immediates to 64-bits prior
+ * to their use.
+ */
+ if (vie->opsize == 4 || vie->opsize == 8)
+ vie->imm_bytes = 4;
+ else
+ vie->imm_bytes = 2;
+ } else if (vie->op.op_flags & VIE_OP_F_IMM8) {
vie->imm_bytes = 1;
+ }
if ((n = vie->imm_bytes) == 0)
return (0);
- if (n != 1 && n != 4)
- panic("decode_immediate: invalid imm_bytes %d", n);
+ KASSERT(n == 1 || n == 2 || n == 4,
+ ("%s: invalid number of immediate bytes: %d", __func__, n));
for (i = 0; i < n; i++) {
if (vie_peek(vie, &x))
@@ -1271,12 +1616,47 @@ decode_immediate(struct vie *vie)
u.buf[i] = x;
vie_advance(vie);
}
-
+
+ /* sign-extend the immediate value before use */
if (n == 1)
- vie->immediate = u.signed8; /* sign-extended */
+ vie->immediate = u.signed8;
+ else if (n == 2)
+ vie->immediate = u.signed16;
else
- vie->immediate = u.signed32; /* sign-extended */
+ vie->immediate = u.signed32;
+
+ return (0);
+}
+
+static int
+decode_moffset(struct vie *vie)
+{
+ int i, n;
+ uint8_t x;
+ union {
+ char buf[8];
+ uint64_t u64;
+ } u;
+
+ if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
+ return (0);
+
+ /*
+ * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
+ * The memory offset size follows the address-size of the instruction.
+ */
+ n = vie->addrsize;
+ KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
+
+ u.u64 = 0;
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+ vie->displacement = u.u64;
return (0);
}
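A direct memory-offset MOV carries the full address immediately after the opcode, so decode_moffset() consumes addrsize bytes and no ModRM byte is present (VIE_OP_F_NO_MODRM). Two illustrative encodings, following the mnemonic style of the comments above:

	A1 44 33 22 11				mov EAX, moffs32	(32-bit code: 4 moffset bytes, 0x11223344)
	48 A3 88 77 66 55 44 33 22 11		mov moffs64, RAX	(64-bit code: 8 moffset bytes, 0x1122334455667788)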
@@ -1301,7 +1681,7 @@ static int
verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
{
int error;
- uint64_t base, idx;
+ uint64_t base, idx, gla2;
/* Skip 'gla' verification */
if (gla == VIE_INVALID_GLA)
@@ -1334,11 +1714,14 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
}
}
- if (base + vie->scale * idx + vie->displacement != gla) {
+ /* XXX assuming that the base address of the segment is 0 */
+ gla2 = base + vie->scale * idx + vie->displacement;
+ gla2 &= size2mask[vie->addrsize];
+ if (gla != gla2) {
printf("verify_gla mismatch: "
"base(0x%0lx), scale(%d), index(0x%0lx), "
- "disp(0x%0lx), gla(0x%0lx)\n",
- base, vie->scale, idx, vie->displacement, gla);
+ "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
+ base, vie->scale, idx, vie->displacement, gla, gla2);
return (-1);
}
@@ -1347,13 +1730,11 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
int
vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
- enum vm_cpu_mode cpu_mode, struct vie *vie)
+ enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
{
- if (cpu_mode == CPU_MODE_64BIT) {
- if (decode_rex(vie))
- return (-1);
- }
+ if (decode_prefixes(vie, cpu_mode, cs_d))
+ return (-1);
if (decode_opcode(vie))
return (-1);
@@ -1366,10 +1747,13 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
if (decode_displacement(vie))
return (-1);
-
+
if (decode_immediate(vie))
return (-1);
+ if (decode_moffset(vie))
+ return (-1);
+
if (verify_inst_length(vie))
return (-1);