MFC r273375

Add support for AMD processors with the SVM/AMD-V hardware extensions. MFC r273749 Remove bhyve SVM feature printfs now that they are available in the general CPU feature detection code. MFC r273766 Add missing 'break' pointed out by Coverity CID 1249760. MFC r276098 Allow ktr(4) tracing of all guest exceptions via the tunable "hw.vmm.trace_guest_exceptions". MFC r276392 Inject #UD into the guest when it executes either 'MONITOR' or 'MWAIT' on an AMD/SVM host. MFC r276402 Remove "svn:mergeinfo" property that was dragged along when these files were svn copied in r273375.
author: neel <neel@FreeBSD.org> 2014-12-30 08:24:14 +0000
committer: neel <neel@FreeBSD.org> 2014-12-30 08:24:14 +0000
commit: 9a7db864f78c4821164e142b15574dd789e438fc (patch)
tree: 5ca9ce0dfb46b18acadddbaeeda0e8409ebb1eac /sys/amd64
parent: 6d931c08fa2abce728837379fb2549a6513f49b0 (diff)
download: FreeBSD-src-9a7db864f78c4821164e142b15574dd789e438fc.zip
FreeBSD-src-9a7db864f78c4821164e142b15574dd789e438fc.tar.gz
21 files changed, 3782 insertions, 175 deletions
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 0879ba2..6f769b9 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -357,6 +357,8 @@ void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
     void *kaddr, size_t len);
 void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
     struct vm_copyinfo *copyinfo, size_t len);
+
+int vcpu_trace_exceptions(struct vm *vm, int vcpuid);
 #endif	/* KERNEL */
 
 #define	VM_MAXCPU	16			/* maximum virtual cpus */
@@ -487,6 +489,7 @@ enum vm_exitcode {
 	VM_EXITCODE_TASK_SWITCH,
 	VM_EXITCODE_MONITOR,
 	VM_EXITCODE_MWAIT,
+	VM_EXITCODE_SVM,
 	VM_EXITCODE_MAX
 };
 
@@ -564,6 +567,14 @@ struct vm_exit {
 			int		inst_type;
 			int		inst_error;
 		} vmx;
+		/*
+		 * SVM specific payload.
+		 */
+		struct {
+			uint64_t	exitcode;
+			uint64_t	exitinfo1;
+			uint64_t	exitinfo2;
+		} svm;
 		struct {
 			uint32_t	code;		/* ecx value */
 			uint64_t	wval;
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
index bbd3d88..516cc01 100644
--- a/sys/amd64/include/vmm_instruction_emul.h
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -93,7 +93,7 @@ int vmm_fetch_instruction(struct vm *vm, int cpuid,
 int vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, int prot, uint64_t *gpa);
 
-void vie_init(struct vie *vie);
+void vie_init(struct vie *vie, const char *inst_bytes, int inst_length);
 
 /*
  * Decode the instruction fetched into 'vie' so it can be emulated.
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
index 4c88d12..ee121b0 100644
--- a/sys/amd64/vmm/amd/amdv.c
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -38,149 +38,6 @@ __FBSDID("$FreeBSD$");
 #include "io/iommu.h"
 
 static int
-amdv_init(int ipinum)
-{
-
-	printf("amdv_init: not implemented\n");
-	return (ENXIO);
-}
-
-static int
-amdv_cleanup(void)
-{
-
-	printf("amdv_cleanup: not implemented\n");
-	return (ENXIO);
-}
-
-static void
-amdv_resume(void)
-{
-}
-
-static void *
-amdv_vminit(struct vm *vm, struct pmap *pmap)
-{
-
-	printf("amdv_vminit: not implemented\n");
-	return (NULL);
-}
-
-static int
-amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap,
-    void *rptr, void *sptr)
-{
-
-	printf("amdv_vmrun: not implemented\n");
-	return (ENXIO);
-}
-
-static void
-amdv_vmcleanup(void *arg)
-{
-
-	printf("amdv_vmcleanup: not implemented\n");
-	return;
-}
-
-static int
-amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
-{
-	
-	printf("amdv_getreg: not implemented\n");
-	return (EINVAL);
-}
-
-static int
-amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val)
-{
-	
-	printf("amdv_setreg: not implemented\n");
-	return (EINVAL);
-}
-
-static int
-amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
-{
-
-	printf("amdv_get_desc: not implemented\n");
-	return (EINVAL);
-}
-
-static int
-amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
-{
-
-	printf("amdv_get_desc: not implemented\n");
-	return (EINVAL);
-}
-
-static int
-amdv_getcap(void *arg, int vcpu, int type, int *retval)
-{
-
-	printf("amdv_getcap: not implemented\n");
-	return (EINVAL);
-}
-
-static int
-amdv_setcap(void *arg, int vcpu, int type, int val)
-{
-
-	printf("amdv_setcap: not implemented\n");
-	return (EINVAL);
-}
-
-static struct vmspace *
-amdv_vmspace_alloc(vm_offset_t min, vm_offset_t max)
-{
-
-	printf("amdv_vmspace_alloc: not implemented\n");
-	return (NULL);
-}
-
-static void
-amdv_vmspace_free(struct vmspace *vmspace)
-{
-
-	printf("amdv_vmspace_free: not implemented\n");
-	return;
-}
-
-static struct vlapic *
-amdv_vlapic_init(void *arg, int vcpuid)
-{
-
-	panic("amdv_vlapic_init: not implmented");
-}
-
-static void
-amdv_vlapic_cleanup(void *arg, struct vlapic *vlapic)
-{
-
-	panic("amdv_vlapic_cleanup: not implemented");
-}
-
-struct vmm_ops vmm_ops_amd = {
-	amdv_init,
-	amdv_cleanup,
-	amdv_resume,
-	amdv_vminit,
-	amdv_vmrun,
-	amdv_vmcleanup,
-	amdv_getreg,
-	amdv_setreg,
-	amdv_getdesc,
-	amdv_setdesc,
-	amdv_getcap,
-	amdv_setcap,
-	amdv_vmspace_alloc,
-	amdv_vmspace_free,
-	amdv_vlapic_init,
-	amdv_vlapic_cleanup,
-};
-
-static int
 amd_iommu_init(void)
 {
 
diff --git a/sys/amd64/vmm/amd/npt.c b/sys/amd64/vmm/amd/npt.c
new file mode 100644
index 0000000..bebb4d5
--- /dev/null
+++ b/sys/amd64/vmm/amd/npt.c
@@ -0,0 +1,87 @@
+/*-
+ * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+
+#include <machine/pmap.h>
+
+#include "npt.h"
+
+SYSCTL_DECL(_hw_vmm);
+SYSCTL_NODE(_hw_vmm, OID_AUTO, npt, CTLFLAG_RW, NULL, NULL);
+
+static int npt_flags;
+SYSCTL_INT(_hw_vmm_npt, OID_AUTO, pmap_flags, CTLFLAG_RD,
+	&npt_flags, 0, NULL);
+
+#define NPT_IPIMASK	0xFF
+
+/*
+ * AMD nested page table init.
+ */
+int
+svm_npt_init(int ipinum)
+{
+	int enable_superpage = 1;
+
+	npt_flags = ipinum & NPT_IPIMASK;
+	TUNABLE_INT_FETCH("hw.vmm.npt.enable_superpage", &enable_superpage);
+	if (enable_superpage)
+		npt_flags |= PMAP_PDE_SUPERPAGE; 
+	
+	return (0);
+}
+
+static int
+npt_pinit(pmap_t pmap)
+{
+
+	return (pmap_pinit_type(pmap, PT_RVI, npt_flags));
+}
+
+struct vmspace *
+svm_npt_alloc(vm_offset_t min, vm_offset_t max)
+{
+	
+	return (vmspace_alloc(min, max, npt_pinit));
+}
+
+void
+svm_npt_free(struct vmspace *vmspace)
+{
+
+	vmspace_free(vmspace);
+}
diff --git a/sys/amd64/vmm/amd/npt.h b/sys/amd64/vmm/amd/npt.h
new file mode 100644
index 0000000..5966474
--- /dev/null
+++ b/sys/amd64/vmm/amd/npt.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SVM_NPT_H_
+#define _SVM_NPT_H_
+
+int 	svm_npt_init(int ipinum);
+struct	vmspace *svm_npt_alloc(vm_offset_t min, vm_offset_t max);
+void	svm_npt_free(struct vmspace *vmspace);
+
+#endif /* _SVM_NPT_H_ */
diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c
new file mode 100644
index 0000000..ab47041
--- /dev/null
+++ b/sys/amd64/vmm/amd/svm.c
@@ -0,0 +1,2172 @@
+/*-
+ * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/cpufunc.h>
+#include <machine/psl.h>
+#include <machine/pmap.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+#include <machine/smp.h>
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+#include <machine/vmm_instruction_emul.h>
+
+#include "vmm_lapic.h"
+#include "vmm_stat.h"
+#include "vmm_ktr.h"
+#include "vmm_ioport.h"
+#include "vatpic.h"
+#include "vlapic.h"
+#include "vlapic_priv.h"
+
+#include "x86.h"
+#include "vmcb.h"
+#include "svm.h"
+#include "svm_softc.h"
+#include "svm_msr.h"
+#include "npt.h"
+
+SYSCTL_DECL(_hw_vmm);
+SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL);
+
+/*
+ * SVM CPUID function 0x8000_000A, edx bit decoding.
+ */
+#define AMD_CPUID_SVM_NP		BIT(0)  /* Nested paging or RVI */
+#define AMD_CPUID_SVM_LBR		BIT(1)  /* Last branch virtualization */
+#define AMD_CPUID_SVM_SVML		BIT(2)  /* SVM lock */
+#define AMD_CPUID_SVM_NRIP_SAVE		BIT(3)  /* Next RIP is saved */
+#define AMD_CPUID_SVM_TSC_RATE		BIT(4)  /* TSC rate control. */
+#define AMD_CPUID_SVM_VMCB_CLEAN	BIT(5)  /* VMCB state caching */
+#define AMD_CPUID_SVM_FLUSH_BY_ASID	BIT(6)  /* Flush by ASID */
+#define AMD_CPUID_SVM_DECODE_ASSIST	BIT(7)  /* Decode assist */
+#define AMD_CPUID_SVM_PAUSE_INC		BIT(10) /* Pause intercept filter. */
+#define AMD_CPUID_SVM_PAUSE_FTH		BIT(12) /* Pause filter threshold */
+
+#define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID 	|	\
+				VMCB_CACHE_IOPM		|	\
+				VMCB_CACHE_I		|	\
+				VMCB_CACHE_TPR		|	\
+				VMCB_CACHE_CR2		|	\
+				VMCB_CACHE_CR		|	\
+				VMCB_CACHE_DT		|	\
+				VMCB_CACHE_SEG		|	\
+				VMCB_CACHE_NP)
+
+static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
+SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
+    0, NULL);
+
+static MALLOC_DEFINE(M_SVM, "svm", "svm");
+static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");
+
+/* Per-CPU context area. */
+extern struct pcpu __pcpu[];
+
+static uint32_t svm_feature;	/* AMD SVM features. */
+SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RD, &svm_feature, 0,
+    "SVM features advertised by CPUID.8000000AH:EDX");
+
+static int disable_npf_assist;
+SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN,
+    &disable_npf_assist, 0, NULL);
+
+/* Maximum ASIDs supported by the processor */
+static uint32_t nasid;
+SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RD, &nasid, 0,
+    "Number of ASIDs supported by this processor");
+
+/* Current ASID generation for each host cpu */
+static struct asid asid[MAXCPU];
+
+/* 
+ * SVM host state-save area of size 4KB for each core.
+ */
+static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
+
+static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
+static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
+static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
+
+static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
+
+static __inline int
+flush_by_asid(void)
+{
+
+	return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID);
+}
+
+static __inline int
+decode_assist(void)
+{
+
+	return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST);
+}
+
+static void
+svm_disable(void *arg __unused)
+{
+	uint64_t efer;
+
+	efer = rdmsr(MSR_EFER);
+	efer &= ~EFER_SVM;
+	wrmsr(MSR_EFER, efer);
+}
+
+/*
+ * Disable SVM on all CPUs.
+ */
+static int
+svm_cleanup(void)
+{
+
+	smp_rendezvous(NULL, svm_disable, NULL, NULL);
+	return (0);
+}
+
+/*
+ * Verify that all the features required by bhyve are available.
+ */
+static int
+check_svm_features(void)
+{
+	u_int regs[4];
+
+	/* CPUID Fn8000_000A is for SVM */
+	do_cpuid(0x8000000A, regs);
+	svm_feature = regs[3];
+
+	nasid = regs[1];
+	KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid));
+
+	/* bhyve requires the Nested Paging feature */
+	if (!(svm_feature & AMD_CPUID_SVM_NP)) {
+		printf("SVM: Nested Paging feature not available.\n");
+		return (ENXIO);
+	}
+
+	/* bhyve requires the NRIP Save feature */
+	if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) {
+		printf("SVM: NRIP Save feature not available.\n");
+		return (ENXIO);
+	}
+
+	return (0);
+}
+
+static void
+svm_enable(void *arg __unused)
+{
+	uint64_t efer;
+
+	efer = rdmsr(MSR_EFER);
+	efer |= EFER_SVM;
+	wrmsr(MSR_EFER, efer);
+
+	wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu]));
+}
+
+/*
+ * Return 1 if SVM is enabled on this processor and 0 otherwise.
+ */
+static int
+svm_available(void)
+{
+	uint64_t msr;
+
+	/* Section 15.4 Enabling SVM from APM2. */
+	if ((amd_feature2 & AMDID2_SVM) == 0) {
+		printf("SVM: not available.\n");
+		return (0);
+	}
+
+	msr = rdmsr(MSR_VM_CR);
+	if ((msr & VM_CR_SVMDIS) != 0) {
+		printf("SVM: disabled by BIOS.\n");
+		return (0);
+	}
+
+	return (1);
+}
+
+static int
+svm_init(int ipinum)
+{
+	int error, cpu;
+
+	if (!svm_available())
+		return (ENXIO);
+
+	error = check_svm_features();
+	if (error)
+		return (error);
+
+	vmcb_clean &= VMCB_CACHE_DEFAULT;
+
+	for (cpu = 0; cpu < MAXCPU; cpu++) {
+		/*
+		 * Initialize the host ASIDs to their "highest" valid values.
+		 *
+		 * The next ASID allocation will roll over both 'gen' and 'num'
+		 * and start off the sequence at {1,1}.
+		 */
+		asid[cpu].gen = ~0UL;
+		asid[cpu].num = nasid - 1;
+	}
+
+	svm_msr_init();
+	svm_npt_init(ipinum);
+
+	/* Enable SVM on all CPUs */
+	smp_rendezvous(NULL, svm_enable, NULL, NULL);
+
+	return (0);
+}
+
+static void
+svm_restore(void)
+{
+
+	svm_enable(NULL);
+}		
+
+/* Pentium compatible MSRs */
+#define MSR_PENTIUM_START 	0	
+#define MSR_PENTIUM_END 	0x1FFF
+/* AMD 6th generation and Intel compatible MSRs */
+#define MSR_AMD6TH_START 	0xC0000000UL	
+#define MSR_AMD6TH_END 		0xC0001FFFUL	
+/* AMD 7th and 8th generation compatible MSRs */
+#define MSR_AMD7TH_START 	0xC0010000UL	
+#define MSR_AMD7TH_END 		0xC0011FFFUL	
+
+/*
+ * Get the index and bit position for a MSR in permission bitmap.
+ * Two bits are used for each MSR: lower bit for read and higher bit for write.
+ */
+static int
+svm_msr_index(uint64_t msr, int *index, int *bit)
+{
+	uint32_t base, off;
+
+	*index = -1;
+	*bit = (msr % 4) * 2;
+	base = 0;
+
+	if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) {
+		*index = msr / 4;
+		return (0);
+	}
+
+	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); 
+	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
+		off = (msr - MSR_AMD6TH_START); 
+		*index = (off + base) / 4;
+		return (0);
+	} 
+
+	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
+	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
+		off = (msr - MSR_AMD7TH_START);
+		*index = (off + base) / 4;
+		return (0);
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
+ */
+static void
+svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
+{
+	int index, bit, error;
+
+	error = svm_msr_index(msr, &index, &bit);
+	KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr));
+	KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
+	    ("%s: invalid index %d for msr %#lx", __func__, index, msr));
+	KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
+	    "msr %#lx", __func__, bit, msr));
+
+	if (read)
+		perm_bitmap[index] &= ~(1UL << bit);
+
+	if (write)
+		perm_bitmap[index] &= ~(2UL << bit);
+}
+
+static void
+svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
+{
+
+	svm_msr_perm(perm_bitmap, msr, true, true);
+}
+
+static void
+svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
+{
+
+	svm_msr_perm(perm_bitmap, msr, true, false);
+}
+
+static __inline int
+svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
+{
+	struct vmcb_ctrl *ctrl;
+
+	KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
+}
+
+static __inline void
+svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
+    int enabled)
+{
+	struct vmcb_ctrl *ctrl;
+	uint32_t oldval;
+
+	KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	oldval = ctrl->intercept[idx];
+
+	if (enabled)
+		ctrl->intercept[idx] |= bitmask;
+	else
+		ctrl->intercept[idx] &= ~bitmask;
+
+	if (ctrl->intercept[idx] != oldval) {
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
+		VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
+		    "from %#x to %#x", idx, oldval, ctrl->intercept[idx]);
+	}
+}
+
+static __inline void
+svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
+{
+
+	svm_set_intercept(sc, vcpu, off, bitmask, 0);
+}
+
+static __inline void
+svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
+{
+
+	svm_set_intercept(sc, vcpu, off, bitmask, 1);
+}
+
+static void
+vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
+    uint64_t msrpm_base_pa, uint64_t np_pml4)
+{
+	struct vmcb_ctrl *ctrl;
+	struct vmcb_state *state;
+	uint32_t mask;
+	int n;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	state = svm_get_vmcb_state(sc, vcpu);
+
+	ctrl->iopm_base_pa = iopm_base_pa;
+	ctrl->msrpm_base_pa = msrpm_base_pa;
+
+	/* Enable nested paging */
+	ctrl->np_enable = 1;
+	ctrl->n_cr3 = np_pml4;
+
+	/*
+	 * Intercept accesses to the control registers that are not shadowed
+	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
+	 */
+	for (n = 0; n < 16; n++) {
+		mask = (BIT(n) << 16) | BIT(n);
+		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
+			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
+		else
+			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
+	}
+
+
+	/*
+	 * Intercept everything when tracing guest exceptions otherwise
+	 * just intercept machine check exception.
+	 */
+	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
+		for (n = 0; n < 32; n++) {
+			/*
+			 * Skip unimplemented vectors in the exception bitmap.
+			 */
+			if (n == 2 || n == 9) {
+				continue;
+			}
+			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
+		}
+	} else {
+		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
+	}
+
+	/* Intercept various events (e.g. I/O, MSR and CPUID accesses) */
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+	    VMCB_INTCPT_FERR_FREEZE);
+
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
+
+	/*
+	 * From section "Canonicalization and Consistency Checks" in APMv2
+	 * the VMRUN intercept bit must be set to pass the consistency check.
+	 */
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
+
+	/*
+	 * The ASID will be set to a non-zero value just before VMRUN.
+	 */
+	ctrl->asid = 0;
+
+	/*
+	 * Section 15.21.1, Interrupt Masking in EFLAGS
+	 * Section 15.21.2, Virtualizing APIC.TPR
+	 *
+	 * This must be set for %rflags and %cr8 isolation of guest and host.
+	 */
+	ctrl->v_intr_masking = 1;
+
+	/* Enable Last Branch Record aka LBR for debugging */
+	ctrl->lbr_virt_en = 1;
+	state->dbgctl = BIT(0);
+
+	/* EFER_SVM must always be set when the guest is executing */
+	state->efer = EFER_SVM;
+
+	/* Set up the PAT to power-on state */
+	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
+	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
+	    PAT_VALUE(2, PAT_UNCACHED)		|
+	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
+	    PAT_VALUE(4, PAT_WRITE_BACK)	|
+	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
+	    PAT_VALUE(6, PAT_UNCACHED)		|
+	    PAT_VALUE(7, PAT_UNCACHEABLE);
+}
+
+/*
+ * Initialize a virtual machine.
+ */
+static void *
+svm_vminit(struct vm *vm, pmap_t pmap)
+{
+	struct svm_softc *svm_sc;
+	struct svm_vcpu *vcpu;
+	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;	
+	int i;
+
+	svm_sc = malloc(sizeof (struct svm_softc), M_SVM, M_WAITOK | M_ZERO);
+	svm_sc->vm = vm;
+	svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);
+
+	/*
+	 * Intercept read and write accesses to all MSRs.
+	 */
+	memset(svm_sc->msr_bitmap, 0xFF, sizeof(svm_sc->msr_bitmap));
+
+	/*
+	 * Access to the following MSRs is redirected to the VMCB when the
+	 * guest is executing. Therefore it is safe to allow the guest to
+	 * read/write these MSRs directly without hypervisor involvement.
+	 */
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
+	
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
+
+	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
+
+	/*
+	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
+	 */
+	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
+
+	/* Intercept access to all I/O ports. */
+	memset(svm_sc->iopm_bitmap, 0xFF, sizeof(svm_sc->iopm_bitmap));
+
+	iopm_pa = vtophys(svm_sc->iopm_bitmap);
+	msrpm_pa = vtophys(svm_sc->msr_bitmap);
+	pml4_pa = svm_sc->nptp;
+	for (i = 0; i < VM_MAXCPU; i++) {
+		vcpu = svm_get_vcpu(svm_sc, i);
+		vcpu->lastcpu = NOCPU;
+		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
+		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
+		svm_msr_guest_init(svm_sc, i);
+	}
+	return (svm_sc);
+}
+
+static int
+svm_cpl(struct vmcb_state *state)
+{
+
+	/*
+	 * From APMv2:
+	 *   "Retrieve the CPL from the CPL field in the VMCB, not
+	 *    from any segment DPL"
+	 */
+	return (state->cpl);
+}
+
+static enum vm_cpu_mode
+svm_vcpu_mode(struct vmcb *vmcb)
+{
+	struct vmcb_segment seg;
+	struct vmcb_state *state;
+	int error;
+
+	state = &vmcb->state;
+
+	if (state->efer & EFER_LMA) {
+		error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
+		KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__,
+		    error));
+
+		/*
+		 * Section 4.8.1 of APM2, check if Code Segment has
+		 * Long attribute set in descriptor.
+		 */
+		if (seg.attrib & VMCB_CS_ATTRIB_L)
+			return (CPU_MODE_64BIT);
+		else
+			return (CPU_MODE_COMPATIBILITY);
+	} else  if (state->cr0 & CR0_PE) {
+		return (CPU_MODE_PROTECTED);
+	} else {
+		return (CPU_MODE_REAL);
+	}
+}
+
+static enum vm_paging_mode
+svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
+{
+
+	if ((cr0 & CR0_PG) == 0)
+		return (PAGING_MODE_FLAT);
+	if ((cr4 & CR4_PAE) == 0)
+		return (PAGING_MODE_32);
+	if (efer & EFER_LME)
+		return (PAGING_MODE_64);
+	else
+		return (PAGING_MODE_PAE);
+}
+
+/*
+ * ins/outs utility routines
+ */
+static uint64_t
+svm_inout_str_index(struct svm_regctx *regs, int in)
+{
+	uint64_t val;
+
+	val = in ? regs->sctx_rdi : regs->sctx_rsi;
+
+	return (val);
+}
+
+static uint64_t
+svm_inout_str_count(struct svm_regctx *regs, int rep)
+{
+	uint64_t val;
+
+	val = rep ? regs->sctx_rcx : 1;
+
+	return (val);
+}
+
+static void
+svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1,
+    int in, struct vm_inout_str *vis)
+{
+	int error, s;
+
+	if (in) {
+		vis->seg_name = VM_REG_GUEST_ES;
+	} else {
+		/* The segment field has standard encoding */
+		s = (info1 >> 10) & 0x7;
+		vis->seg_name = vm_segment_name(s);
+	}
+
+	error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc);
+	KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error));
+}
+
+static int
+svm_inout_str_addrsize(uint64_t info1)
+{
+        uint32_t size;
+
+        size = (info1 >> 7) & 0x7;
+        switch (size) {
+        case 1:
+                return (2);     /* 16 bit */
+        case 2:
+                return (4);     /* 32 bit */
+        case 4:
+                return (8);     /* 64 bit */
+        default:
+                panic("%s: invalid size encoding %d", __func__, size);
+        }
+}
+
+static void
+svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
+{
+	struct vmcb_state *state;
+
+	state = &vmcb->state;
+	paging->cr3 = state->cr3;
+	paging->cpl = svm_cpl(state);
+	paging->cpu_mode = svm_vcpu_mode(vmcb);
+	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
+	    state->efer);
+}
+
+#define	UNHANDLED 0
+
+/*
+ * Handle guest I/O intercept.
+ */
+static int
+svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
+{
+	struct vmcb_ctrl *ctrl;
+	struct vmcb_state *state;
+	struct svm_regctx *regs;
+	struct vm_inout_str *vis;
+	uint64_t info1;
+	int inout_string;
+
+	state = svm_get_vmcb_state(svm_sc, vcpu);
+	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
+	regs  = svm_get_guest_regctx(svm_sc, vcpu);
+
+	info1 = ctrl->exitinfo1;
+	inout_string = info1 & BIT(2) ? 1 : 0;
+
+	/*
+	 * The effective segment number in EXITINFO1[12:10] is populated
+	 * only if the processor has the DecodeAssist capability.
+	 *
+	 * XXX this is not specified explicitly in APMv2 but can be verified
+	 * empirically.
+	 */
+	if (inout_string && !decode_assist())
+		return (UNHANDLED);
+
+	vmexit->exitcode 	= VM_EXITCODE_INOUT;
+	vmexit->u.inout.in 	= (info1 & BIT(0)) ? 1 : 0;
+	vmexit->u.inout.string 	= inout_string;
+	vmexit->u.inout.rep 	= (info1 & BIT(3)) ? 1 : 0;
+	vmexit->u.inout.bytes 	= (info1 >> 4) & 0x7;
+	vmexit->u.inout.port 	= (uint16_t)(info1 >> 16);
+	vmexit->u.inout.eax 	= (uint32_t)(state->rax);
+
+	if (inout_string) {
+		vmexit->exitcode = VM_EXITCODE_INOUT_STR;
+		vis = &vmexit->u.inout_str;
+		svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging);
+		vis->rflags = state->rflags;
+		vis->cr0 = state->cr0;
+		vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
+		vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
+		vis->addrsize = svm_inout_str_addrsize(info1);
+		svm_inout_str_seginfo(svm_sc, vcpu, info1,
+		    vmexit->u.inout.in, vis);
+	}
+
+	return (UNHANDLED);
+}
+
+static int
+npf_fault_type(uint64_t exitinfo1)
+{
+
+	if (exitinfo1 & VMCB_NPF_INFO1_W)
+		return (VM_PROT_WRITE);
+	else if (exitinfo1 & VMCB_NPF_INFO1_ID)
+		return (VM_PROT_EXECUTE);
+	else
+		return (VM_PROT_READ);
+}
+
+static bool
+svm_npf_emul_fault(uint64_t exitinfo1)
+{
+	
+	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
+		return (false);
+	}
+
+	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
+		return (false);
+	}
+
+	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
+		return (false);
+	}
+
+	return (true);	
+}
+
+static void
+svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
+{
+	struct vm_guest_paging *paging;
+	struct vmcb_segment seg;
+	struct vmcb_ctrl *ctrl;
+	char *inst_bytes;
+	int error, inst_len;
+
+	ctrl = &vmcb->ctrl;
+	paging = &vmexit->u.inst_emul.paging;
+
+	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
+	vmexit->u.inst_emul.gpa = gpa;
+	vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
+	svm_paging_info(vmcb, paging);
+
+	error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
+	KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));
+
+	switch(paging->cpu_mode) {
+	case CPU_MODE_PROTECTED:
+	case CPU_MODE_COMPATIBILITY:
+		/*
+		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
+		 */
+		vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ?
+		    1 : 0;
+		break;
+	default:
+		vmexit->u.inst_emul.cs_d = 0;
+		break;	
+	}
+
+	/*
+	 * Copy the instruction bytes into 'vie' if available.
+	 */
+	if (decode_assist() && !disable_npf_assist) {
+		inst_len = ctrl->inst_len;
+		inst_bytes = ctrl->inst_bytes;
+	} else {
+		inst_len = 0;
+		inst_bytes = NULL;
+	}
+	vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len);
+}
+
+#ifdef KTR
+static const char *
+intrtype_to_str(int intr_type)
+{
+	switch (intr_type) {
+	case VMCB_EVENTINJ_TYPE_INTR:
+		return ("hwintr");
+	case VMCB_EVENTINJ_TYPE_NMI:
+		return ("nmi");
+	case VMCB_EVENTINJ_TYPE_INTn:
+		return ("swintr");
+	case VMCB_EVENTINJ_TYPE_EXCEPTION:
+		return ("exception");
+	default:
+		panic("%s: unknown intr_type %d", __func__, intr_type);
+	}
+}
+#endif
+
+/*
+ * Queue an event for injection into 'vcpu' on the next VMRUN by programming
+ * the EVENTINJ field of the VMCB (APMv2 section 15.20, "Event injection").
+ *
+ * 'intr_type' is one of the VMCB_EVENTINJ_TYPE_* values and 'vector' must be
+ * in the range 0-255. For exceptions the vector must additionally be in the
+ * range 0-31 and must not be 2. 'error' is only used if 'ec_valid' is true.
+ *
+ * Panics on an invalid type/vector combination.
+ */
+static void
+svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector,
+		 uint32_t error, bool ec_valid)
+{
+	struct vmcb_ctrl *ctrl;
+	uint64_t event;
+	bool valid;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+
+	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0,
+	    ("%s: event already pending %#lx", __func__, ctrl->eventinj));
+
+	KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d",
+	    __func__, vector));
+
+	if (intr_type == VMCB_EVENTINJ_TYPE_INTR ||
+	    intr_type == VMCB_EVENTINJ_TYPE_NMI ||
+	    intr_type == VMCB_EVENTINJ_TYPE_INTn)
+		valid = true;
+	else if (intr_type == VMCB_EVENTINJ_TYPE_EXCEPTION)
+		valid = (vector >= 0 && vector <= 31 && vector != 2);
+	else
+		valid = false;
+
+	if (!valid) {
+		panic("%s: invalid intr_type/vector: %d/%d", __func__,
+		    intr_type, vector);
+	}
+
+	/* Assemble the complete event word before publishing it. */
+	event = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID;
+	if (ec_valid) {
+		event |= VMCB_EVENTINJ_EC_VALID;
+		event |= (uint64_t)error << 32;
+	}
+	ctrl->eventinj = event;
+
+	if (ec_valid) {
+		VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x",
+		    intrtype_to_str(intr_type), vector, error);
+	} else {
+		VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d",
+		    intrtype_to_str(intr_type), vector);
+	}
+}
+
+/*
+ * Synchronize the emulated local APIC with the virtual interrupt state
+ * found in the VMCB after a #VMEXIT: copy V_TPR into the vlapic and, if a
+ * vector was offered through V_IRQ, tell the vlapic when it was accepted.
+ */
+static void
+svm_update_virqinfo(struct svm_softc *sc, int vcpu)
+{
+	struct vmcb_ctrl *ctrl;
+	struct vlapic *vlapic;
+	struct vm *vm;
+	int still_pending;
+
+	vm = sc->vm;
+	vlapic = vm_lapic(vm, vcpu);
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+
+	/* Update %cr8 in the emulated vlapic */
+	vlapic_set_cr8(vlapic, ctrl->v_tpr);
+
+	/* No vector was being injected through V_IRQ. */
+	if (ctrl->v_intr_vector == 0)
+		return;
+
+	/*
+	 * If V_IRQ indicates that the interrupt injection attempted on the
+	 * last VMRUN was successful then update the vlapic accordingly.
+	 */
+	still_pending = ctrl->v_irq;
+	KASSERT(ctrl->v_intr_vector >= 16,
+	    ("%s: invalid v_intr_vector %d", __func__, ctrl->v_intr_vector));
+	KASSERT(!ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
+	VCPU_CTR2(vm, vcpu, "v_intr_vector %d %s", ctrl->v_intr_vector,
+	    still_pending ? "pending" : "accepted");
+	if (still_pending == 0)
+		vlapic_intr_accepted(vlapic, ctrl->v_intr_vector);
+}
+
+/*
+ * Record the event, if any, that was being delivered to the guest when the
+ * #VMEXIT happened (APMv2, "Intercepts during IDT interrupt delivery").
+ *
+ * A valid EXITINTINFO value is counted and handed to vm_exit_intinfo().
+ */
+static void
+svm_save_intinfo(struct svm_softc *svm_sc, int vcpu)
+{
+	uint64_t intinfo;
+
+	intinfo = svm_get_vmcb_ctrl(svm_sc, vcpu)->exitintinfo;
+	if (VMCB_EXITINTINFO_VALID(intinfo)) {
+		VCPU_CTR2(svm_sc->vm, vcpu,
+		    "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
+		    intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
+		vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
+		vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
+	}
+}
+
+/*
+ * Return the current setting of the VINTR intercept for 'vcpu', as reported
+ * by svm_get_intercept().
+ */
+static __inline int
+vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
+{
+	int enabled;
+
+	enabled = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+	    VMCB_INTCPT_VINTR);
+	return (enabled);
+}
+
+/*
+ * Arm interrupt window exiting for 'vcpu': post a dummy virtual interrupt
+ * (V_IRQ set, vector 0, TPR ignored) and enable the VINTR intercept.
+ *
+ * If the dummy interrupt is already posted only the invariants are checked.
+ */
+static __inline void
+enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
+{
+	struct vmcb_ctrl *ctrl;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+
+	if (!ctrl->v_irq || ctrl->v_intr_vector != 0) {
+		VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
+		ctrl->v_irq = 1;
+		ctrl->v_ign_tpr = 1;
+		ctrl->v_intr_vector = 0;
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
+		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+		    VMCB_INTCPT_VINTR);
+		return;
+	}
+
+	/* Already armed: just verify the state is self-consistent. */
+	KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
+	KASSERT(vintr_intercept_enabled(sc, vcpu),
+	    ("%s: vintr intercept should be enabled", __func__));
+}
+
+/*
+ * Withdraw whatever is currently posted through V_IRQ for 'vcpu' (either
+ * the dummy interrupt used for interrupt window exiting or a real vector)
+ * and disable the VINTR intercept.
+ *
+ * If nothing is posted only the invariant is checked.
+ */
+static __inline void
+disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
+{
+	struct vmcb_ctrl *ctrl;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+
+	if (ctrl->v_irq || ctrl->v_intr_vector != 0) {
+#ifdef KTR
+		if (ctrl->v_intr_vector != 0)
+			VCPU_CTR0(sc->vm, vcpu,
+			    "Clearing V_IRQ interrupt injection");
+		else
+			VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
+#endif
+		ctrl->v_irq = 0;
+		ctrl->v_intr_vector = 0;
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
+		svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+		    VMCB_INTCPT_VINTR);
+	} else {
+		KASSERT(!vintr_intercept_enabled(sc, vcpu),
+		    ("%s: vintr intercept should be disabled", __func__));
+	}
+}
+
+/*
+ * Set the interrupt shadow of 'vcpu' to 1 if 'val' is non-zero and to 0
+ * otherwise. The VMCB is only written when the value changes.
+ *
+ * Always returns 0.
+ */
+static int
+svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val)
+{
+	struct vmcb_ctrl *ctrl;
+	int cur, want;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	cur = ctrl->intr_shadow;
+	want = (val != 0);
+	if (want == cur)
+		return (0);
+
+	ctrl->intr_shadow = want;
+	VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", want);
+	return (0);
+}
+
+/*
+ * Store the current interrupt shadow state of 'vcpu' in '*val'.
+ *
+ * Always returns 0.
+ */
+static int
+svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val)
+{
+
+	*val = svm_get_vmcb_ctrl(sc, vcpu)->intr_shadow;
+	return (0);
+}
+
+/*
+ * Once an NMI is injected it blocks delivery of further NMIs until the
+ * handler executes an IRET. The IRET intercept is enabled when an NMI is
+ * injected in order to track when the vcpu is done handling the NMI.
+ *
+ * NMI blocking is therefore in effect exactly when the IRET intercept is
+ * enabled; the state of that intercept is what this function returns.
+ */
+static int
+nmi_blocked(struct svm_softc *sc, int vcpu)
+{
+
+	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+	    VMCB_INTCPT_IRET));
+}
+
+/*
+ * Begin virtual NMI blocking for 'vcpu' by enabling the IRET intercept.
+ * Blocking must not already be in effect.
+ */
+static void
+enable_nmi_blocking(struct svm_softc *sc, int vcpu)
+{
+
+	KASSERT(nmi_blocked(sc, vcpu) == 0, ("vNMI already blocked"));
+
+	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled");
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
+}
+
+/*
+ * End virtual NMI blocking for 'vcpu' by disabling the IRET intercept and
+ * then placing the vcpu in an interrupt shadow. Blocking must currently be
+ * in effect.
+ */
+static void
+clear_nmi_blocking(struct svm_softc *sc, int vcpu)
+{
+	int rc;
+
+	KASSERT(nmi_blocked(sc, vcpu) != 0, ("vNMI already unblocked"));
+	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");
+
+	/*
+	 * With the IRET intercept cleared the vcpu will attempt to execute
+	 * the "iret" the next time it runs. It is still possible to inject
+	 * another NMI into the vcpu before that "iret" has really executed.
+	 *
+	 * For example, the "iret" may take a #NPF while accessing the stack
+	 * and trap back into the hypervisor. An NMI that is pending for the
+	 * vcpu at that point will be injected into the guest.
+	 *
+	 * XXX this needs to be fixed
+	 */
+	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
+
+	/*
+	 * The interrupt shadow keeps an NMI from being injected on the VMRUN
+	 * that immediately follows.
+	 */
+	rc = svm_modify_intr_shadow(sc, vcpu, 1);
+	KASSERT(rc == 0, ("%s: error %d setting intr_shadow", __func__, rc));
+}
+
+/*
+ * Emulate a guest write of 'val' to MSR 'num'.
+ *
+ * Local APIC MSRs go to lapic_wrmsr(), MSR_EFER is written through
+ * svm_setreg() and everything else is passed to svm_wrmsr(). The return
+ * value is whatever the selected handler returned.
+ */
+static int
+emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val,
+    bool *retu)
+{
+
+	if (lapic_msr(num))
+		return (lapic_wrmsr(sc->vm, vcpu, num, val, retu));
+
+	if (num == MSR_EFER)
+		return (svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, val));
+
+	return (svm_wrmsr(sc, vcpu, num, val, retu));
+}
+
+/*
+ * Emulate a guest read of MSR 'num'.
+ *
+ * Local APIC MSRs are read with lapic_rdmsr(), all others with svm_rdmsr().
+ * When the handler returns 0 the low 32 bits of the value are stored in the
+ * guest %rax and the high 32 bits in the guest %rdx. The handler's return
+ * value is passed back to the caller.
+ */
+static int
+emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu)
+{
+	struct vmcb_state *state;
+	struct svm_regctx *ctx;
+	uint64_t msrval;
+	int rc;
+
+	rc = lapic_msr(num) ?
+	    lapic_rdmsr(sc->vm, vcpu, num, &msrval, retu) :
+	    svm_rdmsr(sc, vcpu, num, &msrval, retu);
+	if (rc != 0)
+		return (rc);
+
+	state = svm_get_vmcb_state(sc, vcpu);
+	ctx = svm_get_guest_regctx(sc, vcpu);
+	state->rax = msrval & 0xffffffff;
+	ctx->sctx_rdx = msrval >> 32;
+	return (0);
+}
+
+#ifdef KTR
+/*
+ * Translate a VMCB exit code into a short name for trace messages.
+ *
+ * Exit codes without a name are formatted in hex into a static buffer that
+ * is shared by all callers and overwritten on every such call.
+ */
+static const char *
+exit_reason_to_str(uint64_t reason)
+{
+	static const struct {
+		uint64_t	code;
+		const char	*name;
+	} known[] = {
+		{ VMCB_EXIT_INVALID,	"invalvmcb" },
+		{ VMCB_EXIT_SHUTDOWN,	"shutdown" },
+		{ VMCB_EXIT_NPF,	"nptfault" },
+		{ VMCB_EXIT_PAUSE,	"pause" },
+		{ VMCB_EXIT_HLT,	"hlt" },
+		{ VMCB_EXIT_CPUID,	"cpuid" },
+		{ VMCB_EXIT_IO,		"inout" },
+		{ VMCB_EXIT_MC,		"mchk" },
+		{ VMCB_EXIT_INTR,	"extintr" },
+		{ VMCB_EXIT_NMI,	"nmi" },
+		{ VMCB_EXIT_VINTR,	"vintr" },
+		{ VMCB_EXIT_MSR,	"msr" },
+		{ VMCB_EXIT_IRET,	"iret" },
+		{ VMCB_EXIT_MONITOR,	"monitor" },
+		{ VMCB_EXIT_MWAIT,	"mwait" },
+	};
+	static char reasonbuf[32];
+	unsigned int i;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (known[i].code == reason)
+			return (known[i].name);
+	}
+
+	snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason);
+	return (reasonbuf);
+}
+#endif	/* KTR */
+
+/*
+ * From section "State Saved on Exit" in APMv2: nRIP is saved for all
+ * #VMEXITs that are due to instruction intercepts as well as MSR and IOIO
+ * intercepts and exceptions caused by INT3, INTO and BOUND instructions.
+ *
+ * Return 1 if the nRIP is valid for 'exitcode' and 0 otherwise.
+ */
+static int
+nrip_valid(uint64_t exitcode)
+{
+
+	/* 0x00-0x3F: reads and writes of CR0-CR15 and DR0-DR15 */
+	if (exitcode <= 0x3F)
+		return (1);
+
+	/* 0x43-0x45: INT3, INTO and BOUND */
+	if (exitcode >= 0x43 && exitcode <= 0x45)
+		return (1);
+
+	/* 0x65-0x7C: VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
+	if (exitcode >= 0x65 && exitcode <= 0x7C)
+		return (1);
+
+	/* 0x80-0x8D: VMEXIT_VMRUN ... VMEXIT_XSETBV */
+	if (exitcode >= 0x80 && exitcode <= 0x8D)
+		return (1);
+
+	return (0);
+}
+
+/*
+ * Describe a generic SVM VM-exit in 'vme': the exit code is set to
+ * VM_EXITCODE_SVM and the raw VMCB exit code and exitinfo values are
+ * recorded as collateral.
+ */
+static void
+vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
+{
+
+	vme->u.svm.exitcode = code;
+	vme->u.svm.exitinfo1 = info1;
+	vme->u.svm.exitinfo2 = info2;
+	vme->exitcode = VM_EXITCODE_SVM;
+}
+
+/*
+ * Process the #VMEXIT recorded in the VMCB of 'vcpu'.
+ *
+ * The exit code and the two exitinfo fields are read from the VMCB control
+ * area and dispatched on. 'vmexit' is filled in with the guest %rip, the
+ * instruction length (computed from nRIP when nrip_valid() reports it as
+ * usable, zero otherwise) and, for exits that are not completely handled
+ * here, an exit code together with its collateral.
+ *
+ * Returns non-zero if the exit was handled completely; in that case the
+ * guest %rip in the VMCB has been advanced by the instruction length.
+ * Returns 0 if the exit needs further processing by the caller. An exit that
+ * no case claimed (exitcode still VM_EXITCODE_BOGUS) is reported as a
+ * generic VM_EXITCODE_SVM exit.
+ */
+static int
+svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
+{
+	struct vmcb *vmcb;
+	struct vmcb_state *state;
+	struct vmcb_ctrl *ctrl;
+	struct svm_regctx *ctx;
+	struct vm_exception exception;
+	uint64_t code, info1, info2, val;
+	uint32_t eax, ecx, edx;
+	int error, errcode_valid, handled, idtvec, reflect;
+	bool retu;
+
+	ctx = svm_get_guest_regctx(svm_sc, vcpu);
+	vmcb = svm_get_vmcb(svm_sc, vcpu);
+	state = &vmcb->state;
+	ctrl = &vmcb->ctrl;
+
+	handled = 0;
+	code = ctrl->exitcode;
+	info1 = ctrl->exitinfo1;
+	info2 = ctrl->exitinfo2;
+
+	/*
+	 * VM_EXITCODE_BOGUS acts as a "not claimed yet" marker that is
+	 * checked at the bottom of this function.
+	 */
+	vmexit->exitcode = VM_EXITCODE_BOGUS;
+	vmexit->rip = state->rip;
+	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
+
+	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
+
+	/*
+	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
+	 * in an inconsistent state and can trigger assertions that would
+	 * never happen otherwise.
+	 */
+	if (code == VMCB_EXIT_INVALID) {
+		vm_exit_svm(vmexit, code, info1, info2);
+		return (0);
+	}
+
+	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
+	    "injection valid bit is set %#lx", __func__, ctrl->eventinj));
+
+	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
+	    ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)",
+	    vmexit->inst_length, code, info1, info2));
+
+	/*
+	 * Fold the virtual interrupt state and any event that was being
+	 * delivered at the time of the exit back into the software state
+	 * before looking at the exit code.
+	 */
+	svm_update_virqinfo(svm_sc, vcpu);
+	svm_save_intinfo(svm_sc, vcpu);
+
+	switch (code) {
+	case VMCB_EXIT_IRET:
+		/*
+		 * Restart execution at "iret" but with the intercept cleared.
+		 */
+		vmexit->inst_length = 0;
+		clear_nmi_blocking(svm_sc, vcpu);
+		handled = 1;
+		break;
+	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
+		handled = 1;
+		break;
+	case VMCB_EXIT_INTR:	/* external interrupt */
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
+		handled = 1;
+		break;
+	case VMCB_EXIT_NMI:	/* external NMI */
+		handled = 1;
+		break;
+	case 0x40 ... 0x5F:
+		/* Exception intercepts: the IDT vector is 'code - 0x40'. */
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
+		reflect = 1;
+		idtvec = code - 0x40;
+		switch (idtvec) {
+		case IDT_MC:
+			/*
+			 * Call the machine check handler by hand. Also don't
+			 * reflect the machine check back into the guest.
+			 *
+			 * 'errcode_valid' is left unset on this path; it is
+			 * only read below when 'reflect' is non-zero.
+			 */
+			reflect = 0;
+			VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler");
+			__asm __volatile("int $18");
+			break;
+		case IDT_PF:
+			/* EXITINFO2 is loaded into the guest %cr2. */
+			error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
+			    info2);
+			KASSERT(error == 0, ("%s: error %d updating cr2",
+			    __func__, error));
+			/* fallthru */
+		case IDT_NP:
+		case IDT_SS:
+		case IDT_GP:
+		case IDT_AC:
+		case IDT_TS:
+			errcode_valid = 1;
+			break;
+
+		case IDT_DF:
+			/* Reflected with an error code that is forced to 0. */
+			errcode_valid = 1;
+			info1 = 0;
+			break;
+
+		case IDT_BP:
+		case IDT_OF:
+		case IDT_BR:
+			/*
+			 * The 'nrip' field is populated for INT3, INTO and
+			 * BOUND exceptions and this also implies that
+			 * 'inst_length' is non-zero.
+			 *
+			 * Reset 'inst_length' to zero so the guest %rip at
+			 * event injection is identical to what it was when
+			 * the exception originally happened.
+			 */
+			VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d "
+			    "to zero before injecting exception %d",
+			    vmexit->inst_length, idtvec);
+			vmexit->inst_length = 0;
+			/* fallthru */
+		default:
+			errcode_valid = 0;
+			break;
+		}
+		KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) "
+		    "when reflecting exception %d into guest",
+		    vmexit->inst_length, idtvec));
+
+		if (reflect) {
+			/* Reflect the exception back into the guest */
+			exception.vector = idtvec;
+			exception.error_code_valid = errcode_valid;
+			exception.error_code = errcode_valid ? info1 : 0;
+			VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception "
+			    "%d/%#x into the guest", exception.vector,
+			    exception.error_code);
+			error = vm_inject_exception(svm_sc->vm, vcpu,
+			    &exception);
+			KASSERT(error == 0, ("%s: vm_inject_exception error %d",
+			    __func__, error));
+		}
+		handled = 1;
+		break;
+	case VMCB_EXIT_MSR:	/* MSR access. */
+		/*
+		 * %ecx holds the MSR number and %edx:%eax the value for a
+		 * write. A non-zero EXITINFO1 selects the wrmsr path.
+		 */
+		eax = state->rax;
+		ecx = ctx->sctx_rcx;
+		edx = ctx->sctx_rdx;
+		retu = false;	
+
+		if (info1) {
+			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
+			val = (uint64_t)edx << 32 | eax;
+			VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx",
+			    ecx, val);
+			if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) {
+				vmexit->exitcode = VM_EXITCODE_WRMSR;
+				vmexit->u.msr.code = ecx;
+				vmexit->u.msr.wval = val;
+			} else if (!retu) {
+				handled = 1;
+			} else {
+				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
+				    ("emulate_wrmsr retu with bogus exitcode"));
+			}
+		} else {
+			VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx);
+			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
+			if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) {
+				vmexit->exitcode = VM_EXITCODE_RDMSR;
+				vmexit->u.msr.code = ecx;
+			} else if (!retu) {
+				handled = 1;
+			} else {
+				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
+				    ("emulate_rdmsr retu with bogus exitcode"));
+			}
+		}
+		break;
+	case VMCB_EXIT_IO:
+		handled = svm_handle_io(svm_sc, vcpu, vmexit);
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
+		break;
+	case VMCB_EXIT_CPUID:
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
+		handled = x86_emulate_cpuid(svm_sc->vm, vcpu,
+		    (uint32_t *)&state->rax,
+		    (uint32_t *)&ctx->sctx_rbx,
+		    (uint32_t *)&ctx->sctx_rcx,
+		    (uint32_t *)&ctx->sctx_rdx);
+		break;
+	case VMCB_EXIT_HLT:
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
+		vmexit->exitcode = VM_EXITCODE_HLT;
+		vmexit->u.hlt.rflags = state->rflags;
+		break;
+	case VMCB_EXIT_PAUSE:
+		vmexit->exitcode = VM_EXITCODE_PAUSE;
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
+		break;
+	case VMCB_EXIT_NPF:
+		/*
+		 * EXITINFO2 contains the faulting guest physical address.
+		 *
+		 * If none of the branches below claims the fault the exit
+		 * code stays VM_EXITCODE_BOGUS and the fault is reported as a
+		 * generic SVM exit at the bottom of this function.
+		 */
+		if (info1 & VMCB_NPF_INFO1_RSV) {
+			VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
+			    "reserved bits set: info1(%#lx) info2(%#lx)",
+			    info1, info2);
+		} else if (vm_mem_allocated(svm_sc->vm, info2)) {
+			vmexit->exitcode = VM_EXITCODE_PAGING;
+			vmexit->u.paging.gpa = info2;
+			vmexit->u.paging.fault_type = npf_fault_type(info1);
+			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
+			VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
+			    "on gpa %#lx/%#lx at rip %#lx",
+			    info2, info1, state->rip);
+		} else if (svm_npf_emul_fault(info1)) {
+			svm_handle_inst_emul(vmcb, info2, vmexit);
+			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1);
+			VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault "
+			    "for gpa %#lx/%#lx at rip %#lx",
+			    info2, info1, state->rip);
+		}
+		break;
+	case VMCB_EXIT_MONITOR:
+		vmexit->exitcode = VM_EXITCODE_MONITOR;
+		break;
+	case VMCB_EXIT_MWAIT:
+		vmexit->exitcode = VM_EXITCODE_MWAIT;
+		break;
+	default:
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
+		break;
+	}	
+
+	VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d",
+	    handled ? "handled" : "unhandled", exit_reason_to_str(code),
+	    vmexit->rip, vmexit->inst_length);
+
+	if (handled) {
+		/*
+		 * Step over the instruction that caused the exit and write
+		 * the resulting %rip back to the VMCB.
+		 */
+		vmexit->rip += vmexit->inst_length;
+		vmexit->inst_length = 0;
+		state->rip = vmexit->rip;
+	} else {
+		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
+			/*
+			 * If this VM exit was not claimed by anybody then
+			 * treat it as a generic SVM exit.
+			 */
+			vm_exit_svm(vmexit, code, info1, info2);
+		} else {
+			/*
+			 * The exitcode and collateral have been populated.
+			 * The VM exit will be processed further in userland.
+			 */
+		}
+	}
+	return (handled);
+}
+
+/*
+ * Inject the entry intinfo, if vm_entry_intinfo() reports one for 'vcpu',
+ * through the VMCB event injection mechanism.
+ */
+static void
+svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu)
+{
+	uint64_t intinfo;
+	uint32_t errcode;
+	int type, vector;
+	bool errcode_valid;
+
+	if (vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo) == 0)
+		return;
+
+	KASSERT(VMCB_EXITINTINFO_VALID(intinfo),
+	    ("%s: entry intinfo is not valid: %#lx", __func__, intinfo));
+
+	/* Break the intinfo word into the pieces svm_eventinject() wants. */
+	type = VMCB_EXITINTINFO_TYPE(intinfo);
+	vector = VMCB_EXITINTINFO_VECTOR(intinfo);
+	errcode = VMCB_EXITINTINFO_EC(intinfo);
+	errcode_valid = VMCB_EXITINTINFO_EC_VALID(intinfo);
+
+	svm_eventinject(svm_sc, vcpu, type, vector, errcode, errcode_valid);
+	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
+	VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo);
+}
+
+/*
+ * Inject pending events into the virtual cpu before the next VMRUN.
+ *
+ * Events are considered in this order:
+ *   1. the entry intinfo (svm_inj_intinfo()),
+ *   2. a pending NMI,
+ *   3. a pending ExtINT from the legacy PIC, or otherwise a pending vector
+ *      from the local APIC.
+ *
+ * NMIs and PIC interrupts are delivered with the EVENTINJ mechanism while
+ * local APIC vectors are delivered through V_IRQ. When an event cannot be
+ * injected right away, interrupt window exiting is requested so that the
+ * vcpu traps back into the hypervisor once it can accept an interrupt.
+ *
+ * V_TPR in the VMCB is refreshed from the vlapic on every call.
+ */
+static void
+svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic)
+{
+	struct vmcb_ctrl *ctrl;
+	struct vmcb_state *state;
+	uint8_t v_tpr;
+	int vector, need_intr_window, pending_apic_vector;
+
+	state = svm_get_vmcb_state(sc, vcpu);
+	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
+
+	need_intr_window = 0;
+	pending_apic_vector = 0;
+
+	/*
+	 * Inject pending events or exceptions for this vcpu.
+	 *
+	 * An event might be pending because the previous #VMEXIT happened
+	 * during event delivery (i.e. ctrl->exitintinfo).
+	 *
+	 * An event might also be pending because an exception was injected
+	 * by the hypervisor (e.g. #PF during instruction emulation).
+	 */
+	svm_inj_intinfo(sc, vcpu);
+
+	/* NMI event has priority over interrupts. */
+	if (vm_nmi_pending(sc->vm, vcpu)) {
+		if (nmi_blocked(sc, vcpu)) {
+			/*
+			 * Can't inject another NMI if the guest has not
+			 * yet executed an "iret" after the last NMI.
+			 */
+			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due "
+			    "to NMI-blocking");
+		} else if (ctrl->intr_shadow) {
+			/*
+			 * Can't inject an NMI if the vcpu is in an intr_shadow.
+			 */
+			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to "
+			    "interrupt shadow");
+			need_intr_window = 1;
+			goto done;
+		} else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
+			/*
+			 * If there is already an exception/interrupt pending
+			 * then defer the NMI until after that.
+			 */
+			VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to "
+			    "eventinj %#lx", ctrl->eventinj);
+
+			/*
+			 * Use self-IPI to trigger a VM-exit as soon as
+			 * possible after the event injection is completed.
+			 *
+			 * This works only if the external interrupt exiting
+			 * is at a lower priority than the event injection.
+			 *
+			 * Although not explicitly specified in APMv2 the
+			 * relative priorities were verified empirically.
+			 */
+			ipi_cpu(curcpu, IPI_AST);	/* XXX vmm_ipinum? */
+		} else {
+			vm_nmi_clear(sc->vm, vcpu);
+
+			/* Inject NMI, vector number is not used */
+			svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI,
+			    IDT_NMI, 0, false);
+
+			/* virtual NMI blocking is now in effect */
+			enable_nmi_blocking(sc, vcpu);
+
+			VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI");
+		}
+	}
+
+	if (!vm_extint_pending(sc->vm, vcpu)) {
+		/*
+		 * APIC interrupts are delivered using the V_IRQ offload.
+		 *
+		 * The primary benefit is that the hypervisor doesn't need to
+		 * deal with the various conditions that inhibit interrupts.
+		 * It also means that TPR changes via CR8 will be handled
+		 * without any hypervisor involvement.
+		 *
+		 * Note that the APIC vector must remain pending in the vIRR
+		 * until it is confirmed that it was delivered to the guest.
+		 * This can be confirmed based on the value of V_IRQ at the
+		 * next #VMEXIT (1 = pending, 0 = delivered).
+		 *
+		 * Also note that it is possible that another higher priority
+		 * vector can become pending before this vector is delivered
+		 * to the guest. This is alright because vcpu_notify_event()
+		 * will send an IPI and force the vcpu to trap back into the
+		 * hypervisor. The higher priority vector will be injected on
+		 * the next VMRUN.
+		 */
+		if (vlapic_pending_intr(vlapic, &vector)) {
+			KASSERT(vector >= 16 && vector <= 255,
+			    ("invalid vector %d from local APIC", vector));
+			pending_apic_vector = vector;
+		}
+		goto done;
+	}
+
+	/* Ask the legacy pic for a vector to inject */
+	vatpic_pending_intr(sc->vm, &vector);
+	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR",
+	    vector));
+
+	/*
+	 * If the guest has disabled interrupts or is in an interrupt shadow
+	 * then we cannot inject the pending interrupt.
+	 */
+	if ((state->rflags & PSL_I) == 0) {
+		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
+		    "rflags %#lx", vector, state->rflags);
+		need_intr_window = 1;
+		goto done;
+	}
+
+	if (ctrl->intr_shadow) {
+		VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to "
+		    "interrupt shadow", vector);
+		need_intr_window = 1;
+		goto done;
+	}
+
+	/* Only one event can be injected through EVENTINJ per VMRUN. */
+	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
+		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
+		    "eventinj %#lx", vector, ctrl->eventinj);
+		need_intr_window = 1;
+		goto done;
+	}
+
+	/*
+	 * Legacy PIC interrupts are delivered via the event injection
+	 * mechanism.
+	 */
+	svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false);
+
+	vm_extint_clear(sc->vm, vcpu);
+	vatpic_intr_accepted(sc->vm, vector);
+
+	/*
+	 * Force a VM-exit as soon as the vcpu is ready to accept another
+	 * interrupt. This is done because the PIC might have another vector
+	 * that it wants to inject. Also, if the APIC has a pending interrupt
+	 * that was preempted by the ExtInt then it allows us to inject the
+	 * APIC vector as soon as possible.
+	 */
+	need_intr_window = 1;
+done:
+	/*
+	 * The guest can modify the TPR by writing to %CR8. In guest mode
+	 * the processor reflects this write to V_TPR without hypervisor
+	 * intervention.
+	 *
+	 * The guest can also modify the TPR by writing to it via the memory
+	 * mapped APIC page. In this case, the write will be emulated by the
+	 * hypervisor. For this reason V_TPR must be updated before every
+	 * VMRUN.
+	 */
+	v_tpr = vlapic_get_cr8(vlapic);
+	/* 'v_tpr' is unsigned so only the upper bound is a real check. */
+	KASSERT(v_tpr >= 0 && v_tpr <= 15, ("invalid v_tpr %#x", v_tpr));
+	if (ctrl->v_tpr != v_tpr) {
+		VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x",
+		    ctrl->v_tpr, v_tpr);
+		ctrl->v_tpr = v_tpr;
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
+	}
+
+	if (pending_apic_vector) {
+		/*
+		 * If an APIC vector is being injected then interrupt window
+		 * exiting is not possible on this VMRUN.
+		 */
+		KASSERT(!need_intr_window, ("intr_window exiting impossible"));
+		VCPU_CTR1(sc->vm, vcpu, "Injecting vector %d using V_IRQ",
+		    pending_apic_vector);
+
+		/* The priority is the upper four bits of the vector. */
+		ctrl->v_irq = 1;
+		ctrl->v_ign_tpr = 0;
+		ctrl->v_intr_vector = pending_apic_vector;
+		ctrl->v_intr_prio = pending_apic_vector >> 4;
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
+	} else if (need_intr_window) {
+		/*
+		 * We use V_IRQ in conjunction with the VINTR intercept to
+		 * trap into the hypervisor as soon as a virtual interrupt
+		 * can be delivered.
+		 *
+		 * Since injected events are not subject to intercept checks
+		 * we need to ensure that the V_IRQ is not actually going to
+		 * be delivered on VM entry. The KASSERT below enforces this.
+		 */
+		KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
+		    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow,
+		    ("Bogus intr_window_exiting: eventinj (%#lx), "
+		    "intr_shadow (%u), rflags (%#lx)",
+		    ctrl->eventinj, ctrl->intr_shadow, state->rflags));
+		enable_intr_window_exiting(sc, vcpu);
+	} else {
+		disable_intr_window_exiting(sc, vcpu);
+	}
+}
+
+/*
+ * Reload the host task register.
+ *
+ * The TSS descriptor was in use before the guest was launched and is
+ * therefore marked busy. 'ltr' only accepts a descriptor that is marked
+ * available, so the type is reset to "64-bit available TSS" first.
+ */
+static __inline void
+restore_host_tss(void)
+{
+	struct system_segment_descriptor *desc;
+
+	desc = PCPU_GET(tss);
+	desc->sd_type = SDT_SYSTSS;
+	ltr(GSEL(GPROC0_SEL, SEL_KPL));
+}
+
+/*
+ * Make sure that 'vcpuid' does not run with stale TLB entries on 'thiscpu'.
+ *
+ * The vcpu's ASID generation is compared with the generation of the host
+ * cpu and the cached nested pmap generation is compared with the current
+ * one. Depending on the outcome a new ASID is allocated and/or a TLB flush
+ * is requested through the 'tlb_ctrl' field of the VMCB. The pmap generation
+ * cached in the vcpu state is updated on every call.
+ *
+ * The nested pmap must already be marked active on 'thiscpu'.
+ */
+static void
+check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu)
+{
+	struct svm_vcpu *vcpustate;
+	struct vmcb_ctrl *ctrl;
+	long eptgen;
+	bool alloc_asid;
+
+	KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not "
+	    "active on cpu %u", __func__, thiscpu));
+
+	vcpustate = svm_get_vcpu(sc, vcpuid);
+	ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
+
+	/*
+	 * The TLB entries associated with the vcpu's ASID are not valid
+	 * if either of the following conditions is true:
+	 *
+	 * 1. The vcpu's ASID generation is different than the host cpu's
+	 *    ASID generation. This happens when the vcpu migrates to a new
+	 *    host cpu. It can also happen when the number of vcpus executing
+	 *    on a host cpu is greater than the number of ASIDs available.
+	 *
+	 * 2. The pmap generation number is different than the value cached in
+	 *    the 'vcpustate'. This happens when the host invalidates pages
+	 *    belonging to the guest.
+	 *
+	 *	asidgen		eptgen	      Action
+	 *	mismatch	mismatch
+	 *	   0		   0		(a)
+	 *	   0		   1		(b1) or (b2)
+	 *	   1		   0		(c)
+	 *	   1		   1		(d)
+	 *
+	 * (a) There is no mismatch in eptgen or ASID generation and therefore
+	 *     no further action is needed.
+	 *
+	 * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is
+	 *      retained and the TLB entries associated with this ASID
+	 *      are flushed by VMRUN.
+	 *
+	 * (b2) If the cpu does not support FlushByAsid then a new ASID is
+	 *      allocated.
+	 *
+	 * (c) A new ASID is allocated.
+	 *
+	 * (d) A new ASID is allocated.
+	 */
+
+	alloc_asid = false;
+	eptgen = pmap->pm_eptgen;
+	ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;
+
+	if (vcpustate->asid.gen != asid[thiscpu].gen) {
+		alloc_asid = true;	/* (c) and (d) */
+	} else if (vcpustate->eptgen != eptgen) {
+		if (flush_by_asid())
+			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;	/* (b1) */
+		else
+			alloc_asid = true;			/* (b2) */
+	} else {
+		/*
+		 * This is the common case (a).
+		 */
+		KASSERT(!alloc_asid, ("ASID allocation not necessary"));
+		KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
+		    ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl));
+	}
+
+	if (alloc_asid) {
+		/*
+		 * Take the next ASID of this host cpu. When the ASIDs run out
+		 * the numbering restarts at 1 in a new generation; generation
+		 * 0 is skipped when the generation counter wraps.
+		 */
+		if (++asid[thiscpu].num >= nasid) {
+			asid[thiscpu].num = 1;
+			if (++asid[thiscpu].gen == 0)
+				asid[thiscpu].gen = 1;
+			/*
+			 * If this cpu does not support "flush-by-asid"
+			 * then flush the entire TLB on a generation
+			 * bump. Subsequent ASID allocation in this
+			 * generation can be done without a TLB flush.
+			 */
+			if (!flush_by_asid())
+				ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
+		}
+		vcpustate->asid.gen = asid[thiscpu].gen;
+		vcpustate->asid.num = asid[thiscpu].num;
+
+		ctrl->asid = vcpustate->asid.num;
+		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
+		/*
+		 * If this cpu supports "flush-by-asid" then the TLB
+		 * was not flushed after the generation bump. The TLB
+		 * is flushed selectively after every new ASID allocation.
+		 */
+		if (flush_by_asid())
+			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
+	}
+	/* Remember the pmap generation this vcpu is now in sync with. */
+	vcpustate->eptgen = eptgen;
+
+	KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
+	KASSERT(ctrl->asid == vcpustate->asid.num,
+	    ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num));
+}
+
+/*
+ * Execute the "clgi" instruction (clear the global interrupt flag).
+ */
+static __inline void
+disable_gintr(void)
+{
+
+	__asm __volatile("clgi");
+}
+
+/*
+ * Execute the "stgi" instruction (set the global interrupt flag).
+ */
+static __inline void
+enable_gintr(void)
+{
+
+	__asm __volatile("stgi");
+}
+
+/*
+ * Run 'vcpu' starting at guest address 'rip'.
+ *
+ * The vcpu is launched repeatedly for as long as svm_vmexit() reports that
+ * the #VMEXIT was handled completely. The loop is left when an exit needs
+ * further processing by the caller, or before launching the guest when the
+ * vcpu is suspended, a rendezvous is pending, or the current thread has an
+ * AST or reschedule request pending. The reason for leaving the loop is
+ * recorded in the vm_exit structure returned by vm_exitinfo().
+ *
+ * 'arg' is the svm_softc of the virtual machine and 'pmap' is the nested
+ * pmap that is activated around each launch. 'rend_cookie' and
+ * 'suspended_cookie' are only passed on to vcpu_rendezvous_pending() and
+ * vcpu_suspended().
+ *
+ * Always returns 0.
+ */
+static int
+svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, 
+	void *rend_cookie, void *suspended_cookie)
+{
+	struct svm_regctx *gctx;
+	struct svm_softc *svm_sc;
+	struct svm_vcpu *vcpustate;
+	struct vmcb_state *state;
+	struct vmcb_ctrl *ctrl;
+	struct vm_exit *vmexit;
+	struct vlapic *vlapic;
+	struct vm *vm;
+	uint64_t vmcb_pa;
+	u_int thiscpu;
+	int handled;
+
+	svm_sc = arg;
+	vm = svm_sc->vm;
+
+	vcpustate = svm_get_vcpu(svm_sc, vcpu);
+	state = svm_get_vmcb_state(svm_sc, vcpu);
+	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
+	vmexit = vm_exitinfo(vm, vcpu);
+	vlapic = vm_lapic(vm, vcpu);
+
+	/*
+	 * Stash 'curcpu' on the stack as 'thiscpu'.
+	 *
+	 * The per-cpu data area is not accessible until MSR_GSBASE is restored
+	 * after the #VMEXIT. Since VMRUN is executed inside a critical section
+	 * 'curcpu' and 'thiscpu' are guaranteed to be identical.
+	 */
+	thiscpu = curcpu;
+
+	gctx = svm_get_guest_regctx(svm_sc, vcpu);
+	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
+
+	if (vcpustate->lastcpu != thiscpu) {
+		/*
+		 * Force new ASID allocation by invalidating the generation.
+		 */
+		vcpustate->asid.gen = 0;
+
+		/*
+		 * Invalidate the VMCB state cache by marking all fields dirty.
+		 */
+		svm_set_dirty(svm_sc, vcpu, 0xffffffff);
+
+		/*
+		 * XXX
+		 * Setting 'vcpustate->lastcpu' here is a bit premature because
+		 * we may return from this function without actually executing
+		 * the VMRUN instruction. This could happen if a rendezvous
+		 * or an AST is pending on the first time through the loop.
+		 *
+		 * This works for now but any new side-effects of vcpu
+		 * migration should take this case into account.
+		 */
+		vcpustate->lastcpu = thiscpu;
+		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
+	}
+
+	svm_msr_guest_enter(svm_sc, vcpu);
+
+	/* Update Guest RIP */
+	state->rip = rip;
+
+	do {
+		/*
+		 * Disable global interrupts to guarantee atomicity during
+		 * loading of guest state. This includes not only the state
+		 * loaded by the "vmrun" instruction but also software state
+		 * maintained by the hypervisor: suspended and rendezvous
+		 * state, NPT generation number, vlapic interrupts etc.
+		 *
+		 * Every early exit from the loop below re-enables global
+		 * interrupts before recording the exit reason.
+		 */
+		disable_gintr();
+
+		if (vcpu_suspended(suspended_cookie)) {
+			enable_gintr();
+			vm_exit_suspended(vm, vcpu, state->rip);
+			break;
+		}
+
+		if (vcpu_rendezvous_pending(rend_cookie)) {
+			enable_gintr();
+			vm_exit_rendezvous(vm, vcpu, state->rip);
+			break;
+		}
+
+		/* The scheduler is asking this thread to give up the cpu. */
+		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
+			enable_gintr();
+			vm_exit_astpending(vm, vcpu, state->rip);
+			break;
+		}
+
+		svm_inj_interrupts(svm_sc, vcpu, vlapic);
+
+		/* Activate the nested pmap on 'thiscpu' */
+		CPU_SET_ATOMIC_ACQ(thiscpu, &pmap->pm_active);
+
+		/*
+		 * Check the pmap generation and the ASID generation to
+		 * ensure that the vcpu does not use stale TLB mappings.
+		 */
+		check_asid(svm_sc, vcpu, pmap, thiscpu);
+
+		/*
+		 * Tell the processor which parts of the VMCB are unchanged
+		 * since the last VMRUN: everything in 'vmcb_clean' except the
+		 * fields that were marked dirty in the meantime.
+		 */
+		ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
+		vcpustate->dirty = 0;
+		VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean);
+
+		/* Launch Virtual Machine. */
+		VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip);
+		svm_launch(vmcb_pa, gctx);
+
+		CPU_CLR_ATOMIC(thiscpu, &pmap->pm_active);
+
+		/*
+		 * Restore MSR_GSBASE to point to the pcpu data area.
+		 *
+		 * Note that accesses done via PCPU_GET/PCPU_SET will work
+		 * only after MSR_GSBASE is restored.
+		 *
+		 * Also note that we don't bother restoring MSR_KGSBASE
+		 * since it is not used in the kernel and will be restored
+		 * when the VMRUN ioctl returns to userspace.
+		 */
+		wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[thiscpu]);
+		KASSERT(curcpu == thiscpu, ("thiscpu/curcpu (%u/%u) mismatch",
+		    thiscpu, curcpu));
+
+		/*
+		 * The host GDTR and IDTR are saved by VMRUN and restored
+		 * automatically on #VMEXIT. However, the host TSS needs
+		 * to be restored explicitly.
+		 */
+		restore_host_tss();
+
+		/* #VMEXIT disables interrupts so re-enable them here. */
+		enable_gintr();
+
+		/* Handle #VMEXIT and if required return to user space. */
+		handled = svm_vmexit(svm_sc, vcpu, vmexit);
+	} while (handled);
+
+	svm_msr_guest_exit(svm_sc, vcpu);
+
+	return (0);
+}
+
+/*
+ * Release the SVM softc that is handed back to us as 'arg'.
+ */
+static void
+svm_vmcleanup(void *arg)
+{
+	struct svm_softc *svm_sc;
+
+	svm_sc = arg;
+	free(svm_sc, M_SVM);
+}
+
+/*
+ * Translate a VM_REG_GUEST_* identifier into a pointer to the matching slot
+ * of the software-saved register context.
+ *
+ * Returns NULL when 'reg' is not one of the registers kept in
+ * 'struct svm_regctx'.
+ */
+static register_t *
+swctx_regptr(struct svm_regctx *regctx, int reg)
+{
+	register_t *slot;
+
+	switch (reg) {
+	case VM_REG_GUEST_RBX:
+		slot = &regctx->sctx_rbx;
+		break;
+	case VM_REG_GUEST_RCX:
+		slot = &regctx->sctx_rcx;
+		break;
+	case VM_REG_GUEST_RDX:
+		slot = &regctx->sctx_rdx;
+		break;
+	case VM_REG_GUEST_RDI:
+		slot = &regctx->sctx_rdi;
+		break;
+	case VM_REG_GUEST_RSI:
+		slot = &regctx->sctx_rsi;
+		break;
+	case VM_REG_GUEST_RBP:
+		slot = &regctx->sctx_rbp;
+		break;
+	case VM_REG_GUEST_R8:
+		slot = &regctx->sctx_r8;
+		break;
+	case VM_REG_GUEST_R9:
+		slot = &regctx->sctx_r9;
+		break;
+	case VM_REG_GUEST_R10:
+		slot = &regctx->sctx_r10;
+		break;
+	case VM_REG_GUEST_R11:
+		slot = &regctx->sctx_r11;
+		break;
+	case VM_REG_GUEST_R12:
+		slot = &regctx->sctx_r12;
+		break;
+	case VM_REG_GUEST_R13:
+		slot = &regctx->sctx_r13;
+		break;
+	case VM_REG_GUEST_R14:
+		slot = &regctx->sctx_r14;
+		break;
+	case VM_REG_GUEST_R15:
+		slot = &regctx->sctx_r15;
+		break;
+	default:
+		slot = NULL;
+		break;
+	}
+
+	return (slot);
+}
+
+/*
+ * Read guest register 'ident' of 'vcpu' into '*val'.
+ *
+ * The interrupt shadow is dispatched to its own helper. Every other
+ * register is looked up in the VMCB first and then in the software-saved
+ * register context. EINVAL is returned if neither lookup knows 'ident'.
+ */
+static int
+svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
+{
+	struct svm_softc *svm_sc = arg;
+	register_t *slot;
+
+	if (ident == VM_REG_GUEST_INTR_SHADOW)
+		return (svm_get_intr_shadow(svm_sc, vcpu, val));
+
+	if (vmcb_read(svm_sc, vcpu, ident, val) == 0)
+		return (0);
+
+	slot = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
+	if (slot == NULL) {
+		VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x",
+		    ident);
+		return (EINVAL);
+	}
+
+	*val = *slot;
+	return (0);
+}
+
+/*
+ * Write 'val' into guest register 'ident' of 'vcpu'.
+ *
+ * The interrupt shadow is dispatched to its own helper. Every other
+ * register is tried against the VMCB first and then against the
+ * software-saved register context. EINVAL is returned if neither knows
+ * 'ident'.
+ */
+static int
+svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
+{
+	struct svm_softc *svm_sc = arg;
+	register_t *slot;
+
+	if (ident == VM_REG_GUEST_INTR_SHADOW)
+		return (svm_modify_intr_shadow(svm_sc, vcpu, val));
+
+	if (vmcb_write(svm_sc, vcpu, ident, val) == 0)
+		return (0);
+
+	slot = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
+	if (slot == NULL) {
+		/*
+		 * XXX deal with CR3 and invalidate TLB entries tagged with
+		 * the vcpu's ASID. This needs to be treated differently
+		 * depending on whether 'running' is true/false.
+		 */
+		VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x",
+		    ident);
+		return (EINVAL);
+	}
+
+	*slot = val;
+	return (0);
+}
+
+/*
+ * Enable or disable capability 'type' on 'vcpu' according to 'val'.
+ *
+ * Returns 0 on success, EINVAL for a request that cannot be honoured and
+ * ENOENT for a capability that is not handled here.
+ */
+static int
+svm_setcap(void *arg, int vcpu, int type, int val)
+{
+	struct svm_softc *svm_sc = arg;
+
+	switch (type) {
+	case VM_CAP_HALT_EXIT:
+		svm_set_intercept(svm_sc, vcpu, VMCB_CTRL1_INTCPT,
+		    VMCB_INTCPT_HLT, val);
+		return (0);
+	case VM_CAP_PAUSE_EXIT:
+		svm_set_intercept(svm_sc, vcpu, VMCB_CTRL1_INTCPT,
+		    VMCB_INTCPT_PAUSE, val);
+		return (0);
+	case VM_CAP_UNRESTRICTED_GUEST:
+		/* Unrestricted guest execution cannot be disabled in SVM */
+		return (val == 0 ? EINVAL : 0);
+	default:
+		return (ENOENT);
+	}
+}
+
+/*
+ * Report through '*retval' whether capability 'type' is enabled on 'vcpu'.
+ *
+ * Returns 0 on success. For a capability that is not handled here ENOENT
+ * is returned and '*retval' is left untouched.
+ */
+static int
+svm_getcap(void *arg, int vcpu, int type, int *retval)
+{
+	struct svm_softc *svm_sc = arg;
+
+	switch (type) {
+	case VM_CAP_HALT_EXIT:
+		*retval = svm_get_intercept(svm_sc, vcpu, VMCB_CTRL1_INTCPT,
+		    VMCB_INTCPT_HLT);
+		return (0);
+	case VM_CAP_PAUSE_EXIT:
+		*retval = svm_get_intercept(svm_sc, vcpu, VMCB_CTRL1_INTCPT,
+		    VMCB_INTCPT_PAUSE);
+		return (0);
+	case VM_CAP_UNRESTRICTED_GUEST:
+		/* unrestricted guest is always enabled */
+		*retval = 1;
+		return (0);
+	default:
+		return (ENOENT);
+	}
+}
+
+/*
+ * Allocate and initialize the virtual local APIC for 'vcpuid'.
+ *
+ * The vlapic is zero-filled, tied to the VM that owns the softc and pointed
+ * at the vcpu's slot in the softc's 'apic_page' array before the generic
+ * vlapic_init() is run on it.
+ */
+static struct vlapic *
+svm_vlapic_init(void *arg, int vcpuid)
+{
+	struct svm_softc *sc = arg;
+	struct vlapic *vl;
+
+	vl = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO);
+
+	vl->vm = sc->vm;
+	vl->vcpuid = vcpuid;
+	vl->apic_page = (struct LAPIC *)&sc->apic_page[vcpuid];
+	vlapic_init(vl);
+
+	return (vl);
+}
+
+/*
+ * Tear down a vlapic created by svm_vlapic_init() and release its memory.
+ */
+static void
+svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
+{
+
+	vlapic_cleanup(vlapic);
+	free(vlapic, M_SVM_VLAPIC);
+}
+
+/*
+ * Table of the AMD/SVM entry points.
+ *
+ * The initializer is positional, so the order of the entries below has to
+ * match the member order of 'struct vmm_ops'.
+ */
+struct vmm_ops vmm_ops_amd = {
+	svm_init,
+	svm_cleanup,
+	svm_restore,
+	svm_vminit,
+	svm_vmrun,
+	svm_vmcleanup,
+	svm_getreg,
+	svm_setreg,
+	vmcb_getdesc,
+	vmcb_setdesc,
+	svm_getcap,
+	svm_setcap,
+	svm_npt_alloc,
+	svm_npt_free,
+	svm_vlapic_init,
+	svm_vlapic_cleanup	
+};
diff --git a/sys/amd64/vmm/amd/svm.h b/sys/amd64/vmm/amd/svm.h
new file mode 100644
index 0000000..86bd638
--- /dev/null
+++ b/sys/amd64/vmm/amd/svm.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SVM_H_
+#define _SVM_H_
+
+/*
+ * Guest register state that is saved outside the VMCB.
+ *
+ * svm_launch() loads these registers before VMRUN and stores them back
+ * afterwards. The assembly addresses the members through the SCTX_*
+ * offsets generated by svm_genassym.c, so the two must be kept in sync.
+ */
+struct svm_regctx {
+	register_t	sctx_rbp;
+	register_t	sctx_rbx;
+	register_t	sctx_rcx;
+	register_t	sctx_rdx;
+	register_t	sctx_rdi;
+	register_t	sctx_rsi;
+	register_t	sctx_r8;
+	register_t	sctx_r9;
+	register_t	sctx_r10;
+	register_t	sctx_r11;
+	register_t	sctx_r12;
+	register_t	sctx_r13;
+	register_t	sctx_r14;
+	register_t	sctx_r15;
+};
+
+/*
+ * Enter the guest: 'pa' is the physical address of the VMCB and 'gctx' is
+ * the software-saved guest register context.
+ */
+void svm_launch(uint64_t pa, struct svm_regctx *gctx);
+
+#endif /* _SVM_H_ */
diff --git a/sys/amd64/vmm/amd/svm_genassym.c b/sys/amd64/vmm/amd/svm_genassym.c
new file mode 100644
index 0000000..b7831eb
--- /dev/null
+++ b/sys/amd64/vmm/amd/svm_genassym.c
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/assym.h>
+
+#include "svm.h"
+
+/*
+ * Offsets of the 'struct svm_regctx' members, exported for use by the
+ * assembly code. Listed in the same order as the members are declared.
+ */
+ASSYM(SCTX_RBP, offsetof(struct svm_regctx, sctx_rbp));
+ASSYM(SCTX_RBX, offsetof(struct svm_regctx, sctx_rbx));
+ASSYM(SCTX_RCX, offsetof(struct svm_regctx, sctx_rcx));
+ASSYM(SCTX_RDX, offsetof(struct svm_regctx, sctx_rdx));
+ASSYM(SCTX_RDI, offsetof(struct svm_regctx, sctx_rdi));
+ASSYM(SCTX_RSI, offsetof(struct svm_regctx, sctx_rsi));
+ASSYM(SCTX_R8,  offsetof(struct svm_regctx, sctx_r8));
+ASSYM(SCTX_R9,  offsetof(struct svm_regctx, sctx_r9));
+ASSYM(SCTX_R10, offsetof(struct svm_regctx, sctx_r10));
+ASSYM(SCTX_R11, offsetof(struct svm_regctx, sctx_r11));
+ASSYM(SCTX_R12, offsetof(struct svm_regctx, sctx_r12));
+ASSYM(SCTX_R13, offsetof(struct svm_regctx, sctx_r13));
+ASSYM(SCTX_R14, offsetof(struct svm_regctx, sctx_r14));
+ASSYM(SCTX_R15, offsetof(struct svm_regctx, sctx_r15));
diff --git a/sys/amd64/vmm/amd/svm_msr.c b/sys/amd64/vmm/amd/svm_msr.c
new file mode 100644
index 0000000..100af4b
--- /dev/null
+++ b/sys/amd64/vmm/amd/svm_msr.c
@@ -0,0 +1,136 @@
+/*-
+ * Copyright (c) 2014, Neel Natu (neel@freebsd.org)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+
+#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
+
+#include "svm_msr.h"
+
+#ifndef MSR_AMDK8_IPM
+#define	MSR_AMDK8_IPM	0xc0010055
+#endif
+
+/*
+ * Indices into host_msrs[], one for each host MSR whose value is cached by
+ * svm_msr_init() and written back by svm_msr_guest_exit().
+ */
+enum {
+	IDX_MSR_LSTAR,
+	IDX_MSR_CSTAR,
+	IDX_MSR_STAR,
+	IDX_MSR_SF_MASK,
+	HOST_MSR_NUM		/* must be the last enumeration */
+};
+
+/* Host MSR values captured by svm_msr_init(). */
+static uint64_t host_msrs[HOST_MSR_NUM];
+
+/*
+ * Snapshot the host values of the system call MSRs into host_msrs[].
+ */
+void
+svm_msr_init(void)
+{
+	/*
+	 * These MSRs do not vary with curcpu, curproc or curthread, so a
+	 * single cached copy of each is sufficient.
+	 */
+	host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
+	host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
+	host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
+	host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
+}
+
+/*
+ * Per-vcpu MSR setup hook. Intentionally empty.
+ */
+void
+svm_msr_guest_init(struct svm_softc *sc, int vcpu)
+{
+	/*
+	 * Every MSR the guest can access is saved/restored either by the
+	 * hardware on each #VMEXIT/VMRUN (e.g., G_PAT) or by VMSAVE/VMLOAD
+	 * (e.g., MSR_GSBASE).
+	 *
+	 * No guest MSR is saved/restored "by hand", which leaves nothing
+	 * to initialize here.
+	 */
+}
+
+/*
+ * Hook that svm_vmrun() calls before it starts running the guest.
+ * Currently a no-op.
+ */
+void
+svm_msr_guest_enter(struct svm_softc *sc, int vcpu)
+{
+	/*
+	 * Save host MSRs (if any) and restore guest MSRs (if any).
+	 */
+}
+
+/*
+ * Hook that svm_vmrun() calls once it is done running the guest: put the
+ * host values cached by svm_msr_init() back into the system call MSRs.
+ */
+void
+svm_msr_guest_exit(struct svm_softc *sc, int vcpu)
+{
+	/*
+	 * Save guest MSRs (if any) and restore host MSRs.
+	 */
+	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
+	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
+	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
+	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
+
+	/* MSR_KGSBASE will be restored on the way back to userspace */
+}
+
+/*
+ * Emulate a guest read of MSR 'num'.
+ *
+ * Only MSR_AMDK8_IPM is recognized and it always reads as zero. Any other
+ * MSR yields EINVAL and leaves '*result' untouched. 'retu' is not used.
+ */
+int
+svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result,
+    bool *retu)
+{
+
+	if (num != MSR_AMDK8_IPM)
+		return (EINVAL);
+
+	*result = 0;
+	return (0);
+}
+
+/*
+ * Emulate a guest write of 'val' to MSR 'num'.
+ *
+ * Only MSR_AMDK8_IPM is recognized and the write is discarded. Any other
+ * MSR yields EINVAL. 'retu' is not used.
+ */
+int
+svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu)
+{
+
+	/*
+	 * Ignore writes to the "Interrupt Pending Message" MSR.
+	 */
+	if (num == MSR_AMDK8_IPM)
+		return (0);
+
+	return (EINVAL);
+}
diff --git a/sys/amd64/vmm/amd/svm_msr.h b/sys/amd64/vmm/amd/svm_msr.h
new file mode 100644
index 0000000..07716c8
--- /dev/null
+++ b/sys/amd64/vmm/amd/svm_msr.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2014 Neel Natu (neel@freebsd.org)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SVM_MSR_H_
+#define	_SVM_MSR_H_
+
+struct svm_softc;
+
+/* One-time caching of the host MSR values. */
+void svm_msr_init(void);
+
+/* Per-vcpu hooks: setup, before running the guest, after running it. */
+void svm_msr_guest_init(struct svm_softc *sc, int vcpu);
+void svm_msr_guest_enter(struct svm_softc *sc, int vcpu);
+void svm_msr_guest_exit(struct svm_softc *sc, int vcpu);
+
+/* Emulation of guest MSR accesses; both return 0 or an errno value. */
+int svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num,
+    uint64_t val, bool *retu);
+int svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num,
+    uint64_t *result, bool *retu);
+
+#endif	/* _SVM_MSR_H_ */
diff --git a/sys/amd64/vmm/amd/svm_softc.h b/sys/amd64/vmm/amd/svm_softc.h
new file mode 100644
index 0000000..a5bb57c
--- /dev/null
+++ b/sys/amd64/vmm/amd/svm_softc.h
@@ -0,0 +1,113 @@
+/*-
+ * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SVM_SOFTC_H_
+#define _SVM_SOFTC_H_
+
+#define SVM_IO_BITMAP_SIZE	(3 * PAGE_SIZE)
+#define SVM_MSR_BITMAP_SIZE	(2 * PAGE_SIZE)
+
+/*
+ * An ASID number paired with a generation number.
+ */
+struct asid {
+	uint64_t	gen;	/* range is [1, ~0UL] */
+	uint32_t	num;	/* range is [1, nasid - 1] */
+};
+
+/*
+ * Per-vcpu SVM state, page aligned.
+ *
+ * XXX separate out 'struct vmcb' from 'svm_vcpu' to avoid wasting space
+ * due to VMCB alignment requirements.
+ */
+struct svm_vcpu {
+	struct vmcb	vmcb;	 /* vcpu context saved by the hardware */
+	struct svm_regctx swctx; /* vcpu context saved by software */
+	uint64_t	vmcb_pa; /* physical address of 'vmcb' */
+	int		lastcpu; /* host cpu the vcpu most recently ran on */
+	uint32_t	dirty;	 /* state cache bits that must be cleared */
+	long		eptgen;	 /* pmap->pm_eptgen when the vcpu last ran */
+	struct asid	asid;	 /* ASID number and generation */
+} __aligned(PAGE_SIZE);
+
+/*
+ * SVM softc, one per virtual machine.
+ *
+ * The bitmaps, the APIC pages and the vcpu array come first. The CTASSERT
+ * below verifies that 'nptp', the first member after them, starts on a
+ * page boundary.
+ */
+struct svm_softc {
+	uint8_t iopm_bitmap[SVM_IO_BITMAP_SIZE];    /* shared by all vcpus */
+	uint8_t msr_bitmap[SVM_MSR_BITMAP_SIZE];    /* shared by all vcpus */
+	uint8_t apic_page[VM_MAXCPU][PAGE_SIZE];
+	struct svm_vcpu vcpu[VM_MAXCPU];
+	vm_offset_t 	nptp;			    /* nested page table */
+	struct vm	*vm;
+} __aligned(PAGE_SIZE);
+
+/* Compile-time check that 'nptp' sits at a page-aligned offset. */
+CTASSERT((offsetof(struct svm_softc, nptp) & PAGE_MASK) == 0);
+
+/* Return the per-vcpu SVM state of 'vcpu'. */
+static __inline struct svm_vcpu *
+svm_get_vcpu(struct svm_softc *sc, int vcpu)
+{
+
+	return (sc->vcpu + vcpu);
+}
+
+/* Return the VMCB of 'vcpu'. */
+static __inline struct vmcb *
+svm_get_vmcb(struct svm_softc *sc, int vcpu)
+{
+	struct svm_vcpu *vcpustate = &sc->vcpu[vcpu];
+
+	return (&vcpustate->vmcb);
+}
+
+/* Return the 'state' member of the VMCB of 'vcpu'. */
+static __inline struct vmcb_state *
+svm_get_vmcb_state(struct svm_softc *sc, int vcpu)
+{
+	struct vmcb *vmcb = &sc->vcpu[vcpu].vmcb;
+
+	return (&vmcb->state);
+}
+
+/* Return the 'ctrl' member of the VMCB of 'vcpu'. */
+static __inline struct vmcb_ctrl *
+svm_get_vmcb_ctrl(struct svm_softc *sc, int vcpu)
+{
+	struct vmcb *vmcb = &sc->vcpu[vcpu].vmcb;
+
+	return (&vmcb->ctrl);
+}
+
+/* Return the software-saved register context of 'vcpu'. */
+static __inline struct svm_regctx *
+svm_get_guest_regctx(struct svm_softc *sc, int vcpu)
+{
+	struct svm_vcpu *vcpustate = &sc->vcpu[vcpu];
+
+	return (&vcpustate->swctx);
+}
+
+/*
+ * Add 'dirtybits' to the set of dirty bits recorded for 'vcpu'.
+ */
+static __inline void
+svm_set_dirty(struct svm_softc *sc, int vcpu, uint32_t dirtybits)
+{
+
+	svm_get_vcpu(sc, vcpu)->dirty |= dirtybits;
+}
+
+#endif /* _SVM_SOFTC_H_ */
diff --git a/sys/amd64/vmm/amd/svm_support.S b/sys/amd64/vmm/amd/svm_support.S
new file mode 100644
index 0000000..72327bd
--- /dev/null
+++ b/sys/amd64/vmm/amd/svm_support.S
@@ -0,0 +1,115 @@
+/*-
+ * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <machine/asmacros.h>
+
+#include "svm_assym.h"
+
+/*
+ * Be friendly to DTrace FBT's prologue/epilogue pattern matching.
+ *
+ * They are also responsible for saving/restoring the host %rbp across VMRUN.
+ *
+ * VENTER pushes %rbp and copies %rsp into it; VLEAVE pops %rbp again.
+ */
+#define	VENTER  push %rbp ; mov %rsp,%rbp
+#define	VLEAVE  pop %rbp
+
+/*
+ * svm_launch(uint64_t vmcb, struct svm_regctx *gctx)
+ * %rdi: physical address of VMCB
+ * %rsi: pointer to guest context
+ *
+ * Loads the guest registers kept in 'gctx', runs the guest and stores the
+ * guest registers back into 'gctx' before returning to the caller.
+ */
+ENTRY(svm_launch)
+	VENTER
+
+	/*
+	 * Host register state saved across a VMRUN.
+	 *
+	 * All "callee saved registers" except:
+	 * %rsp: because it is preserved by the processor across VMRUN.
+	 * %rbp: because it is saved/restored by the function prologue/epilogue.
+	 */
+	push %rbx
+	push %r12
+	push %r13
+	push %r14
+	push %r15
+
+	/* Save the physical address of the VMCB in %rax */
+	movq %rdi, %rax
+
+	/*
+	 * Every general purpose register below is about to be overwritten
+	 * with guest values, so the context pointer is parked on the stack.
+	 */
+	push %rsi		/* push guest context pointer on the stack */
+
+	/*
+	 * Restore guest state.
+	 */
+	movq SCTX_R8(%rsi), %r8
+	movq SCTX_R9(%rsi), %r9
+	movq SCTX_R10(%rsi), %r10
+	movq SCTX_R11(%rsi), %r11
+	movq SCTX_R12(%rsi), %r12
+	movq SCTX_R13(%rsi), %r13
+	movq SCTX_R14(%rsi), %r14
+	movq SCTX_R15(%rsi), %r15
+	movq SCTX_RBP(%rsi), %rbp
+	movq SCTX_RBX(%rsi), %rbx
+	movq SCTX_RCX(%rsi), %rcx
+	movq SCTX_RDX(%rsi), %rdx
+	movq SCTX_RDI(%rsi), %rdi
+	movq SCTX_RSI(%rsi), %rsi	/* %rsi must be restored last */
+
+	/* All three instructions take the VMCB address from %rax. */
+	vmload %rax
+	vmrun %rax
+	vmsave %rax
+
+	/*
+	 * 'struct svm_regctx' has no slot for %rax, so %rax can be reused as
+	 * the base register for the stores below.
+	 */
+	pop %rax		/* pop guest context pointer from the stack */
+
+	/*
+	 * Save guest state.
+	 */
+	movq %r8, SCTX_R8(%rax)
+	movq %r9, SCTX_R9(%rax)
+	movq %r10, SCTX_R10(%rax)
+	movq %r11, SCTX_R11(%rax)
+	movq %r12, SCTX_R12(%rax)
+	movq %r13, SCTX_R13(%rax)
+	movq %r14, SCTX_R14(%rax)
+	movq %r15, SCTX_R15(%rax)
+	movq %rbp, SCTX_RBP(%rax)
+	movq %rbx, SCTX_RBX(%rax)
+	movq %rcx, SCTX_RCX(%rax)
+	movq %rdx, SCTX_RDX(%rax)
+	movq %rdi, SCTX_RDI(%rax)
+	movq %rsi, SCTX_RSI(%rax)
+
+	/* Restore host state */
+	pop %r15
+	pop %r14
+	pop %r13
+	pop %r12
+	pop %rbx
+
+	VLEAVE
+	ret
+END(svm_launch)
diff --git a/sys/amd64/vmm/amd/vmcb.c b/sys/amd64/vmm/amd/vmcb.c
new file mode 100644
index 0000000..fb4b2c8
--- /dev/null
+++ b/sys/amd64/vmm/amd/vmcb.c
@@ -0,0 +1,443 @@
+/*-
+ * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+#include <machine/vmm.h>
+
+#include "vmm_ktr.h"
+
+#include "vmcb.h"
+#include "svm.h"
+#include "svm_softc.h"
+
+/*
+ * The VMCB aka Virtual Machine Control Block is a 4KB aligned page
+ * in memory that describes the virtual machine.
+ *
+ * The VMCB contains:
+ * - instructions or events in the guest to intercept
+ * - control bits that modify execution environment of the guest
+ * - guest processor state (e.g. general purpose registers)
+ */
+
+/*
+ * Look up the segment area inside the VMCB state that corresponds to the
+ * VM_REG_GUEST_* identifier 'type'. Returns NULL if 'type' does not name a
+ * segment register or descriptor table register.
+ */
+static struct vmcb_segment *
+vmcb_segptr(struct vmcb *vmcb, int type)
+{
+	struct vmcb_state *state = &vmcb->state;
+
+	switch (type) {
+	case VM_REG_GUEST_CS:
+		return (&state->cs);
+	case VM_REG_GUEST_DS:
+		return (&state->ds);
+	case VM_REG_GUEST_ES:
+		return (&state->es);
+	case VM_REG_GUEST_FS:
+		return (&state->fs);
+	case VM_REG_GUEST_GS:
+		return (&state->gs);
+	case VM_REG_GUEST_SS:
+		return (&state->ss);
+	case VM_REG_GUEST_GDTR:
+		return (&state->gdt);
+	case VM_REG_GUEST_IDTR:
+		return (&state->idt);
+	case VM_REG_GUEST_LDTR:
+		return (&state->ldt);
+	case VM_REG_GUEST_TR:
+		return (&state->tr);
+	default:
+		return (NULL);
+	}
+}
+
+/*
+ * Raw access to a VMCB field whose offset and width are encoded in 'ident'
+ * and extracted with VMCB_ACCESS_OFFSET() and VMCB_ACCESS_BYTES().
+ *
+ * When 'write' is non-zero, 'bytes' bytes are copied from '*val' into the
+ * VMCB and every dirty bit of the vcpu is set. Otherwise '*val' is zeroed
+ * and its first 'bytes' bytes are filled in from the VMCB.
+ *
+ * Only widths of 2, 4 and 8 bytes are accepted. Returns 0 on success and
+ * EINVAL if the width is unsupported or if 'off + bytes' reaches the size
+ * of 'struct vmcb'.
+ */
+static int
+vmcb_access(struct svm_softc *softc, int vcpu, int write, int ident,
+	uint64_t *val)
+{
+	struct vmcb *vmcb;
+	int off, bytes;
+	char *ptr;
+
+	vmcb	= svm_get_vmcb(softc, vcpu);
+	off	= VMCB_ACCESS_OFFSET(ident);
+	bytes	= VMCB_ACCESS_BYTES(ident);
+
+	if ((off + bytes) >= sizeof (struct vmcb))
+		return (EINVAL);
+
+	ptr = (char *)vmcb;
+
+	/* Fields narrower than 8 bytes leave the rest of '*val' as zero. */
+	if (!write)
+		*val = 0;
+
+	switch (bytes) {
+	case 8:
+	case 4:
+	case 2:
+		if (write)
+			memcpy(ptr + off, val, bytes);
+		else
+			memcpy(val, ptr + off, bytes);
+		break;
+	default:
+		/*
+		 * VCPU_CTR1 is given exactly one argument, so the format
+		 * string must contain exactly one conversion.
+		 */
+		VCPU_CTR1(softc->vm, vcpu,
+		    "Invalid size %d for VMCB access", bytes);
+		return (EINVAL);
+	}
+
+	/* Invalidate all VMCB state cached by h/w. */
+	if (write)
+		svm_set_dirty(softc, vcpu, 0xffffffff);
+
+	return (0);
+}
+
+/*
+ * Read a control register, general purpose register or segment selector
+ * from the VMCB of 'vcpu' into '*retval'.
+ *
+ * Identifiers accepted by VMCB_ACCESS_OK() are forwarded to vmcb_access().
+ * Returns 0 on success and EINVAL for a register that is not held in the
+ * VMCB or that has no selector (GDTR and IDTR).
+ */
+int
+vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval)
+{
+	struct vmcb *vmcb;
+	struct vmcb_state *state;
+	struct vmcb_segment *seg;
+
+	if (VMCB_ACCESS_OK(ident))
+		return (vmcb_access(sc, vcpu, 0, ident, retval));
+
+	vmcb = svm_get_vmcb(sc, vcpu);
+	state = &vmcb->state;
+
+	switch (ident) {
+	case VM_REG_GUEST_CR0:
+		*retval = state->cr0;
+		return (0);
+	case VM_REG_GUEST_CR2:
+		*retval = state->cr2;
+		return (0);
+	case VM_REG_GUEST_CR3:
+		*retval = state->cr3;
+		return (0);
+	case VM_REG_GUEST_CR4:
+		*retval = state->cr4;
+		return (0);
+	case VM_REG_GUEST_DR7:
+		*retval = state->dr7;
+		return (0);
+	case VM_REG_GUEST_EFER:
+		*retval = state->efer;
+		return (0);
+	case VM_REG_GUEST_RAX:
+		*retval = state->rax;
+		return (0);
+	case VM_REG_GUEST_RFLAGS:
+		*retval = state->rflags;
+		return (0);
+	case VM_REG_GUEST_RIP:
+		*retval = state->rip;
+		return (0);
+	case VM_REG_GUEST_RSP:
+		*retval = state->rsp;
+		return (0);
+	case VM_REG_GUEST_CS:
+	case VM_REG_GUEST_DS:
+	case VM_REG_GUEST_ES:
+	case VM_REG_GUEST_FS:
+	case VM_REG_GUEST_GS:
+	case VM_REG_GUEST_SS:
+	case VM_REG_GUEST_LDTR:
+	case VM_REG_GUEST_TR:
+		seg = vmcb_segptr(vmcb, ident);
+		KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB",
+		    __func__, ident));
+		*retval = seg->selector;
+		return (0);
+	case VM_REG_GUEST_GDTR:
+	case VM_REG_GUEST_IDTR:
+		/* GDTR and IDTR don't have segment selectors */
+	default:
+		return (EINVAL);
+	}
+}
+
+/*
+ * Write 'val' to a control register, general purpose register or segment
+ * selector in the VMCB of 'vcpu'.
+ *
+ * Identifiers accepted by VMCB_ACCESS_OK() are forwarded to vmcb_access().
+ * For the remaining registers the matching VMCB_CACHE_* bits, if any, are
+ * marked dirty after the store. Returns 0 on success and EINVAL for a
+ * register that is not held in the VMCB or that has no selector (GDTR and
+ * IDTR).
+ */
+int
+vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val)
+{
+	struct vmcb *vmcb;
+	struct vmcb_state *state;
+	struct vmcb_segment *seg;
+	uint32_t dirtybits;
+
+	if (VMCB_ACCESS_OK(ident))
+		return (vmcb_access(sc, vcpu, 1, ident, &val));
+
+	vmcb = svm_get_vmcb(sc, vcpu);
+	state = &vmcb->state;
+	dirtybits = 0;
+
+	switch (ident) {
+	case VM_REG_GUEST_CR0:
+		state->cr0 = val;
+		dirtybits = VMCB_CACHE_CR;
+		break;
+	case VM_REG_GUEST_CR2:
+		state->cr2 = val;
+		dirtybits = VMCB_CACHE_CR2;
+		break;
+	case VM_REG_GUEST_CR3:
+		state->cr3 = val;
+		dirtybits = VMCB_CACHE_CR;
+		break;
+	case VM_REG_GUEST_CR4:
+		state->cr4 = val;
+		dirtybits = VMCB_CACHE_CR;
+		break;
+	case VM_REG_GUEST_DR7:
+		state->dr7 = val;
+		break;
+	case VM_REG_GUEST_EFER:
+		/* EFER_SVM must always be set when the guest is executing */
+		state->efer = val | EFER_SVM;
+		dirtybits = VMCB_CACHE_CR;
+		break;
+	case VM_REG_GUEST_RAX:
+		state->rax = val;
+		break;
+	case VM_REG_GUEST_RFLAGS:
+		state->rflags = val;
+		break;
+	case VM_REG_GUEST_RIP:
+		state->rip = val;
+		break;
+	case VM_REG_GUEST_RSP:
+		state->rsp = val;
+		break;
+	case VM_REG_GUEST_CS:
+	case VM_REG_GUEST_DS:
+	case VM_REG_GUEST_ES:
+	case VM_REG_GUEST_SS:
+		/* Only these four selectors dirty VMCB_CACHE_SEG. */
+		dirtybits = VMCB_CACHE_SEG;
+		/* FALLTHROUGH */
+	case VM_REG_GUEST_FS:
+	case VM_REG_GUEST_GS:
+	case VM_REG_GUEST_LDTR:
+	case VM_REG_GUEST_TR:
+		seg = vmcb_segptr(vmcb, ident);
+		KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB",
+		    __func__, ident));
+		seg->selector = val;
+		break;
+	case VM_REG_GUEST_GDTR:
+	case VM_REG_GUEST_IDTR:
+		/* GDTR and IDTR don't have segment selectors */
+	default:
+		return (EINVAL);
+	}
+
+	if (dirtybits != 0)
+		svm_set_dirty(sc, vcpu, dirtybits);
+
+	return (0);
+}
+
+/*
+ * Copy the VMCB segment area identified by 'ident' into '*seg2'.
+ * Returns 0 on success and EINVAL if 'ident' does not name a segment.
+ */
+int
+vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg2)
+{
+	struct vmcb_segment *src;
+
+	src = vmcb_segptr(vmcb, ident);
+	if (src == NULL)
+		return (EINVAL);
+
+	bcopy(src, seg2, sizeof(struct vmcb_segment));
+	return (0);
+}
+
+/*
+ * Load the descriptor 'desc' into the VMCB segment area for 'reg'.
+ *
+ * Base and limit are always copied. The attributes are converted and
+ * stored for everything except GDTR and IDTR. The matching VMCB_CACHE_*
+ * bit is marked dirty for CS/DS/ES/SS and for GDTR/IDTR. Always returns 0.
+ */
+int
+vmcb_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+	struct svm_softc *sc = arg;
+	struct vmcb_segment *seg;
+	uint16_t attrib;
+	int is_dtable;
+
+	seg = vmcb_segptr(svm_get_vmcb(sc, vcpu), reg);
+	KASSERT(seg != NULL, ("%s: invalid segment descriptor %d",
+	    __func__, reg));
+
+	is_dtable = (reg == VM_REG_GUEST_GDTR || reg == VM_REG_GUEST_IDTR);
+
+	seg->base = desc->base;
+	seg->limit = desc->limit;
+	if (!is_dtable) {
+		/*
+		 * Convert the seg_desc access bits into the VMCB attribute
+		 * layout.
+		 *
+		 * SVM marks a NULL segment by a clear 'P' bit in the segment
+		 * attributes, so drop that bit for an unusable segment.
+		 */
+		attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF);
+		if (SEG_DESC_UNUSABLE(desc->access))
+			attrib &= ~0x80;
+		seg->attrib = attrib;
+	}
+
+	VCPU_CTR4(sc->vm, vcpu, "Setting desc %d: base (%#lx), limit (%#x), "
+	    "attrib (%#x)", reg, seg->base, seg->limit, seg->attrib);
+
+	if (is_dtable) {
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
+	} else if (reg == VM_REG_GUEST_CS || reg == VM_REG_GUEST_DS ||
+	    reg == VM_REG_GUEST_ES || reg == VM_REG_GUEST_SS) {
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
+	}
+
+	return (0);
+}
+
+/*
+ * Fill in 'desc' from the VMCB segment area for 'reg'.
+ *
+ * Base and limit are always copied. 'desc->access' stays zero for GDTR and
+ * IDTR and is derived from the VMCB attributes for everything else.
+ * Always returns 0.
+ */
+int
+vmcb_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+	struct svm_softc *sc = arg;
+	struct vmcb_segment *seg;
+
+	seg = vmcb_segptr(svm_get_vmcb(sc, vcpu), reg);
+	KASSERT(seg != NULL, ("%s: invalid segment descriptor %d",
+	    __func__, reg));
+
+	desc->base = seg->base;
+	desc->limit = seg->limit;
+	desc->access = 0;
+
+	if (reg == VM_REG_GUEST_GDTR || reg == VM_REG_GUEST_IDTR)
+		return (0);
+
+	/* Convert the VMCB attribute layout into seg_desc access bits */
+	desc->access = ((seg->attrib & 0xF00) << 4) | (seg->attrib & 0xFF);
+
+	/*
+	 * The processor-independent code interprets 'desc->access' in the
+	 * VT-x format, where bit 16 flags a segment that was loaded with a
+	 * NULL selector (aka unusable).
+	 *
+	 * SVM conveys the same information through the 'P' bit, so translate
+	 * it into the VT-x format. For more details refer to section
+	 * "Segment State in the VMCB" in APMv2.
+	 */
+	if (reg != VM_REG_GUEST_CS && reg != VM_REG_GUEST_TR &&
+	    (desc->access & 0x80) == 0)
+		desc->access |= 0x10000;  /* Unusable segment */
+
+	return (0);
+}
diff --git a/sys/amd64/vmm/amd/vmcb.h b/sys/amd64/vmm/amd/vmcb.h
new file mode 100644
index 0000000..496f880
--- /dev/null
+++ b/sys/amd64/vmm/amd/vmcb.h
@@ -0,0 +1,334 @@
+/*-
+ * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMCB_H_
+#define	_VMCB_H_
+
+struct svm_softc;
+
+#define BIT(n)			(1ULL << (n))
+
+/*
+ * Secure Virtual Machine: AMD64 Programmer's Manual Vol2, Chapter 15
+ * Layout of VMCB: AMD64 Programmer's Manual Vol2, Appendix B
+ */
+
+/* vmcb_ctrl->intercept[] array indices */
+#define	VMCB_CR_INTCPT		0
+#define	VMCB_DR_INTCPT		1
+#define	VMCB_EXC_INTCPT		2
+#define	VMCB_CTRL1_INTCPT	3
+#define	VMCB_CTRL2_INTCPT	4
+
+/* intercept[VMCB_CTRL1_INTCPT] fields */
+#define	VMCB_INTCPT_INTR		BIT(0)
+#define	VMCB_INTCPT_NMI			BIT(1)
+#define	VMCB_INTCPT_SMI			BIT(2)
+#define	VMCB_INTCPT_INIT		BIT(3)
+#define	VMCB_INTCPT_VINTR		BIT(4)
+#define	VMCB_INTCPT_CR0_WRITE		BIT(5)
+#define	VMCB_INTCPT_IDTR_READ		BIT(6)
+#define	VMCB_INTCPT_GDTR_READ		BIT(7)
+#define	VMCB_INTCPT_LDTR_READ		BIT(8)
+#define	VMCB_INTCPT_TR_READ		BIT(9)
+#define	VMCB_INTCPT_IDTR_WRITE		BIT(10)
+#define	VMCB_INTCPT_GDTR_WRITE		BIT(11)
+#define	VMCB_INTCPT_LDTR_WRITE		BIT(12)
+#define	VMCB_INTCPT_TR_WRITE		BIT(13)
+#define	VMCB_INTCPT_RDTSC		BIT(14)
+#define	VMCB_INTCPT_RDPMC		BIT(15)
+#define	VMCB_INTCPT_PUSHF		BIT(16)
+#define	VMCB_INTCPT_POPF		BIT(17)
+#define	VMCB_INTCPT_CPUID		BIT(18)
+#define	VMCB_INTCPT_RSM			BIT(19)
+#define	VMCB_INTCPT_IRET		BIT(20)
+#define	VMCB_INTCPT_INTn		BIT(21)
+#define	VMCB_INTCPT_INVD		BIT(22)
+#define	VMCB_INTCPT_PAUSE		BIT(23)
+#define	VMCB_INTCPT_HLT			BIT(24)
+#define	VMCB_INTCPT_INVPG		BIT(25)
+#define	VMCB_INTCPT_INVPGA		BIT(26)
+#define	VMCB_INTCPT_IO			BIT(27)
+#define	VMCB_INTCPT_MSR			BIT(28)
+#define	VMCB_INTCPT_TASK_SWITCH		BIT(29)
+#define	VMCB_INTCPT_FERR_FREEZE		BIT(30)
+#define	VMCB_INTCPT_SHUTDOWN		BIT(31)
+
+/* intercept[VMCB_CTRL2_INTCPT] fields */
+#define	VMCB_INTCPT_VMRUN		BIT(0)
+#define	VMCB_INTCPT_VMMCALL		BIT(1)
+#define	VMCB_INTCPT_VMLOAD		BIT(2)
+#define	VMCB_INTCPT_VMSAVE		BIT(3)
+#define	VMCB_INTCPT_STGI		BIT(4)
+#define	VMCB_INTCPT_CLGI		BIT(5)
+#define	VMCB_INTCPT_SKINIT		BIT(6)
+#define	VMCB_INTCPT_RDTSCP		BIT(7)
+#define	VMCB_INTCPT_ICEBP		BIT(8)
+#define	VMCB_INTCPT_WBINVD		BIT(9)
+#define	VMCB_INTCPT_MONITOR		BIT(10)
+#define	VMCB_INTCPT_MWAIT		BIT(11)
+#define	VMCB_INTCPT_MWAIT_ARMED		BIT(12)
+#define	VMCB_INTCPT_XSETBV		BIT(13)
+
+/* VMCB TLB control */
+#define	VMCB_TLB_FLUSH_NOTHING		0	/* Flush nothing */
+#define	VMCB_TLB_FLUSH_ALL		1	/* Flush entire TLB */
+#define	VMCB_TLB_FLUSH_GUEST		3	/* Flush all guest entries */
+#define	VMCB_TLB_FLUSH_GUEST_NONGLOBAL	7	/* Flush guest non-global entries */
+
+/* VMCB state caching */
+#define	VMCB_CACHE_NONE		0	/* No caching */
+#define	VMCB_CACHE_I		BIT(0)	/* Intercept, TSC off, Pause filter */
+#define	VMCB_CACHE_IOPM		BIT(1)	/* I/O and MSR permission */
+#define	VMCB_CACHE_ASID		BIT(2)	/* ASID */
+#define	VMCB_CACHE_TPR		BIT(3)	/* V_TPR to V_INTR_VECTOR */
+#define	VMCB_CACHE_NP		BIT(4)	/* Nested Paging */
+#define	VMCB_CACHE_CR		BIT(5)	/* CR0, CR3, CR4 & EFER */
+#define	VMCB_CACHE_DR		BIT(6)	/* Debug registers */
+#define	VMCB_CACHE_DT		BIT(7)	/* GDT/IDT */
+#define	VMCB_CACHE_SEG		BIT(8)	/* User segments, CPL */
+#define	VMCB_CACHE_CR2		BIT(9)	/* page fault address */
+#define	VMCB_CACHE_LBR		BIT(10)	/* Last branch */
+
+/* VMCB control event injection */
+#define	VMCB_EVENTINJ_EC_VALID		BIT(11)	/* Error Code valid */
+#define	VMCB_EVENTINJ_VALID		BIT(31)	/* Event valid */
+
+/* Event types that can be injected */
+#define	VMCB_EVENTINJ_TYPE_INTR		0
+#define	VMCB_EVENTINJ_TYPE_NMI		2
+#define	VMCB_EVENTINJ_TYPE_EXCEPTION	3
+#define	VMCB_EVENTINJ_TYPE_INTn		4
+
+/* VMCB exit code, APM vol2 Appendix C */
+#define	VMCB_EXIT_MC			0x52
+#define	VMCB_EXIT_INTR			0x60
+#define	VMCB_EXIT_NMI			0x61
+#define	VMCB_EXIT_VINTR			0x64
+#define	VMCB_EXIT_PUSHF			0x70
+#define	VMCB_EXIT_POPF			0x71
+#define	VMCB_EXIT_CPUID			0x72
+#define	VMCB_EXIT_IRET			0x74
+#define	VMCB_EXIT_PAUSE			0x77
+#define	VMCB_EXIT_HLT			0x78
+#define	VMCB_EXIT_IO			0x7B
+#define	VMCB_EXIT_MSR			0x7C
+#define	VMCB_EXIT_SHUTDOWN		0x7F
+#define	VMCB_EXIT_VMSAVE		0x83
+#define	VMCB_EXIT_MONITOR		0x8A
+#define	VMCB_EXIT_MWAIT			0x8B
+#define	VMCB_EXIT_NPF			0x400
+#define	VMCB_EXIT_INVALID		-1
+
+/*
+ * Nested page fault.
+ * Bit definitions to decode EXITINFO1.
+ */
+#define	VMCB_NPF_INFO1_P		BIT(0) /* Nested page present. */
+#define	VMCB_NPF_INFO1_W		BIT(1) /* Access was write. */
+#define	VMCB_NPF_INFO1_U		BIT(2) /* Access was user access. */
+#define	VMCB_NPF_INFO1_RSV		BIT(3) /* Reserved bits present. */
+#define	VMCB_NPF_INFO1_ID		BIT(4) /* Code read. */
+
+#define	VMCB_NPF_INFO1_GPA		BIT(32) /* Guest physical address. */
+#define	VMCB_NPF_INFO1_GPT		BIT(33) /* Guest page table. */
+
+/*
+ * EXITINTINFO, Interrupt exit info for all intercepts.
+ * Section 15.7.2, Intercepts during IDT Interrupt Delivery.
+ */
+#define VMCB_EXITINTINFO_VECTOR(x)	((x) & 0xFF)
+#define VMCB_EXITINTINFO_TYPE(x)	(((x) >> 8) & 0x7)
+#define VMCB_EXITINTINFO_EC_VALID(x)	(((x) & BIT(11)) ? 1 : 0)
+#define VMCB_EXITINTINFO_VALID(x)	(((x) & BIT(31)) ? 1 : 0)
+#define VMCB_EXITINTINFO_EC(x)		(((x) >> 32) & 0xFFFFFFFF)
+
+/* Offset of various VMCB fields. */
+#define	VMCB_OFF_CTRL(x)		(x)
+#define	VMCB_OFF_STATE(x)		((x) + 0x400)
+
+#define	VMCB_OFF_CR_INTERCEPT		VMCB_OFF_CTRL(0x0)
+#define	VMCB_OFF_DR_INTERCEPT		VMCB_OFF_CTRL(0x4)
+#define	VMCB_OFF_EXC_INTERCEPT		VMCB_OFF_CTRL(0x8)
+#define	VMCB_OFF_INST1_INTERCEPT	VMCB_OFF_CTRL(0xC)
+#define	VMCB_OFF_INST2_INTERCEPT	VMCB_OFF_CTRL(0x10)
+#define	VMCB_OFF_IO_PERM		VMCB_OFF_CTRL(0x40)
+#define	VMCB_OFF_MSR_PERM		VMCB_OFF_CTRL(0x48)
+#define	VMCB_OFF_TSC_OFFSET		VMCB_OFF_CTRL(0x50)
+#define	VMCB_OFF_ASID			VMCB_OFF_CTRL(0x58)
+#define	VMCB_OFF_TLB_CTRL		VMCB_OFF_CTRL(0x5C)
+#define	VMCB_OFF_VIRQ			VMCB_OFF_CTRL(0x60)
+#define	VMCB_OFF_EXIT_REASON		VMCB_OFF_CTRL(0x70)
+#define	VMCB_OFF_EXITINFO1		VMCB_OFF_CTRL(0x78)
+#define	VMCB_OFF_EXITINFO2		VMCB_OFF_CTRL(0x80)
+#define	VMCB_OFF_EXITINTINFO		VMCB_OFF_CTRL(0x88)
+#define	VMCB_OFF_AVIC_BAR		VMCB_OFF_CTRL(0x98)
+#define	VMCB_OFF_NPT_BASE		VMCB_OFF_CTRL(0xB0)
+#define	VMCB_OFF_AVIC_PAGE		VMCB_OFF_CTRL(0xE0)
+#define	VMCB_OFF_AVIC_LT		VMCB_OFF_CTRL(0xF0)
+#define	VMCB_OFF_AVIC_PT		VMCB_OFF_CTRL(0xF8)
+#define	VMCB_OFF_SYSENTER_CS		VMCB_OFF_STATE(0x228)
+#define	VMCB_OFF_SYSENTER_ESP		VMCB_OFF_STATE(0x230)
+#define	VMCB_OFF_SYSENTER_EIP		VMCB_OFF_STATE(0x238)
+#define	VMCB_OFF_GUEST_PAT		VMCB_OFF_STATE(0x268)
+
+/*
+ * Encode the VMCB offset and bytes that we want to read from VMCB.
+ */
+#define	VMCB_ACCESS(o, w)		(0x80000000 | (((w) & 0xF) << 16) | \
+					((o) & 0xFFF))
+#define	VMCB_ACCESS_OK(v)               ((v) & 0x80000000 )
+#define	VMCB_ACCESS_BYTES(v)            (((v) >> 16) & 0xF)
+#define	VMCB_ACCESS_OFFSET(v)           ((v) & 0xFFF)
+
+#ifdef _KERNEL
+/* VMCB save state area segment format */
+struct vmcb_segment {
+	uint16_t	selector;
+	uint16_t	attrib;
+	uint32_t	limit;
+	uint64_t	base;
+} __attribute__ ((__packed__));
+CTASSERT(sizeof(struct vmcb_segment) == 16);
+
+/* Code segment descriptor attribute in 12 bit format as saved by VMCB. */
+#define	VMCB_CS_ATTRIB_L		BIT(9)	/* Long mode. */
+#define	VMCB_CS_ATTRIB_D		BIT(10)	/* Operand size bit. */
+
+/*
+ * The VMCB is divided into two areas - the first one contains various
+ * control bits including the intercept vector and the second one contains
+ * the guest state.
+ */
+
+/* VMCB control area - padded up to 1024 bytes */
+struct vmcb_ctrl {
+	uint32_t intercept[5];	/* all intercepts */
+	uint8_t	 pad1[0x28];	/* Offsets 0x14-0x3B are reserved. */
+	uint16_t pause_filthresh; /* Offset 0x3C, PAUSE filter threshold */
+	uint16_t pause_filcnt;  /* Offset 0x3E, PAUSE filter count */
+	uint64_t iopm_base_pa;	/* 0x40: IOPM_BASE_PA */
+	uint64_t msrpm_base_pa; /* 0x48: MSRPM_BASE_PA */
+	uint64_t tsc_offset;	/* 0x50: TSC_OFFSET */
+	uint32_t asid;		/* 0x58: Guest ASID */
+	uint8_t	 tlb_ctrl;	/* 0x5C: TLB_CONTROL */
+	uint8_t  pad2[3];	/* 0x5D-0x5F: Reserved. */
+	uint8_t	 v_tpr;		/* 0x60: V_TPR, guest CR8 */
+	uint8_t	 v_irq:1;	/* Is virtual interrupt pending? */
+	uint8_t	:7; 		/* Padding */
+	uint8_t v_intr_prio:4;	/* 0x62: Priority for virtual interrupt. */
+	uint8_t v_ign_tpr:1;
+	uint8_t :3;
+	uint8_t	v_intr_masking:1; /* Guest and host sharing of RFLAGS. */
+	uint8_t	:7;
+	uint8_t	v_intr_vector;	/* 0x64: Vector for virtual interrupt. */
+	uint8_t pad3[3];	/* 0x65-0x67: Bits 63-40 reserved. */
+	uint64_t intr_shadow:1; /* 0x68: Interrupt shadow, section15.2.1 APM2 */
+	uint64_t :63;
+	uint64_t exitcode;	/* 0x70, Exitcode */
+	uint64_t exitinfo1;	/* 0x78, EXITINFO1 */
+	uint64_t exitinfo2;	/* 0x80, EXITINFO2 */
+	uint64_t exitintinfo;	/* 0x88, Interrupt exit value. */
+	uint64_t np_enable:1;   /* 0x90, Nested paging enable. */
+	uint64_t :63;
+	uint8_t  pad4[0x10];	/* 0x98-0xA7 reserved. */
+	uint64_t eventinj;	/* 0xA8, Event injection. */
+	uint64_t n_cr3;		/* 0xB0, Nested page table. */
+	uint64_t lbr_virt_en:1;	/* Enable LBR virtualization. */
+	uint64_t :63;
+	uint32_t vmcb_clean;	/* 0xC0: VMCB clean bits for caching */
+	uint32_t :32;		/* 0xC4: Reserved */
+	uint64_t nrip;		/* 0xC8: Guest next nRIP. */
+	uint8_t	inst_len;	/* 0xD0: #NPF decode assist */
+	uint8_t	inst_bytes[15];
+	uint8_t	padd6[0x320];
+} __attribute__ ((__packed__));
+CTASSERT(sizeof(struct vmcb_ctrl) == 1024);
+
+struct vmcb_state {
+	struct   vmcb_segment es;
+	struct   vmcb_segment cs;
+	struct   vmcb_segment ss;
+	struct   vmcb_segment ds;
+	struct   vmcb_segment fs;
+	struct   vmcb_segment gs;
+	struct   vmcb_segment gdt;
+	struct   vmcb_segment ldt;
+	struct   vmcb_segment idt;
+	struct   vmcb_segment tr;
+	uint8_t	 pad1[0x2b];		/* Reserved: 0xA0-0xCA */
+	uint8_t	 cpl;
+	uint8_t  pad2[4];
+	uint64_t efer;
+	uint8_t	 pad3[0x70];		/* Reserved: 0xd8-0x147 */
+	uint64_t cr4;
+	uint64_t cr3;			/* Guest CR3 */
+	uint64_t cr0;
+	uint64_t dr7;
+	uint64_t dr6;
+	uint64_t rflags;
+	uint64_t rip;
+	uint8_t	 pad4[0x58]; 		/* Reserved: 0x180-0x1D7 */
+	uint64_t rsp;
+	uint8_t	 pad5[0x18]; 		/* Reserved 0x1E0-0x1F7 */
+	uint64_t rax;
+	uint64_t star;
+	uint64_t lstar;
+	uint64_t cstar;
+	uint64_t sfmask;
+	uint64_t kernelgsbase;
+	uint64_t sysenter_cs;
+	uint64_t sysenter_esp;
+	uint64_t sysenter_eip;
+	uint64_t cr2;
+	uint8_t	 pad6[0x20];
+	uint64_t g_pat;
+	uint64_t dbgctl;
+	uint64_t br_from;
+	uint64_t br_to;
+	uint64_t int_from;
+	uint64_t int_to;
+	uint8_t	 pad7[0x968];		/* Reserved up to end of VMCB */
+} __attribute__ ((__packed__));
+CTASSERT(sizeof(struct vmcb_state) == 0xC00);
+
+struct vmcb {
+	struct vmcb_ctrl ctrl;
+	struct vmcb_state state;
+} __attribute__ ((__packed__));
+CTASSERT(sizeof(struct vmcb) == PAGE_SIZE);
+CTASSERT(offsetof(struct vmcb, state) == 0x400);
+
+int	vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval);
+int	vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val);
+int	vmcb_setdesc(void *arg, int vcpu, int ident, struct seg_desc *desc);
+int	vmcb_getdesc(void *arg, int vcpu, int ident, struct seg_desc *desc);
+int	vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg);
+
+#endif /* _KERNEL */
+#endif /* _VMCB_H_ */
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
index 51e5c2c..ae4d9db 100644
--- a/sys/amd64/vmm/intel/vmcs.c
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -332,7 +332,6 @@ vmcs_init(struct vmcs *vmcs)
 	int error, codesel, datasel, tsssel;
 	u_long cr0, cr4, efer;
 	uint64_t pat, fsbase, idtrbase;
-	uint32_t exc_bitmap;
 
 	codesel = vmm_get_host_codesel();
 	datasel = vmm_get_host_datasel();
@@ -417,11 +416,6 @@ vmcs_init(struct vmcs *vmcs)
 	if ((error = vmwrite(VMCS_HOST_RIP, (u_long)vmx_exit_guest)) != 0)
 		goto done;
 
-	/* exception bitmap */
-	exc_bitmap = 1 << IDT_MC;
-	if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0)
-		goto done;
-
 	/* link pointer */
 	if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0)
 		goto done;
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index 6122de5..6d78a69 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -321,7 +321,7 @@ vmcs_write(uint32_t encoding, uint64_t val)
 #define EXIT_REASON_MTF			37
 #define EXIT_REASON_MONITOR		39
 #define EXIT_REASON_PAUSE		40
-#define EXIT_REASON_MCE			41
+#define EXIT_REASON_MCE_DURING_ENTRY	41
 #define EXIT_REASON_TPR			43
 #define EXIT_REASON_APIC_ACCESS		44
 #define	EXIT_REASON_VIRTUALIZED_EOI	45
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 2fe5a27..c3dd04e 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -283,8 +283,8 @@ exit_reason_to_str(int reason)
 		return "monitor";
 	case EXIT_REASON_PAUSE:
 		return "pause";
-	case EXIT_REASON_MCE:
-		return "mce";
+	case EXIT_REASON_MCE_DURING_ENTRY:
+		return "mce-during-entry";
 	case EXIT_REASON_TPR:
 		return "tpr";
 	case EXIT_REASON_APIC_ACCESS:
@@ -821,6 +821,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
 	int i, error;
 	struct vmx *vmx;
 	struct vmcs *vmcs;
+	uint32_t exc_bitmap;
 
 	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
 	if ((uintptr_t)vmx & PAGE_MASK) {
@@ -911,6 +912,14 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
 		error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
 		error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
 		error += vmwrite(VMCS_VPID, vpid[i]);
+
+		/* exception bitmap */
+		if (vcpu_trace_exceptions(vm, i))
+			exc_bitmap = 0xffffffff;
+		else
+			exc_bitmap = 1 << IDT_MC;
+		error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap);
+
 		if (virtual_interrupt_delivery) {
 			error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
 			error += vmwrite(VMCS_VIRTUAL_APIC,
@@ -1746,8 +1755,6 @@ inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
 
 	error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
 	KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
-
-	/* XXX modify svm.c to update bit 16 of seg_desc.access (unusable) */
 }
 
 static void
@@ -1781,6 +1788,7 @@ vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
 		vmexit->u.inst_emul.cs_d = 0;
 		break;
 	}
+	vie_init(&vmexit->u.inst_emul.vie, NULL, 0);
 }
 
 static int
@@ -2057,8 +2065,9 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 	struct vlapic *vlapic;
 	struct vm_inout_str *vis;
 	struct vm_task_switch *ts;
+	struct vm_exception vmexc;
 	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
-	uint32_t intr_type, reason;
+	uint32_t intr_type, intr_vec, reason;
 	uint64_t exitintinfo, qual, gpa;
 	bool retu;
 
@@ -2075,6 +2084,18 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
 
 	/*
+	 * VM-entry failures during or after loading guest state.
+	 *
+	 * These VM-exits are uncommon but must be handled specially
+	 * as most VM-exit fields are not populated as usual.
+	 */
+	if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) {
+		VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry");
+		__asm __volatile("int $18");
+		return (1);
+	}
+
+	/*
 	 * VM exits that can be triggered during event delivery need to
 	 * be handled specially by re-injecting the event if the IDT
 	 * vectoring information field's valid bit is set.
@@ -2306,6 +2327,9 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 		KASSERT((intr_info & VMCS_INTR_VALID) != 0,
 		    ("VM exit interruption info invalid: %#x", intr_info));
 
+		intr_vec = intr_info & 0xff;
+		intr_type = intr_info & VMCS_INTR_T_MASK;
+
 		/*
 		 * If Virtual NMIs control is 1 and the VM-exit is due to a
 		 * fault encountered during the execution of IRET then we must
@@ -2316,16 +2340,55 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 		 * See "Information for VM Exits Due to Vectored Events".
 		 */
 		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
-		    (intr_info & 0xff) != IDT_DF &&
+		    (intr_vec != IDT_DF) &&
 		    (intr_info & EXIT_QUAL_NMIUDTI) != 0)
 			vmx_restore_nmi_blocking(vmx, vcpu);
 
 		/*
 		 * The NMI has already been handled in vmx_exit_handle_nmi().
 		 */
-		if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI)
+		if (intr_type == VMCS_INTR_T_NMI)
 			return (1);
-		break;
+
+		/*
+		 * Call the machine check handler by hand. Also don't reflect
+		 * the machine check back into the guest.
+		 */
+		if (intr_vec == IDT_MC) {
+			VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler");
+			__asm __volatile("int $18");
+			return (1);
+		}
+
+		if (intr_vec == IDT_PF) {
+			error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual);
+			KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d",
+			    __func__, error));
+		}
+
+		/*
+		 * Software exceptions exhibit trap-like behavior. This in
+		 * turn requires populating the VM-entry instruction length
+		 * so that the %rip in the trap frame is past the INT3/INTO
+		 * instruction.
+		 */
+		if (intr_type == VMCS_INTR_T_SWEXCEPTION)
+			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
+
+		/* Reflect all other exceptions back into the guest */
+		bzero(&vmexc, sizeof(struct vm_exception));
+		vmexc.vector = intr_vec;
+		if (intr_info & VMCS_INTR_DEL_ERRCODE) {
+			vmexc.error_code_valid = 1;
+			vmexc.error_code = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
+		}
+		VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into "
+		    "the guest", vmexc.vector, vmexc.error_code);
+		error = vm_inject_exception(vmx->vm, vcpu, &vmexc);
+		KASSERT(error == 0, ("%s: vm_inject_exception error %d",
+		    __func__, error));
+		return (1);
+
 	case EXIT_REASON_EPT_FAULT:
 		/*
 		 * If 'gpa' lies within the address space allocated to
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index d684dba..b192735 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -912,8 +912,12 @@ vlapic_set_tpr(struct vlapic *vlapic, uint8_t val)
 {
 	struct LAPIC *lapic = vlapic->apic_page;
 
-	lapic->tpr = val;
-	vlapic_update_ppr(vlapic);
+	if (lapic->tpr != val) {
+		VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vlapic TPR changed "
+		    "from %#x to %#x", lapic->tpr, val);
+		lapic->tpr = val;
+		vlapic_update_ppr(vlapic);
+	}
 }
 
 static uint8_t
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index ddf875b..8c545f0 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -206,6 +206,11 @@ static int vmm_ipinum;
 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
     "IPI vector used for vcpu notifications");
 
+static int trace_guest_exceptions;
+SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
+    &trace_guest_exceptions, 0,
+    "Trap into hypervisor on all guest exceptions and reflect them back");
+
 static void
 vcpu_cleanup(struct vm *vm, int i, bool destroy)
 {
@@ -249,6 +254,13 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create)
 	vmm_stat_init(vcpu->stats);
 }
 
+int
+vcpu_trace_exceptions(struct vm *vm, int vcpuid)
+{
+
+	return (trace_guest_exceptions);
+}
+
 struct vm_exit *
 vm_exitinfo(struct vm *vm, int cpuid)
 {
@@ -1232,7 +1244,7 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
 	mem_region_read_t mread;
 	mem_region_write_t mwrite;
 	enum vm_cpu_mode cpu_mode;
-	int cs_d, error;
+	int cs_d, error, length;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
@@ -1246,11 +1258,21 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
 
 	VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);
 
-	vie_init(vie);
-
 	/* Fetch, decode and emulate the faulting instruction */
-	error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
-	    vme->inst_length, vie);
+	if (vie->num_valid == 0) {
+		/*
+		 * If the instruction length is not known then assume a
+		 * maximum size instruction.
+		 */
+		length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE;
+		error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
+		    length, vie);
+	} else {
+		/*
+		 * The instruction bytes have already been copied into 'vie'
+		 */
+		error = 0;
+	}
 	if (error == 1)
 		return (0);		/* Resume guest to handle page fault */
 	else if (error == -1)
@@ -1261,6 +1283,12 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
 	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
 		return (EFAULT);
 
+	/*
+	 * If the instruction length is not specified then update it now.
+	 */
+	if (vme->inst_length == 0)
+		vme->inst_length = vie->num_processed;
+ 
 	/* return to userland unless this is an in-kernel emulated device */
 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
 		mread = lapic_mmio_read;
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index c6ba01e..d1d7173 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -1184,13 +1184,20 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
 
 #ifdef _KERNEL
 void
-vie_init(struct vie *vie)
+vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
 {
+	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
+	    ("%s: invalid instruction length (%d)", __func__, inst_length));
 
 	bzero(vie, sizeof(struct vie));
 
 	vie->base_register = VM_REG_LAST;
 	vie->index_register = VM_REG_LAST;
+
+	if (inst_length) {
+		bcopy(inst_bytes, vie->inst, inst_length);
+		vie->num_valid = inst_length;
+	}
 }
 
 static int
@@ -1826,7 +1833,7 @@ static int
 verify_inst_length(struct vie *vie)
 {
 
-	if (vie->num_processed == vie->num_valid)
+	if (vie->num_processed)
 		return (0);
 	else
 		return (-1);
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
index c7515cf..c0c3e9c 100644
--- a/sys/amd64/vmm/x86.c
+++ b/sys/amd64/vmm/x86.c
@@ -44,6 +44,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmm.h>
 
 #include "vmm_host.h"
+#include "vmm_ktr.h"
+#include "vmm_util.h"
 #include "x86.h"
 
 SYSCTL_DECL(_hw_vmm);
@@ -54,6 +56,8 @@ static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);
 static const char bhyve_id[12] = "bhyve bhyve ";
 
 static uint64_t bhyve_xcpuids;
+SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
+    "Number of times an unknown cpuid leaf was accessed");
 
 /*
  * The default CPU topology is a single thread per package.
@@ -91,6 +95,8 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
 	unsigned int func, regs[4], logical_cpus;
 	enum x2apic_state x2apic_state;
 
+	VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx);
+
 	/*
 	 * Requests for invalid CPUID levels should map to the highest
 	 * available level instead.
@@ -124,25 +130,80 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
 		case CPUID_8000_0003:
 		case CPUID_8000_0004:
 		case CPUID_8000_0006:
+			cpuid_count(*eax, *ecx, regs);
+			break;
 		case CPUID_8000_0008:
 			cpuid_count(*eax, *ecx, regs);
+			if (vmm_is_amd()) {
+				/*
+				 * XXX this might appear silly because AMD
+				 * cpus don't have threads.
+				 *
+				 * However this matches the logical cpus as
+				 * advertised by leaf 0x1 and will work even
+				 * if the 'threads_per_core' tunable is set
+				 * incorrectly on an AMD host.
+				 */
+				logical_cpus = threads_per_core *
+				    cores_per_package;
+				regs[2] = logical_cpus - 1;
+			}
 			break;
 
 		case CPUID_8000_0001:
+			cpuid_count(*eax, *ecx, regs);
+
+			/*
+			 * Hide SVM and Topology Extension features from guest.
+			 */
+			regs[2] &= ~(AMDID2_SVM | AMDID2_TOPOLOGY);
+
+			/*
+			 * Don't advertise extended performance counter MSRs
+			 * to the guest.
+			 */
+			regs[2] &= ~AMDID2_PCXC;
+			regs[2] &= ~AMDID2_PNXC;
+			regs[2] &= ~AMDID2_PTSCEL2I;
+
+			/*
+			 * Don't advertise Instruction Based Sampling feature.
+			 */
+			regs[2] &= ~AMDID2_IBS;
+
+			/* NodeID MSR not available */
+			regs[2] &= ~AMDID2_NODE_ID;
+
+			/* Don't advertise the OS visible workaround feature */
+			regs[2] &= ~AMDID2_OSVW;
+
 			/*
 			 * Hide rdtscp/ia32_tsc_aux until we know how
 			 * to deal with them.
 			 */
-			cpuid_count(*eax, *ecx, regs);
 			regs[3] &= ~AMDID_RDTSCP;
 			break;
 
 		case CPUID_8000_0007:
-			cpuid_count(*eax, *ecx, regs);
 			/*
-			 * If the host TSCs are not synchronized across
-			 * physical cpus then we cannot advertise an
-			 * invariant tsc to a vcpu.
+			 * AMD uses this leaf to advertise the processor's
+			 * power monitoring and RAS capabilities. These
+			 * features are hardware-specific and exposing
+			 * them to a guest doesn't make a lot of sense.
+			 *
+			 * Intel uses this leaf only to advertise the
+			 * "Invariant TSC" feature with all other bits
+			 * being reserved (set to zero).
+			 */
+			regs[0] = 0;
+			regs[1] = 0;
+			regs[2] = 0;
+			regs[3] = 0;
+
+			/*
+			 * "Invariant TSC" can be advertised to the guest if:
+			 * - host TSC frequency is invariant
+			 * - host TSCs are synchronized across physical cpus
 			 *
 			 * XXX This still falls short because the vcpu
 			 * can observe the TSC moving backwards as it
@@ -150,8 +211,8 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
 			 * it should discourage the guest from using the
 			 * TSC to keep track of time.
 			 */
-			if (!smp_tsc)
-				regs[3] &= ~AMDPM_TSC_INVARIANT;
+			if (tsc_is_invariant && smp_tsc)
+				regs[3] |= AMDPM_TSC_INVARIANT;
 			break;
 
 		case CPUID_0000_0001:
author	neel <neel@FreeBSD.org>	2014-12-30 08:24:14 +0000
committer	neel <neel@FreeBSD.org>	2014-12-30 08:24:14 +0000
commit	9a7db864f78c4821164e142b15574dd789e438fc (patch)
tree	5ca9ce0dfb46b18acadddbaeeda0e8409ebb1eac /sys/amd64
parent	6d931c08fa2abce728837379fb2549a6513f49b0 (diff)
download	FreeBSD-src-9a7db864f78c4821164e142b15574dd789e438fc.zip FreeBSD-src-9a7db864f78c4821164e142b15574dd789e438fc.tar.gz