summaryrefslogtreecommitdiffstats
path: root/sys/amd64/vmm/intel
diff options
context:
space:
mode:
Diffstat (limited to 'sys/amd64/vmm/intel')
-rw-r--r--sys/amd64/vmm/intel/ept.c392
-rw-r--r--sys/amd64/vmm/intel/ept.h43
-rw-r--r--sys/amd64/vmm/intel/vmcs.c551
-rw-r--r--sys/amd64/vmm/intel/vmcs.h338
-rw-r--r--sys/amd64/vmm/intel/vmx.c1845
-rw-r--r--sys/amd64/vmm/intel/vmx.h120
-rw-r--r--sys/amd64/vmm/intel/vmx_controls.h92
-rw-r--r--sys/amd64/vmm/intel/vmx_cpufunc.h218
-rw-r--r--sys/amd64/vmm/intel/vmx_genassym.c89
-rw-r--r--sys/amd64/vmm/intel/vmx_msr.c172
-rw-r--r--sys/amd64/vmm/intel/vmx_msr.h78
-rw-r--r--sys/amd64/vmm/intel/vmx_support.S246
-rw-r--r--sys/amd64/vmm/intel/vtd.c677
13 files changed, 4861 insertions, 0 deletions
diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c
new file mode 100644
index 0000000..4f91601
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.c
@@ -0,0 +1,392 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/param.h>
+#include <machine/cpufunc.h>
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmx_cpufunc.h"
+#include "vmx_msr.h"
+#include "vmx.h"
+#include "ept.h"
+
+#define EPT_PWL4(cap) ((cap) & (1UL << 6)) /* 4-step page walk */
+#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) /* write-back EPT tables */
+#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
+#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
+#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) /* invvpid instruction */
+#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) /* invept instruction */
+
+#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL /* capability bits 40-43 */
+#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
+
+#define INVEPT_ALL_TYPES_MASK 0x6000000UL /* capability bits 25-26 */
+#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
+
+#define EPT_PG_RD (1 << 0) /* entry grants read access */
+#define EPT_PG_WR (1 << 1) /* entry grants write access */
+#define EPT_PG_EX (1 << 2) /* entry grants execute access */
+#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* memory type field starts at bit 3 */
+#define EPT_PG_IGNORE_PAT (1 << 6) /* not set anywhere in this file */
+#define EPT_PG_SUPERPAGE (1 << 7) /* entry is a leaf above the last level */
+
+#define EPT_ADDR_MASK ((uint64_t)-1 << 12) /* clears the low 12 bits of an entry */
+
+MALLOC_DECLARE(M_VMX);
+
+static uint64_t page_sizes_mask; /* bit N set: 2^N-byte EPT pages usable; filled by ept_init() */
+
+int /* Check required EPT/VPID capabilities; 0 if usable, EINVAL otherwise. */
+ept_init(void)
+{
+ int page_shift;
+ uint64_t cap; /* raw value of MSR_VMX_EPT_VPID_CAP */
+
+ cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
+
+ /*
+ * Verify that:
+ * - page walk length is 4 steps
+ * - extended page tables can be laid out in write-back memory
+ * - invvpid instruction with all possible types is supported
+ * - invept instruction with all possible types is supported
+ */
+ if (!EPT_PWL4(cap) ||
+ !EPT_MEMORY_TYPE_WB(cap) ||
+ !INVVPID_SUPPORTED(cap) ||
+ !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
+ !INVEPT_SUPPORTED(cap) ||
+ !INVEPT_ALL_TYPES_SUPPORTED(cap))
+ return (EINVAL);
+
+ /* Record each usable page size in 'page_sizes_mask' (bit N = 2^N bytes) */
+ page_shift = PAGE_SHIFT;
+ page_sizes_mask = 1UL << page_shift; /* 4KB page */
+
+ page_shift += 9; /* one table level up: 512 times larger */
+ if (EPT_PDE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
+
+ page_shift += 9; /* another level up */
+ if (EPT_PDPTE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
+
+ return (0);
+}
+
+#if 0 /* debugging aid, compiled out */
+static void /* Recursively print the non-zero entries of the table at 'ptp'. */
+ept_dump(uint64_t *ptp, int nlevels)
+{
+ int i, t, tabs;
+ uint64_t *ptpnext, ptpval;
+
+ if (--nlevels < 0) /* no levels left to descend into */
+ return;
+
+ tabs = 3 - nlevels; /* indentation grows as 'nlevels' shrinks */
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("PTP = %p\n", ptp);
+
+ for (i = 0; i < 512; i++) { /* 512 entries per table page */
+ ptpval = ptp[i];
+
+ if (ptpval == 0) /* skip empty slots */
+ continue;
+
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("%3d 0x%016lx\n", i, ptpval);
+
+ if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) { /* entry points at a lower table */
+ ptpnext = (uint64_t *)
+ PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+ ept_dump(ptpnext, nlevels);
+ }
+ }
+}
+#endif /* 0 */
+
+static size_t /* Write (or clear) one leaf entry for 'gpa'; returns the bytes it covers. */
+ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
+{
+ int spshift, ptpshift, ptpindex, nlevels;
+
+ /*
+ * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - super page sizes supported by the processor
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'length'
+ */
+ spshift = PAGE_SHIFT;
+ if (spok) /* superpages allowed: start from the largest candidate shift */
+ spshift += (EPT_PWLEVELS - 1) * 9;
+ while (spshift >= PAGE_SHIFT) {
+ uint64_t spsize = 1UL << spshift;
+ if ((page_sizes_mask & spsize) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ length >= spsize) {
+ break;
+ }
+ spshift -= 9; /* try the next smaller page size */
+ }
+
+ if (spshift < PAGE_SHIFT) { /* not even the base page size qualified */
+ panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
+ "length 0x%016lx, page_sizes_mask 0x%016lx",
+ gpa, hpa, length, page_sizes_mask);
+ }
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) { /* walk from the top-level table downwards */
+ ptpshift = PAGE_SHIFT + nlevels * 9; /* log2 of bytes covered by one entry here */
+ ptpindex = (gpa >> ptpshift) & 0x1FF; /* 9-bit index into this table */
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift)
+ break;
+
+ /*
+ * We are working on a non-leaf page table page.
+ * NOTE(review): reached for VM_PROT_NONE (removal) requests as well.
+ * Create the next level page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
+ ptp[ptpindex] = vtophys(nlp);
+ ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
+ }
+
+ /* Work our way down to the next level page table page */
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0) { /* gpa must be aligned to the leaf size */
+ panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
+ "mismatch\n", gpa, ptpshift);
+ }
+
+ if (prot != VM_PROT_NONE) {
+ /* Do the mapping */
+ ptp[ptpindex] = hpa; /* replaces whatever the slot held before */
+
+ /* Apply the access controls */
+ if (prot & VM_PROT_READ)
+ ptp[ptpindex] |= EPT_PG_RD;
+ if (prot & VM_PROT_WRITE)
+ ptp[ptpindex] |= EPT_PG_WR;
+ if (prot & VM_PROT_EXECUTE)
+ ptp[ptpindex] |= EPT_PG_EX;
+
+ /*
+ * XXX should we enforce this memory type by setting the
+ * ignore PAT bit to 1.
+ */
+ ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
+
+ if (nlevels > 0) /* leaf sits above the last level */
+ ptp[ptpindex] |= EPT_PG_SUPERPAGE;
+ } else {
+ /* Remove the mapping */
+ ptp[ptpindex] = 0;
+ }
+
+ return (1UL << ptpshift); /* size in bytes of the entry just written */
+}
+
+static vm_paddr_t /* Translate 'gpa' through the tables at 'ptp'; (vm_paddr_t)-1 if unmapped. */
+ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
+{
+ int nlevels, ptpshift, ptpindex;
+ uint64_t ptpval, hpabase, pgmask;
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) { /* walk from the top-level table downwards */
+ ptpshift = PAGE_SHIFT + nlevels * 9; /* log2 of bytes covered by one entry here */
+ ptpindex = (gpa >> ptpshift) & 0x1FF; /* 9-bit index into this table */
+
+ ptpval = ptp[ptpindex];
+
+ /* Cannot make progress beyond this point */
+ if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
+ break;
+
+ if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) { /* leaf entry found */
+ pgmask = (1UL << ptpshift) - 1; /* offset bits within this page size */
+ hpabase = ptpval & ~pgmask; /* drop the low bits of the entry */
+ return (hpabase | (gpa & pgmask));
+ }
+
+ /* Work our way down to the next level page table page */
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+ }
+
+ return ((vm_paddr_t)-1); /* no translation for 'gpa' */
+}
+
+static void /* Sanity-check a last-level entry; nothing is freed here. */
+ept_free_pt_entry(pt_entry_t pte)
+{
+ if (pte == 0) /* empty slot */
+ return;
+
+ /* sanity check */
+ if ((pte & EPT_PG_SUPERPAGE) != 0)
+ panic("ept_free_pt_entry: pte cannot have superpage bit");
+
+ return;
+}
+
+static void /* Free the page table page referenced by 'pde', unless it is a superpage. */
+ept_free_pd_entry(pd_entry_t pde)
+{
+ pt_entry_t *pt;
+ int i;
+
+ if (pde == 0) /* empty slot */
+ return;
+
+ if ((pde & EPT_PG_SUPERPAGE) == 0) { /* superpage leaves own no lower table */
+ pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
+ for (i = 0; i < NPTEPG; i++)
+ ept_free_pt_entry(pt[i]);
+ free(pt, M_VMX); /* free the page table page */
+ }
+}
+
+static void /* Free the page directory referenced by 'pdpe' and everything below it. */
+ept_free_pdp_entry(pdp_entry_t pdpe)
+{
+ pd_entry_t *pd;
+ int i;
+
+ if (pdpe == 0) /* empty slot */
+ return;
+
+ if ((pdpe & EPT_PG_SUPERPAGE) == 0) { /* superpage leaves own no lower table */
+ pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
+ for (i = 0; i < NPDEPG; i++)
+ ept_free_pd_entry(pd[i]);
+ free(pd, M_VMX); /* free the page directory page */
+ }
+}
+
+static void /* Free the page directory pointer page referenced by 'pml4e' and everything below it. */
+ept_free_pml4_entry(pml4_entry_t pml4e)
+{
+ pdp_entry_t *pdp;
+ int i;
+
+ if (pml4e == 0) /* empty slot */
+ return;
+
+ if ((pml4e & EPT_PG_SUPERPAGE) == 0) { /* same guard as the lower-level helpers */
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
+ for (i = 0; i < NPDPEPG; i++)
+ ept_free_pdp_entry(pdp[i]);
+ free(pdp, M_VMX); /* free the page directory ptr page */
+ }
+}
+
+void /* Free every table page reachable from vmx->pml4ept. */
+ept_vmcleanup(struct vmx *vmx)
+{
+ int i;
+
+ for (i = 0; i < NPML4EPG; i++) /* the entries themselves are not cleared here */
+ ept_free_pml4_entry(vmx->pml4ept[i]);
+}
+
+int /* Map (or, for prot VM_PROT_NONE, unmap) 'len' bytes at 'gpa'; always returns 0. */
+ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+ size_t n; /* bytes covered by the entry just written */
+ struct vmx *vmx = arg; /* 'arg' is used as a struct vmx pointer */
+
+ while (len > 0) { /* one leaf entry per iteration */
+ n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
+ prot, spok);
+ len -= n;
+ gpa += n;
+ hpa += n;
+ }
+
+ return (0);
+}
+
+vm_paddr_t /* Look up 'gpa' in this VM's tables; result is that of ept_lookup_mapping(). */
+ept_vmmmap_get(void *arg, vm_paddr_t gpa)
+{
+ vm_paddr_t hpa;
+ struct vmx *vmx;
+
+ vmx = arg; /* 'arg' is used as a struct vmx pointer */
+ hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
+ return (hpa);
+}
+
+static void /* smp_rendezvous() action: issue a single-context invept. */
+invept_single_context(void *arg)
+{
+ struct invept_desc desc = *(struct invept_desc *)arg; /* local copy of the caller's descriptor */
+
+ invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
+}
+
+void /* Invalidate cached translations for the EPT rooted at 'pml4ept'. */
+ept_invalidate_mappings(u_long pml4ept)
+{
+ struct invept_desc invept_desc = { 0 };
+
+ invept_desc.eptp = EPTP(pml4ept);
+
+ smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); /* presumably runs the action on every CPU - confirm */
+}
diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h
new file mode 100644
index 0000000..2d7258d
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _EPT_H_
+#define _EPT_H_
+
+struct vmx; /* only used through pointers in this header */
+
+#define EPT_PWLEVELS 4 /* page walk levels */
+#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) /* pml4 | (levels - 1) << 3 | PAT_WRITE_BACK */
+
+int ept_init(void); /* 0, or EINVAL if a required capability is missing */
+int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
+vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa); /* (vm_paddr_t)-1 if unmapped */
+void ept_invalidate_mappings(u_long ept_pml4);
+void ept_vmcleanup(struct vmx *vmx);
+#endif /* _EPT_H_ */
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
new file mode 100644
index 0000000..a5784dd
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -0,0 +1,551 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/pcpu.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/segments.h>
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmcs.h"
+#include "vmx_cpufunc.h"
+#include "ept.h"
+#include "vmx.h"
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+static uint64_t /* Pass CR0/CR4 values through vmx_fix_cr0()/vmx_fix_cr4(); others unchanged. */
+vmcs_fix_regval(uint32_t encoding, uint64_t val)
+{
+
+ switch (encoding) {
+ case VMCS_GUEST_CR0:
+ val = vmx_fix_cr0(val);
+ break;
+ case VMCS_GUEST_CR4:
+ val = vmx_fix_cr4(val);
+ break;
+ default: /* every other field is written as given */
+ break;
+ }
+ return (val);
+}
+
+static uint32_t /* Map a VM_REG_GUEST_* identifier to a VMCS encoding; (uint32_t)-1 if unknown. */
+vmcs_field_encoding(int ident)
+{
+ switch (ident) {
+ case VM_REG_GUEST_CR0:
+ return (VMCS_GUEST_CR0);
+ case VM_REG_GUEST_CR3:
+ return (VMCS_GUEST_CR3);
+ case VM_REG_GUEST_CR4:
+ return (VMCS_GUEST_CR4);
+ case VM_REG_GUEST_DR7:
+ return (VMCS_GUEST_DR7);
+ case VM_REG_GUEST_RSP:
+ return (VMCS_GUEST_RSP);
+ case VM_REG_GUEST_RIP:
+ return (VMCS_GUEST_RIP);
+ case VM_REG_GUEST_RFLAGS:
+ return (VMCS_GUEST_RFLAGS);
+ case VM_REG_GUEST_ES: /* segment registers map to their selector fields */
+ return (VMCS_GUEST_ES_SELECTOR);
+ case VM_REG_GUEST_CS:
+ return (VMCS_GUEST_CS_SELECTOR);
+ case VM_REG_GUEST_SS:
+ return (VMCS_GUEST_SS_SELECTOR);
+ case VM_REG_GUEST_DS:
+ return (VMCS_GUEST_DS_SELECTOR);
+ case VM_REG_GUEST_FS:
+ return (VMCS_GUEST_FS_SELECTOR);
+ case VM_REG_GUEST_GS:
+ return (VMCS_GUEST_GS_SELECTOR);
+ case VM_REG_GUEST_TR:
+ return (VMCS_GUEST_TR_SELECTOR);
+ case VM_REG_GUEST_LDTR:
+ return (VMCS_GUEST_LDTR_SELECTOR);
+ case VM_REG_GUEST_EFER:
+ return (VMCS_GUEST_IA32_EFER);
+ default:
+ return (-1); /* converted to (uint32_t)-1, which callers test for */
+ }
+
+}
+
+static int /* Fill in the base/limit/access encodings for 'seg'; 0, or EINVAL if unknown. */
+vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc)
+{
+
+ switch (seg) {
+ case VM_REG_GUEST_ES:
+ *base = VMCS_GUEST_ES_BASE;
+ *lim = VMCS_GUEST_ES_LIMIT;
+ *acc = VMCS_GUEST_ES_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_CS:
+ *base = VMCS_GUEST_CS_BASE;
+ *lim = VMCS_GUEST_CS_LIMIT;
+ *acc = VMCS_GUEST_CS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_SS:
+ *base = VMCS_GUEST_SS_BASE;
+ *lim = VMCS_GUEST_SS_LIMIT;
+ *acc = VMCS_GUEST_SS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_DS:
+ *base = VMCS_GUEST_DS_BASE;
+ *lim = VMCS_GUEST_DS_LIMIT;
+ *acc = VMCS_GUEST_DS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_FS:
+ *base = VMCS_GUEST_FS_BASE;
+ *lim = VMCS_GUEST_FS_LIMIT;
+ *acc = VMCS_GUEST_FS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_GS:
+ *base = VMCS_GUEST_GS_BASE;
+ *lim = VMCS_GUEST_GS_LIMIT;
+ *acc = VMCS_GUEST_GS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_TR:
+ *base = VMCS_GUEST_TR_BASE;
+ *lim = VMCS_GUEST_TR_LIMIT;
+ *acc = VMCS_GUEST_TR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_LDTR:
+ *base = VMCS_GUEST_LDTR_BASE;
+ *lim = VMCS_GUEST_LDTR_LIMIT;
+ *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_IDTR:
+ *base = VMCS_GUEST_IDTR_BASE;
+ *lim = VMCS_GUEST_IDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING; /* no access-rights field is used for IDTR */
+ break;
+ case VM_REG_GUEST_GDTR:
+ *base = VMCS_GUEST_GDTR_BASE;
+ *lim = VMCS_GUEST_GDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING; /* no access-rights field is used for GDTR */
+ break;
+ default:
+ return (EINVAL); /* outputs are left untouched */
+ }
+
+ return (0);
+}
+
+int /* Read register 'ident' from 'vmcs' into '*retval'; EINVAL or vmread()'s result. */
+vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval)
+{
+ int error;
+ uint32_t encoding;
+
+ /*
+ * If we need to get at vmx-specific state in the VMCS we can bypass
+ * the translation of 'ident' to 'encoding' by simply setting the
+ * sign bit. As it so happens the upper 16 bits are reserved (i.e
+ * set to 0) in the encodings for the VMCS so we are free to use the
+ * sign bit.
+ */
+ if (ident < 0)
+ encoding = ident & 0x7fffffff; /* strip the sign bit to get the raw encoding */
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1) /* identifier has no known encoding */
+ return (EINVAL);
+
+ VMPTRLD(vmcs); /* load 'vmcs' for the access below */
+ error = vmread(encoding, retval);
+ VMCLEAR(vmcs); /* finished with this VMCS */
+ return (error);
+}
+
+int /* Write 'val' to register 'ident' in 'vmcs'; EINVAL or vmwrite()'s result. */
+vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val)
+{
+ int error;
+ uint32_t encoding;
+
+ if (ident < 0) /* negative 'ident' carries a raw VMCS encoding */
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1) /* identifier has no known encoding */
+ return (EINVAL);
+
+ val = vmcs_fix_regval(encoding, val); /* CR0/CR4 values are adjusted first */
+
+ VMPTRLD(vmcs); /* load 'vmcs' for the access below */
+ error = vmwrite(encoding, val);
+ VMCLEAR(vmcs); /* finished with this VMCS */
+ return (error);
+}
+
+int /* Write descriptor 'desc' for 'seg' into 'vmcs'; panics on an unknown 'seg'. */
+vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access; /* VMCS encodings, not descriptor values */
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_setdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs); /* load 'vmcs' for the writes below */
+ if ((error = vmwrite(base, desc->base)) != 0)
+ goto done;
+
+ if ((error = vmwrite(limit, desc->limit)) != 0)
+ goto done;
+
+ if (access != VMCS_INVALID_ENCODING) { /* skipped for IDTR and GDTR */
+ if ((error = vmwrite(access, desc->access)) != 0)
+ goto done;
+ }
+done:
+ VMCLEAR(vmcs); /* reached on success and on failure */
+ return (error);
+}
+
+int /* Read the descriptor for 'seg' from 'vmcs' into 'desc'; panics on an unknown 'seg'. */
+vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access; /* VMCS encodings, not descriptor values */
+ uint64_t u64; /* scratch for each vmread() result */
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_getdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs); /* load 'vmcs' for the reads below */
+ if ((error = vmread(base, &u64)) != 0)
+ goto done;
+ desc->base = u64;
+
+ if ((error = vmread(limit, &u64)) != 0)
+ goto done;
+ desc->limit = u64;
+
+ if (access != VMCS_INVALID_ENCODING) { /* IDTR/GDTR: desc->access is left unwritten */
+ if ((error = vmread(access, &u64)) != 0)
+ goto done;
+ desc->access = u64;
+ }
+done:
+ VMCLEAR(vmcs); /* reached on success and on failure */
+ return (error);
+}
+
+int /* Point the exit MSR-store and entry MSR-load areas of 'vmcs' at 'g_area'. */
+vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
+{
+ int error;
+
+ VMPTRLD(vmcs); /* load 'vmcs' for the writes below */
+
+ /*
+ * Guest MSRs are saved in the VM-exit MSR-store area.
+ * Guest MSRs are loaded from the VM-entry MSR-load area.
+ * Both areas point to the same location in memory.
+ */
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0)
+ goto done;
+
+ error = 0; /* already 0 here; kept explicit */
+done:
+ VMCLEAR(vmcs); /* reached on success and on failure */
+ return (error);
+}
+
+int /* Program controls, host state and misc. fields of 'vmcs'; first vmwrite() error or 0. */
+vmcs_set_defaults(struct vmcs *vmcs,
+ u_long host_rip, u_long host_rsp, u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
+{
+ int error, codesel, datasel, tsssel;
+ u_long cr0, cr4, efer;
+ uint64_t eptp, pat, fsbase, idtrbase;
+ uint32_t exc_bitmap;
+
+ codesel = vmm_get_host_codesel();
+ datasel = vmm_get_host_datasel();
+ tsssel = vmm_get_host_tsssel();
+
+ /*
+ * Make sure we have a "current" VMCS to work with.
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * Load the VMX controls
+ */
+ if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0)
+ goto done;
+
+ /* Guest state */
+
+ /* Initialize guest IA32_PAT MSR with the default value */
+ pat = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+ if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Host state */
+
+ /* Initialize host IA32_PAT MSR */
+ pat = vmm_get_host_pat(); /* 'pat' is reused for the host value */
+ if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Load the IA32_EFER MSR */
+ efer = vmm_get_host_efer();
+ if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0)
+ goto done;
+
+ /* Load the control registers */
+
+ cr0 = vmm_get_host_cr0();
+ if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0)
+ goto done;
+
+ cr4 = vmm_get_host_cr4() | CR4_VMXE; /* CR4_VMXE is forced on in the host CR4 value */
+ if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0)
+ goto done;
+
+ /* Load the segment selectors */
+ if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0)
+ goto done;
+
+ /*
+ * Load the Base-Address for %fs and idtr.
+ *
+ * Note that we exclude %gs, tss and gdtr here because their base
+ * address is pcpu specific.
+ */
+ fsbase = vmm_get_host_fsbase();
+ if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0)
+ goto done;
+
+ idtrbase = vmm_get_host_idtrbase();
+ if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0)
+ goto done;
+
+ /* instruction pointer */
+ if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0)
+ goto done;
+
+ /* stack pointer */
+ if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0)
+ goto done;
+
+ /* eptp */
+ eptp = EPTP(ept_pml4); /* table address combined with walk length and memory type */
+ if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
+ goto done;
+
+ /* vpid */
+ if ((error = vmwrite(VMCS_VPID, vpid)) != 0)
+ goto done;
+
+ /* msr bitmap */
+ if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0)
+ goto done;
+
+ /* exception bitmap */
+ exc_bitmap = 1 << IDT_MC; /* only the IDT_MC bit is set */
+ if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0)
+ goto done;
+
+ /* link pointer */
+ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) /* NOTE(review): ~0 is int -1; presumably widened to all ones - confirm */
+ goto done;
+done:
+ VMCLEAR(vmcs); /* reached on success and on failure */
+ return (error);
+}
+
+uint64_t /* vmread() of 'encoding' that panics on failure; does not VMPTRLD any VMCS itself. */
+vmcs_read(uint32_t encoding)
+{
+ int error;
+ uint64_t val;
+
+ error = vmread(encoding, &val);
+ if (error != 0) /* a failed read is treated as fatal */
+ panic("vmcs_read(%u) error %d", encoding, error);
+
+ return (val);
+}
+
+#ifdef DDB /* "show vmcs" debugger command */
+extern int vmxon_enabled[]; /* indexed by curcpu below */
+
+DB_SHOW_COMMAND(vmcs, db_show_vmcs)
+{
+ uint64_t cur_vmcs, val;
+ uint32_t exit; /* raw VMCS_EXIT_REASON value */
+
+ if (!vmxon_enabled[curcpu]) {
+ db_printf("VMX not enabled\n");
+ return;
+ }
+
+ if (have_addr) { /* an explicit address argument is rejected */
+ db_printf("Only current VMCS supported\n");
+ return;
+ }
+
+ vmptrst(&cur_vmcs); /* fetch the current VMCS pointer */
+ if (cur_vmcs == VMCS_INITIAL) {
+ db_printf("No current VM context\n");
+ return;
+ }
+ db_printf("VMCS: %jx\n", cur_vmcs);
+ db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID));
+ db_printf("Activity: ");
+ val = vmcs_read(VMCS_GUEST_ACTIVITY);
+ switch (val) { /* decode the guest activity state */
+ case 0:
+ db_printf("Active");
+ break;
+ case 1:
+ db_printf("HLT");
+ break;
+ case 2:
+ db_printf("Shutdown");
+ break;
+ case 3:
+ db_printf("Wait for SIPI");
+ break;
+ default:
+ db_printf("Unknown: %#lx", val);
+ }
+ db_printf("\n");
+ exit = vmcs_read(VMCS_EXIT_REASON);
+ if (exit & 0x80000000) /* bit 31 set: reported as an entry failure */
+ db_printf("Entry Failure Reason: %u\n", exit & 0xffff);
+ else
+ db_printf("Exit Reason: %u\n", exit & 0xffff);
+ db_printf("Qualification: %#lx\n", vmcs_exit_qualification());
+ db_printf("Guest Linear Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_LINEAR_ADDRESS));
+ switch (exit & 0x8000ffff) { /* bit 31 is kept, so entry failures match no case */
+ case EXIT_REASON_EXCEPTION:
+ case EXIT_REASON_EXT_INTR:
+ val = vmcs_read(VMCS_EXIT_INTERRUPTION_INFO);
+ db_printf("Interrupt Type: ");
+ switch (val >> 8 & 0x7) { /* bits 8-10 of the interruption info */
+ case 0:
+ db_printf("external");
+ break;
+ case 2:
+ db_printf("NMI");
+ break;
+ case 3:
+ db_printf("HW exception");
+ break;
+ case 4:
+ db_printf("SW exception");
+ break;
+ default:
+ db_printf("?? %lu", val >> 8 & 0x7);
+ break;
+ }
+ db_printf(" Vector: %lu", val & 0xff); /* low 8 bits */
+ if (val & 0x800) /* bit 11 set: also print the error code */
+ db_printf(" Error Code: %lx",
+ vmcs_read(VMCS_EXIT_INTERRUPTION_ERROR));
+ db_printf("\n");
+ break;
+ case EXIT_REASON_EPT_FAULT:
+ case EXIT_REASON_EPT_MISCONFIG:
+ db_printf("Guest Physical Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS));
+ break;
+ }
+ db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error());
+}
+#endif /* DDB */
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
new file mode 100644
index 0000000..f39eed2
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -0,0 +1,338 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMCS_H_
+#define _VMCS_H_
+
+#ifdef _KERNEL
+/*
+ * In-memory VMCS region.  Only the two leading 32-bit words are named;
+ * the rest is padding so that the structure occupies exactly one page,
+ * which the CTASSERT below enforces at compile time.
+ */
+struct vmcs {
+ uint32_t identifier; /* written with vmx_revision() before first use */
+ uint32_t abort_code;
+ char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; /* pad to PAGE_SIZE */
+};
+CTASSERT(sizeof(struct vmcs) == PAGE_SIZE);
+
+/*
+ * MSR save region is composed of an array of 'struct msr_entry'.
+ *
+ * Each entry is 16 bytes: the MSR number, a reserved 32-bit word and the
+ * 64-bit MSR value.
+ */
+struct msr_entry {
+ uint32_t index; /* MSR number, e.g. MSR_KGSBASE */
+ uint32_t reserved;
+ uint64_t val; /* MSR contents */
+
+};
+
+int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
+int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
+ u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap,
+ uint16_t vpid);
+int vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval);
+int vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val);
+int vmcs_getdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+int vmcs_setdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+uint64_t vmcs_read(uint32_t encoding);
+
+/*
+ * Convenience wrappers around vmcs_read() for frequently used fields.
+ *
+ * Note that vmcs_exit_reason() keeps only the low 16 bits of the
+ * VMCS_EXIT_REASON field; callers that need the upper bits must read the
+ * field directly with vmcs_read().
+ */
+#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH)
+#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP)
+#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR)
+#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff)
+#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
+#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
+#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
+#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
+
+#endif /* _KERNEL */
+
+/*
+ * All-ones VMCS pointer value; a VMCS pointer equal to this is treated as
+ * "no current VMCS".
+ */
+#define VMCS_INITIAL 0xffffffffffffffff
+
+/*
+ * Build a register identifier from a VMCS field encoding by setting bit 31.
+ * NOTE(review): presumably this lets vmcs_getreg()/vmcs_setreg() tell raw
+ * VMCS encodings apart from other register identifiers -- confirm against
+ * their implementation.
+ */
+#define VMCS_IDENT(encoding) ((encoding) | 0x80000000)
+/*
+ * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B.
+ */
+#define VMCS_INVALID_ENCODING 0xffffffff
+
+/* 16-bit control fields */
+#define VMCS_VPID 0x00000000
+
+/* 16-bit guest-state fields */
+#define VMCS_GUEST_ES_SELECTOR 0x00000800
+#define VMCS_GUEST_CS_SELECTOR 0x00000802
+#define VMCS_GUEST_SS_SELECTOR 0x00000804
+#define VMCS_GUEST_DS_SELECTOR 0x00000806
+#define VMCS_GUEST_FS_SELECTOR 0x00000808
+#define VMCS_GUEST_GS_SELECTOR 0x0000080A
+#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C
+#define VMCS_GUEST_TR_SELECTOR 0x0000080E
+
+/* 16-bit host-state fields */
+#define VMCS_HOST_ES_SELECTOR 0x00000C00
+#define VMCS_HOST_CS_SELECTOR 0x00000C02
+#define VMCS_HOST_SS_SELECTOR 0x00000C04
+#define VMCS_HOST_DS_SELECTOR 0x00000C06
+#define VMCS_HOST_FS_SELECTOR 0x00000C08
+#define VMCS_HOST_GS_SELECTOR 0x00000C0A
+#define VMCS_HOST_TR_SELECTOR 0x00000C0C
+
+/* 64-bit control fields */
+#define VMCS_IO_BITMAP_A 0x00002000
+#define VMCS_IO_BITMAP_B 0x00002002
+#define VMCS_MSR_BITMAP 0x00002004
+#define VMCS_EXIT_MSR_STORE 0x00002006
+#define VMCS_EXIT_MSR_LOAD 0x00002008
+#define VMCS_ENTRY_MSR_LOAD 0x0000200A
+#define VMCS_EXECUTIVE_VMCS 0x0000200C
+#define VMCS_TSC_OFFSET 0x00002010
+#define VMCS_VIRTUAL_APIC 0x00002012
+#define VMCS_APIC_ACCESS 0x00002014
+#define VMCS_EPTP 0x0000201A
+
+/* 64-bit read-only fields */
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
+
+/* 64-bit guest-state fields */
+#define VMCS_LINK_POINTER 0x00002800
+#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802
+#define VMCS_GUEST_IA32_PAT 0x00002804
+#define VMCS_GUEST_IA32_EFER 0x00002806
+#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808
+#define VMCS_GUEST_PDPTE0 0x0000280A
+#define VMCS_GUEST_PDPTE1 0x0000280C
+#define VMCS_GUEST_PDPTE2 0x0000280E
+#define VMCS_GUEST_PDPTE3 0x00002810
+
+/* 64-bit host-state fields */
+#define VMCS_HOST_IA32_PAT 0x00002C00
+#define VMCS_HOST_IA32_EFER 0x00002C02
+#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04
+
+/* 32-bit control fields */
+#define VMCS_PIN_BASED_CTLS 0x00004000
+#define VMCS_PRI_PROC_BASED_CTLS 0x00004002
+#define VMCS_EXCEPTION_BITMAP 0x00004004
+#define VMCS_PF_ERROR_MASK 0x00004006
+#define VMCS_PF_ERROR_MATCH 0x00004008
+#define VMCS_CR3_TARGET_COUNT 0x0000400A
+#define VMCS_EXIT_CTLS 0x0000400C
+#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E
+#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010
+#define VMCS_ENTRY_CTLS 0x00004012
+#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014
+#define VMCS_ENTRY_INTR_INFO 0x00004016
+#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018
+#define VMCS_ENTRY_INST_LENGTH 0x0000401A
+#define VMCS_TPR_THRESHOLD 0x0000401C
+#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E
+#define VMCS_PLE_GAP 0x00004020
+#define VMCS_PLE_WINDOW 0x00004022
+
+/* 32-bit read-only data fields */
+#define VMCS_INSTRUCTION_ERROR 0x00004400
+#define VMCS_EXIT_REASON 0x00004402
+#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404
+#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406
+#define VMCS_IDT_VECTORING_INFO 0x00004408
+#define VMCS_IDT_VECTORING_ERROR 0x0000440A
+#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C
+#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E
+
+/* 32-bit guest-state fields */
+#define VMCS_GUEST_ES_LIMIT 0x00004800
+#define VMCS_GUEST_CS_LIMIT 0x00004802
+#define VMCS_GUEST_SS_LIMIT 0x00004804
+#define VMCS_GUEST_DS_LIMIT 0x00004806
+#define VMCS_GUEST_FS_LIMIT 0x00004808
+#define VMCS_GUEST_GS_LIMIT 0x0000480A
+#define VMCS_GUEST_LDTR_LIMIT 0x0000480C
+#define VMCS_GUEST_TR_LIMIT 0x0000480E
+#define VMCS_GUEST_GDTR_LIMIT 0x00004810
+#define VMCS_GUEST_IDTR_LIMIT 0x00004812
+#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814
+#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816
+#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818
+#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A
+#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C
+#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E
+#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820
+#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822
+#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824
+#define VMCS_GUEST_ACTIVITY 0x00004826
+#define VMCS_GUEST_SMBASE 0x00004828
+#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A
+#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E
+
+/* 32-bit host state fields */
+#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00
+
+/* Natural Width control fields */
+#define VMCS_CR0_MASK 0x00006000
+#define VMCS_CR4_MASK 0x00006002
+#define VMCS_CR0_SHADOW 0x00006004
+#define VMCS_CR4_SHADOW 0x00006006
+#define VMCS_CR3_TARGET0 0x00006008
+#define VMCS_CR3_TARGET1 0x0000600A
+#define VMCS_CR3_TARGET2 0x0000600C
+#define VMCS_CR3_TARGET3 0x0000600E
+
+/* Natural Width read-only fields */
+#define VMCS_EXIT_QUALIFICATION 0x00006400
+#define VMCS_IO_RCX 0x00006402
+#define VMCS_IO_RSI 0x00006404
+#define VMCS_IO_RDI 0x00006406
+#define VMCS_IO_RIP 0x00006408
+#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A
+
+/* Natural Width guest-state fields */
+#define VMCS_GUEST_CR0 0x00006800
+#define VMCS_GUEST_CR3 0x00006802
+#define VMCS_GUEST_CR4 0x00006804
+#define VMCS_GUEST_ES_BASE 0x00006806
+#define VMCS_GUEST_CS_BASE 0x00006808
+#define VMCS_GUEST_SS_BASE 0x0000680A
+#define VMCS_GUEST_DS_BASE 0x0000680C
+#define VMCS_GUEST_FS_BASE 0x0000680E
+#define VMCS_GUEST_GS_BASE 0x00006810
+#define VMCS_GUEST_LDTR_BASE 0x00006812
+#define VMCS_GUEST_TR_BASE 0x00006814
+#define VMCS_GUEST_GDTR_BASE 0x00006816
+#define VMCS_GUEST_IDTR_BASE 0x00006818
+#define VMCS_GUEST_DR7 0x0000681A
+#define VMCS_GUEST_RSP 0x0000681C
+#define VMCS_GUEST_RIP 0x0000681E
+#define VMCS_GUEST_RFLAGS 0x00006820
+#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822
+#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824
+#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826
+
+/* Natural Width host-state fields */
+#define VMCS_HOST_CR0 0x00006C00
+#define VMCS_HOST_CR3 0x00006C02
+#define VMCS_HOST_CR4 0x00006C04
+#define VMCS_HOST_FS_BASE 0x00006C06
+#define VMCS_HOST_GS_BASE 0x00006C08
+#define VMCS_HOST_TR_BASE 0x00006C0A
+#define VMCS_HOST_GDTR_BASE 0x00006C0C
+#define VMCS_HOST_IDTR_BASE 0x00006C0E
+#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10
+#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12
+#define VMCS_HOST_RSP 0x00006C14
+#define VMCS_HOST_RIP 0x00006c16
+
+/*
+ * VM instruction error numbers
+ */
+#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5
+
+/*
+ * VMCS exit reasons
+ */
+#define EXIT_REASON_EXCEPTION 0
+#define EXIT_REASON_EXT_INTR 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+#define EXIT_REASON_INIT 3
+#define EXIT_REASON_SIPI 4
+#define EXIT_REASON_IO_SMI 5
+#define EXIT_REASON_SMI 6
+#define EXIT_REASON_INTR_WINDOW 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_GETSEC 11
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_RSM 17
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMXOFF 26
+#define EXIT_REASON_VMXON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_INOUT 30
+#define EXIT_REASON_RDMSR 31
+#define EXIT_REASON_WRMSR 32
+#define EXIT_REASON_INVAL_VMCS 33
+#define EXIT_REASON_INVAL_MSR 34
+#define EXIT_REASON_MWAIT 36
+#define EXIT_REASON_MTF 37
+#define EXIT_REASON_MONITOR 39
+#define EXIT_REASON_PAUSE 40
+#define EXIT_REASON_MCE 41
+#define EXIT_REASON_TPR 43
+#define EXIT_REASON_APIC 44
+#define EXIT_REASON_GDTR_IDTR 46
+#define EXIT_REASON_LDTR_TR 47
+#define EXIT_REASON_EPT_FAULT 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_VMX_PREEMPT 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+
+/*
+ * VMCS interrupt information fields
+ */
+#define VMCS_INTERRUPTION_INFO_VALID (1U << 31)
+#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
+#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
+
+/*
+ * VMCS Guest interruptibility field
+ */
+#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0)
+#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1)
+#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2)
+#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3)
+
+/*
+ * Exit qualification for EXIT_REASON_INVAL_VMCS
+ */
+#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3
+
+/*
+ * Exit qualification for EPT violation
+ */
+#define EPT_VIOLATION_DATA_READ (1UL << 0)
+#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
+#define EPT_VIOLATION_INST_FETCH (1UL << 2)
+#define EPT_VIOLATION_GLA_VALID (1UL << 7)
+#define EPT_VIOLATION_XLAT_VALID (1UL << 8)
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
new file mode 100644
index 0000000..4f267bb
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -0,0 +1,1845 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/psl.h>
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/pmap.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+#include <machine/vmparam.h>
+
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmm_lapic.h"
+#include "vmm_msr.h"
+#include "vmm_ktr.h"
+#include "vmm_stat.h"
+
+#include "vmx_msr.h"
+#include "ept.h"
+#include "vmx_cpufunc.h"
+#include "vmx.h"
+#include "x86.h"
+#include "vmx_controls.h"
+
+/*
+ * Desired settings for the VMX control fields.
+ *
+ * Each *_ONE_SETTING / *_ZERO_SETTING pair is handed to vmx_set_ctlreg()
+ * from vmx_init().  NOTE(review): presumably the first is the set of bits
+ * that must be 1 and the second the set that must be 0 -- confirm against
+ * vmx_set_ctlreg().
+ */
+#define PINBASED_CTLS_ONE_SETTING \
+ (PINBASED_EXTINT_EXITING | \
+ PINBASED_NMI_EXITING | \
+ PINBASED_VIRTUAL_NMI)
+#define PINBASED_CTLS_ZERO_SETTING 0
+
+/*
+ * Window-exiting bits: checked for support together with the other
+ * primary controls, then cleared from the default 'procbased_ctls' in
+ * vmx_init() because they are only set on demand.
+ */
+#define PROCBASED_CTLS_WINDOW_SETTING \
+ (PROCBASED_INT_WINDOW_EXITING | \
+ PROCBASED_NMI_WINDOW_EXITING)
+
+#define PROCBASED_CTLS_ONE_SETTING \
+ (PROCBASED_SECONDARY_CONTROLS | \
+ PROCBASED_IO_EXITING | \
+ PROCBASED_MSR_BITMAPS | \
+ PROCBASED_CTLS_WINDOW_SETTING)
+#define PROCBASED_CTLS_ZERO_SETTING \
+ (PROCBASED_CR3_LOAD_EXITING | \
+ PROCBASED_CR3_STORE_EXITING | \
+ PROCBASED_IO_BITMAPS)
+
+#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT
+#define PROCBASED_CTLS2_ZERO_SETTING 0
+
+/*
+ * The *_NO_PAT variants are the fallback used by vmx_init() when the
+ * processor rejects the PAT save/load bits.
+ */
+#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \
+ (VM_EXIT_HOST_LMA | \
+ VM_EXIT_SAVE_EFER | \
+ VM_EXIT_LOAD_EFER)
+
+#define VM_EXIT_CTLS_ONE_SETTING \
+ (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \
+ VM_EXIT_SAVE_PAT | \
+ VM_EXIT_LOAD_PAT)
+#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS
+
+#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER
+
+#define VM_ENTRY_CTLS_ONE_SETTING \
+ (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \
+ VM_ENTRY_LOAD_PAT)
+#define VM_ENTRY_CTLS_ZERO_SETTING \
+ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_INTO_SMM | \
+ VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
+
+#define guest_msr_rw(vmx, msr) \
+ msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
+
+#define HANDLED 1
+#define UNHANDLED 0
+
+MALLOC_DEFINE(M_VMX, "vmx", "vmx");
+
+int vmxon_enabled[MAXCPU];
+static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
+
+static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
+static uint32_t exit_ctls, entry_ctls;
+
+static uint64_t cr0_ones_mask, cr0_zeros_mask;
+static uint64_t cr4_ones_mask, cr4_zeros_mask;
+
+static volatile u_int nextvpid;
+
+static int vmx_no_patmsr;
+
+/*
+ * Virtual NMI blocking conditions.
+ *
+ * Some processor implementations also require NMI to be blocked if
+ * the STI_BLOCKING bit is set. It is possible to detect this at runtime
+ * based on the (exit_reason,exit_qual) tuple being set to
+ * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
+ *
+ * We take the easy way out and also include STI_BLOCKING as one of the
+ * gating items for vNMI injection.
+ */
+static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
+ VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_STI_BLOCKING;
+
+/*
+ * Optional capabilities
+ */
+static int cap_halt_exit;
+static int cap_pause_exit;
+static int cap_unrestricted_guest;
+static int cap_monitor_trap;
+
+/* statistics */
+static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus");
+static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt");
+static VMM_STAT_DEFINE(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");
+static VMM_STAT_DEFINE(VMEXIT_HLT, "number of times hlt was intercepted");
+
+#ifdef KTR
+/*
+ * Translate a VMCS exit reason into a short name for trace output.
+ *
+ * Known reasons map to a string literal.  Any other value is formatted as
+ * a decimal number into a static buffer, so the returned pointer for an
+ * unknown reason is overwritten by the next such call.
+ */
+static const char *
+exit_reason_to_str(int reason)
+{
+	/* Scratch space for reasons that have no symbolic name. */
+	static char reasonbuf[32];
+	/* Names indexed by exit reason; unlisted slots stay NULL. */
+	static const char * const names[] = {
+		[EXIT_REASON_EXCEPTION]		= "exception",
+		[EXIT_REASON_EXT_INTR]		= "extint",
+		[EXIT_REASON_TRIPLE_FAULT]	= "triplefault",
+		[EXIT_REASON_INIT]		= "init",
+		[EXIT_REASON_SIPI]		= "sipi",
+		[EXIT_REASON_IO_SMI]		= "iosmi",
+		[EXIT_REASON_SMI]		= "smi",
+		[EXIT_REASON_INTR_WINDOW]	= "intrwindow",
+		[EXIT_REASON_NMI_WINDOW]	= "nmiwindow",
+		[EXIT_REASON_TASK_SWITCH]	= "taskswitch",
+		[EXIT_REASON_CPUID]		= "cpuid",
+		[EXIT_REASON_GETSEC]		= "getsec",
+		[EXIT_REASON_HLT]		= "hlt",
+		[EXIT_REASON_INVD]		= "invd",
+		[EXIT_REASON_INVLPG]		= "invlpg",
+		[EXIT_REASON_RDPMC]		= "rdpmc",
+		[EXIT_REASON_RDTSC]		= "rdtsc",
+		[EXIT_REASON_RSM]		= "rsm",
+		[EXIT_REASON_VMCALL]		= "vmcall",
+		[EXIT_REASON_VMCLEAR]		= "vmclear",
+		[EXIT_REASON_VMLAUNCH]		= "vmlaunch",
+		[EXIT_REASON_VMPTRLD]		= "vmptrld",
+		[EXIT_REASON_VMPTRST]		= "vmptrst",
+		[EXIT_REASON_VMREAD]		= "vmread",
+		[EXIT_REASON_VMRESUME]		= "vmresume",
+		[EXIT_REASON_VMWRITE]		= "vmwrite",
+		[EXIT_REASON_VMXOFF]		= "vmxoff",
+		[EXIT_REASON_VMXON]		= "vmxon",
+		[EXIT_REASON_CR_ACCESS]		= "craccess",
+		[EXIT_REASON_DR_ACCESS]		= "draccess",
+		[EXIT_REASON_INOUT]		= "inout",
+		[EXIT_REASON_RDMSR]		= "rdmsr",
+		[EXIT_REASON_WRMSR]		= "wrmsr",
+		[EXIT_REASON_INVAL_VMCS]	= "invalvmcs",
+		[EXIT_REASON_INVAL_MSR]		= "invalmsr",
+		[EXIT_REASON_MWAIT]		= "mwait",
+		[EXIT_REASON_MTF]		= "mtf",
+		[EXIT_REASON_MONITOR]		= "monitor",
+		[EXIT_REASON_PAUSE]		= "pause",
+		[EXIT_REASON_MCE]		= "mce",
+		[EXIT_REASON_TPR]		= "tpr",
+		[EXIT_REASON_APIC]		= "apic",
+		[EXIT_REASON_GDTR_IDTR]		= "gdtridtr",
+		[EXIT_REASON_LDTR_TR]		= "ldtrtr",
+		[EXIT_REASON_EPT_FAULT]		= "eptfault",
+		[EXIT_REASON_EPT_MISCONFIG]	= "eptmisconfig",
+		[EXIT_REASON_INVEPT]		= "invept",
+		[EXIT_REASON_RDTSCP]		= "rdtscp",
+		[EXIT_REASON_VMX_PREEMPT]	= "vmxpreempt",
+		[EXIT_REASON_INVVPID]		= "invvpid",
+		[EXIT_REASON_WBINVD]		= "wbinvd",
+		[EXIT_REASON_XSETBV]		= "xsetbv",
+	};
+	const int nnames = sizeof(names) / sizeof(names[0]);
+
+	if (reason >= 0 && reason < nnames && names[reason] != NULL)
+		return (names[reason]);
+
+	snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
+	return (reasonbuf);
+}
+
+#ifdef SETJMP_TRACE
+/*
+ * Map a vmx_setjmp() return code to a printable name for trace output.
+ * Codes that are not recognized yield "unknown".
+ */
+static const char *
+vmx_setjmp_rc2str(int rc)
+{
+	const char *name;
+
+	if (rc == VMX_RETURN_DIRECT)
+		name = "direct";
+	else if (rc == VMX_RETURN_LONGJMP)
+		name = "longjmp";
+	else if (rc == VMX_RETURN_VMRESUME)
+		name = "vmresume";
+	else if (rc == VMX_RETURN_VMLAUNCH)
+		name = "vmlaunch";
+	else if (rc == VMX_RETURN_AST)
+		name = "ast";
+	else
+		name = "unknown";
+
+	return (name);
+}
+
+#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \
+ VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
+ (vmxctx)->regname)
+
+/*
+ * Emit trace records describing the state around a vmx_setjmp() return:
+ * the return code, the host RIP/RSP stored in the current VMCS, and every
+ * saved host and guest register in 'vmxctx'.
+ *
+ * Panics if 'vmxctx' is not the context that belongs to 'vcpu'.
+ */
+static void
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+	uint64_t vmcs_rip, vmcs_rsp;
+
+	if (vmxctx != &vmx->ctx[vcpu])
+		panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
+		    vmxctx, &vmx->ctx[vcpu]);
+
+	VMM_CTR1(vmx->vm, vcpu, "vmxctx = %p", vmxctx);
+	VMM_CTR2(vmx->vm, vcpu, "setjmp return code %s(%d)",
+	    vmx_setjmp_rc2str(rc), rc);
+
+	/* Start from all-ones so a failed vmread is visible in the trace. */
+	vmcs_rip = ~0;
+	vmcs_rsp = vmcs_rip;
+	vmread(VMCS_HOST_RIP, &vmcs_rip);
+	vmread(VMCS_HOST_RSP, &vmcs_rsp);
+	VMM_CTR2(vmx->vm, vcpu, "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
+	    vmcs_rip, vmcs_rsp);
+
+	/* Host registers saved in the context. */
+	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);
+
+	/* Guest registers saved in the context. */
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
+	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
+}
+#endif
+#else
+/* No-op stand-in used when the kernel is built without KTR. */
+static void __inline
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+	/* Nothing to trace. */
+}
+#endif /* KTR */
+
+/*
+ * Return 'cr0' adjusted to the constraints recorded by vmx_init(): bits in
+ * cr0_ones_mask are forced on and bits in cr0_zeros_mask are forced off.
+ */
+u_long
+vmx_fix_cr0(u_long cr0)
+{
+	u_long fixed;
+
+	fixed = cr0 | cr0_ones_mask;
+	fixed &= ~cr0_zeros_mask;
+	return (fixed);
+}
+
+/*
+ * Return 'cr4' adjusted to the constraints recorded by vmx_init(): bits in
+ * cr4_ones_mask are forced on and bits in cr4_zeros_mask are forced off.
+ */
+u_long
+vmx_fix_cr4(u_long cr4)
+{
+	u_long fixed;
+
+	fixed = cr4 | cr4_ones_mask;
+	fixed &= ~cr4_zeros_mask;
+	return (fixed);
+}
+
+/*
+ * Populate a guest MSR save area from the built-in template and report the
+ * number of entries written through 'g_count'.
+ *
+ * Panics if the template holds more than GUEST_MSR_MAX_ENTRIES entries.
+ */
+static void
+msr_save_area_init(struct msr_entry *g_area, int *g_count)
+{
+	/* MSRs tracked in every save area; all values start at zero. */
+	static struct msr_entry guest_msrs[] = {
+		{ MSR_KGSBASE, 0, 0 },
+	};
+	int nentries;
+
+	nentries = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
+	if (nentries > GUEST_MSR_MAX_ENTRIES)
+		panic("guest msr save area overrun");
+
+	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
+	*g_count = nentries;
+}
+
+/*
+ * Leave VMX operation on the current cpu.
+ *
+ * If VMXON succeeded on this cpu, flush all VPID- and EPT-tagged
+ * translations and execute VMXOFF.  CR4.VMXE is cleared unconditionally.
+ *
+ * NOTE(review): vmxon_enabled[curcpu] is not reset here.
+ */
+static void
+vmx_disable(void *arg __unused)
+{
+ struct invvpid_desc invvpid_desc = { 0 };
+ struct invept_desc invept_desc = { 0 };
+
+ if (vmxon_enabled[curcpu]) {
+ /*
+ * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
+ *
+ * VMXON or VMXOFF are not required to invalidate any TLB
+ * caching structures. Invalidating everything explicitly
+ * before VMXOFF prevents potential retention of cached
+ * information in the TLB between distinct VMX episodes.
+ */
+ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
+ invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
+ vmxoff();
+ }
+ load_cr4(rcr4() & ~CR4_VMXE);
+}
+
+/*
+ * Module teardown: invoke vmx_disable() through smp_rendezvous().
+ * Always returns 0.
+ */
+static int
+vmx_cleanup(void)
+{
+
+ smp_rendezvous(NULL, vmx_disable, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * Enter VMX operation on the current cpu.
+ *
+ * Sets CR4.VMXE, stamps this cpu's VMXON region with the VMX revision
+ * and executes VMXON.  vmxon_enabled[curcpu] is set only when VMXON
+ * succeeds.
+ *
+ * NOTE(review): a VMXON failure is not reported to the caller; it is only
+ * observable through vmxon_enabled[] remaining 0.
+ */
+static void
+vmx_enable(void *arg __unused)
+{
+ int error;
+
+ load_cr4(rcr4() | CR4_VMXE);
+
+ /* The first 32 bits of the VMXON region hold the revision identifier. */
+ *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
+ error = vmxon(vmxon_region[curcpu]);
+ if (error == 0)
+ vmxon_enabled[curcpu] = 1;
+}
+
+/*
+ * Module-wide VMX initialization.
+ *
+ * Verifies that the processor advertises VMX and that the feature control
+ * MSR has both the lock and VMXON-enable bits set, computes the control
+ * values used for every VMCS (pin-based, primary and secondary
+ * processor-based, exit and entry controls), probes the optional
+ * capabilities, initializes EPT, records the fixed CR0/CR4 bits and
+ * finally invokes vmx_enable() through smp_rendezvous().
+ *
+ * Returns 0 on success, ENXIO when VMX is unavailable or disabled, or the
+ * error returned by vmx_set_ctlreg() / ept_init().
+ */
+static int
+vmx_init(void)
+{
+ int error;
+ uint64_t fixed0, fixed1, feature_control;
+ uint32_t tmp;
+
+ /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
+ if (!(cpu_feature2 & CPUID2_VMX)) {
+ printf("vmx_init: processor does not support VMX operation\n");
+ return (ENXIO);
+ }
+
+ /*
+ * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
+ * are set (bits 0 and 2 respectively).
+ */
+ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if ((feature_control & 0x5) != 0x5) {
+ printf("vmx_init: VMX operation disabled by BIOS\n");
+ return (ENXIO);
+ }
+
+ /* Check support for primary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_CTLS_ONE_SETTING,
+ PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired primary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Clear the processor-based ctl bits that are set on demand */
+ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
+
+ /* Check support for secondary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED_CTLS2_ONE_SETTING,
+ PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
+ if (error) {
+ printf("vmx_init: processor does not support desired secondary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VPID */
+ /* VPID is optional: a failed probe simply leaves the bit clear. */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_ENABLE_VPID, 0, &tmp);
+ if (error == 0)
+ procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
+
+ /* Check support for pin-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
+ MSR_VMX_TRUE_PINBASED_CTLS,
+ PINBASED_CTLS_ONE_SETTING,
+ PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "pin-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VM-exit controls */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ /* Try again without the PAT MSR bits */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
+ MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "exit controls\n");
+ return (error);
+ } else {
+ if (bootverbose)
+ printf("vmm: PAT MSR access not supported\n");
+ guest_msr_valid(MSR_PAT);
+ /* Remembered so the entry controls and MSR bitmap skip PAT too. */
+ vmx_no_patmsr = 1;
+ }
+ }
+
+ /* Check support for VM-entry controls */
+ if (!vmx_no_patmsr) {
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
+ MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING,
+ VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ } else {
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
+ MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
+ VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ }
+
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "entry controls\n");
+ return (error);
+ }
+
+ /*
+ * Check support for optional features by testing them
+ * as individual bits
+ */
+ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_HLT_EXITING, 0,
+ &tmp) == 0);
+
+ /*
+ * NOTE(review): unlike the neighbouring probes this one passes
+ * MSR_VMX_PROCBASED_CTLS for both MSR arguments rather than the
+ * TRUE variant for the second -- confirm that this is intentional.
+ */
+ cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_PROCBASED_CTLS,
+ PROCBASED_MTF, 0,
+ &tmp) == 0);
+
+ cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_PAUSE_EXITING, 0,
+ &tmp) == 0);
+
+ cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_UNRESTRICTED_GUEST, 0,
+ &tmp) == 0);
+
+ /* Initialize EPT */
+ error = ept_init();
+ if (error) {
+ printf("vmx_init: ept initialization failed (%d)\n", error);
+ return (error);
+ }
+
+ /*
+ * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
+ */
+ /* A bit set in both FIXED MSRs must be 1; clear in both must be 0. */
+ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
+ cr0_ones_mask = fixed0 & fixed1;
+ cr0_zeros_mask = ~fixed0 & ~fixed1;
+
+ /*
+ * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
+ * if unrestricted guest execution is allowed.
+ */
+ if (cap_unrestricted_guest)
+ cr0_ones_mask &= ~(CR0_PG | CR0_PE);
+
+ /*
+ * Do not allow the guest to set CR0_NW or CR0_CD.
+ */
+ cr0_zeros_mask |= (CR0_NW | CR0_CD);
+
+ fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
+ cr4_ones_mask = fixed0 & fixed1;
+ cr4_zeros_mask = ~fixed0 & ~fixed1;
+
+ /* enable VMX operation */
+ /*
+ * NOTE(review): vmx_enable() records per-cpu success in
+ * vmxon_enabled[] but that result is not checked here, so 0 is
+ * returned even if VMXON failed on some cpu.
+ */
+ smp_rendezvous(NULL, vmx_enable, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * If this processor does not support VPIDs then simply return 0.
+ *
+ * Otherwise generate the next value of VPID to use. Any value is alright
+ * as long as it is non-zero.
+ *
+ * We always execute in VMX non-root context with EPT enabled. Thus all
+ * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This
+ * in turn means that multiple VMs can share the same VPID as long as
+ * they have distinct EPT page tables.
+ *
+ * XXX
+ * We should optimize this so that it returns VPIDs that are not in
+ * use. Then we will not unnecessarily invalidate mappings in
+ * vmx_set_pcpu_defaults() just because two or more vcpus happen to
+ * use the same 'vpid'.
+ */
+static uint16_t
+vmx_vpid(void)
+{
+	uint16_t id;
+
+	/* Without VPID support every caller gets 0. */
+	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0)
+		return (0);
+
+	/*
+	 * Take the next counter value, truncated to 16 bits, and skip
+	 * over 0 whenever the truncated value lands on it.
+	 */
+	for (;;) {
+		id = atomic_fetchadd_int(&nextvpid, 1);
+		if (id != 0)
+			return (id);
+	}
+}
+
+/*
+ * Program the guest/host mask and the read shadow for CR0 (which == 0) or
+ * CR4 (which == 4) in 'vmcs'.
+ *
+ * The mask covers every bit that is fixed to 0 or 1 and the shadow is set
+ * to the must-be-one bits.  Returns 0 on success or the first error from
+ * vmcs_setreg(); panics on any other value of 'which'.
+ */
+static int
+vmx_setup_cr_shadow(int which, struct vmcs *vmcs)
+{
+	const int is_cr0 = (which == 0);
+	int field_mask, field_shadow, rc;
+	uint64_t ones, zeros;
+
+	if (!is_cr0 && which != 4)
+		panic("vmx_setup_cr_shadow: unknown cr%d", which);
+
+	field_mask = is_cr0 ? VMCS_CR0_MASK : VMCS_CR4_MASK;
+	field_shadow = is_cr0 ? VMCS_CR0_SHADOW : VMCS_CR4_SHADOW;
+	ones = is_cr0 ? cr0_ones_mask : cr4_ones_mask;
+	zeros = is_cr0 ? cr0_zeros_mask : cr4_zeros_mask;
+
+	rc = vmcs_setreg(vmcs, VMCS_IDENT(field_mask), ones | zeros);
+	if (rc == 0)
+		rc = vmcs_setreg(vmcs, VMCS_IDENT(field_shadow), ones);
+
+	return (rc);
+}
+#define vmx_setup_cr0_shadow(vmcs) vmx_setup_cr_shadow(0, (vmcs))
+#define vmx_setup_cr4_shadow(vmcs) vmx_setup_cr_shadow(4, (vmcs))
+
+/*
+ * Allocate and initialize the per-VM VMX state.
+ *
+ * Sets up the MSR bitmap and then, for each of the VM_MAXCPU vcpus, clears
+ * its VMCS, loads the default VMCS contents, assigns a VPID, and programs
+ * the guest MSR save area and the CR0/CR4 masks and shadows.  Every
+ * failure along the way is fatal (panic).
+ *
+ * Returns the newly allocated 'struct vmx'.
+ */
+static void *
+vmx_vminit(struct vm *vm)
+{
+ uint16_t vpid;
+ int i, error, guest_msr_count;
+ struct vmx *vmx;
+
+ /* The allocation is zeroed and must be page aligned. */
+ vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
+ if ((uintptr_t)vmx & PAGE_MASK) {
+ panic("malloc of struct vmx not aligned on %d byte boundary",
+ PAGE_SIZE);
+ }
+ vmx->vm = vm;
+
+ /*
+ * Clean up EPTP-tagged guest physical and combined mappings
+ *
+ * VMX transitions are not required to invalidate any guest physical
+ * mappings. So, it may be possible for stale guest physical mappings
+ * to be present in the processor TLBs.
+ *
+ * Combined mappings for this EP4TA are also invalidated for all VPIDs.
+ */
+ ept_invalidate_mappings(vtophys(vmx->pml4ept));
+
+ msr_bitmap_initialize(vmx->msr_bitmap);
+
+ /*
+ * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
+ * The guest FSBASE and GSBASE are saved and restored during
+ * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
+ * always restored from the vmcs host state area on vm-exit.
+ *
+ * Guest KGSBASE is saved and restored in the guest MSR save area.
+ * Host KGSBASE is restored before returning to userland from the pcb.
+ * There will be a window of time when we are executing in the host
+ * kernel context with a value of KGSBASE from the guest. This is ok
+ * because the value of KGSBASE is inconsequential in kernel context.
+ *
+ * MSR_EFER is saved and restored in the guest VMCS area on a
+ * VM exit and entry respectively. It is also restored from the
+ * host VMCS area on a VM exit.
+ */
+ if (guest_msr_rw(vmx, MSR_GSBASE) ||
+ guest_msr_rw(vmx, MSR_FSBASE) ||
+ guest_msr_rw(vmx, MSR_KGSBASE) ||
+ guest_msr_rw(vmx, MSR_EFER))
+ panic("vmx_vminit: error setting guest msr access");
+
+ /*
+ * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
+ * and entry respectively. It is also restored from the host VMCS
+ * area on a VM exit. However, if running on a system with no
+ * MSR_PAT save/restore support, leave access disabled so accesses
+ * will be trapped.
+ */
+ if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
+ panic("vmx_vminit: error setting guest pat msr access");
+
+ /* Per-vcpu setup; every vcpu gets its own VMCS, VPID and MSR area. */
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vmx->vmcs[i].identifier = vmx_revision();
+ error = vmclear(&vmx->vmcs[i]);
+ if (error != 0) {
+ panic("vmx_vminit: vmclear error %d on vcpu %d\n",
+ error, i);
+ }
+
+ vpid = vmx_vpid();
+
+ /*
+ * vmx_longjmp and this vcpu's context are passed as the host
+ * RIP and host RSP arguments of vmcs_set_defaults().
+ */
+ error = vmcs_set_defaults(&vmx->vmcs[i],
+ (u_long)vmx_longjmp,
+ (u_long)&vmx->ctx[i],
+ vtophys(vmx->pml4ept),
+ pinbased_ctls,
+ procbased_ctls,
+ procbased_ctls2,
+ exit_ctls, entry_ctls,
+ vtophys(vmx->msr_bitmap),
+ vpid);
+
+ if (error != 0)
+ panic("vmx_vminit: vmcs_set_defaults error %d", error);
+
+ vmx->cap[i].set = 0;
+ vmx->cap[i].proc_ctls = procbased_ctls;
+
+ /* -1: this vcpu has no last host cpu recorded yet. */
+ vmx->state[i].lastcpu = -1;
+ vmx->state[i].vpid = vpid;
+
+ msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
+
+ error = vmcs_set_msr_save(&vmx->vmcs[i],
+ vtophys(vmx->guest_msrs[i]),
+ guest_msr_count);
+ if (error != 0)
+ panic("vmcs_set_msr_save error %d", error);
+
+ error = vmx_setup_cr0_shadow(&vmx->vmcs[i]);
+ if (error != 0)
+ panic("vmx_setup_cr0_shadow %d", error);
+
+ error = vmx_setup_cr4_shadow(&vmx->vmcs[i]);
+ if (error != 0)
+ panic("vmx_setup_cr4_shadow %d", error);
+ }
+
+ return (vmx);
+}
+
+/*
+ * Handle a CPUID VM exit by emulating the instruction for the guest.
+ *
+ * The saved guest %rax, %rbx, %rcx and %rdx slots in 'vmxctx' are handed to
+ * x86_emulate_cpuid() through uint32_t pointers, so the emulation reads its
+ * inputs from and stores its results into the saved guest context.
+ *
+ * Returns whatever x86_emulate_cpuid() returns.
+ */
+static int
+vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
+{
+	int handled;
+
+	handled = x86_emulate_cpuid(vm, vcpu,
+	    (uint32_t*)(&vmxctx->guest_rax),
+	    (uint32_t*)(&vmxctx->guest_rbx),
+	    (uint32_t*)(&vmxctx->guest_rcx),
+	    (uint32_t*)(&vmxctx->guest_rdx));
+	return (handled);
+}
+
+/*
+ * Trace the guest %rip (as read from the current VMCS by vmcs_guest_rip())
+ * just before the vcpu is resumed. Compiles to an empty function unless KTR
+ * is defined.
+ */
+static __inline void
+vmx_run_trace(struct vmx *vmx, int vcpu)
+{
+#ifdef KTR
+ VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
+#endif
+}
+
+/*
+ * Trace a VM exit: whether it was handled in the kernel, the exit reason
+ * (as text via exit_reason_to_str()) and the guest 'rip' at the exit.
+ * Compiles to an empty function unless KTR is defined.
+ */
+static __inline void
+vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
+ int handled)
+{
+#ifdef KTR
+ VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
+ handled ? "handled" : "unhandled",
+ exit_reason_to_str(exit_reason), rip);
+#endif
+}
+
+/*
+ * Trace a return from guest context that was caused by a pending AST,
+ * recording the guest 'rip'. Compiles to an empty function unless KTR is
+ * defined.
+ */
+static __inline void
+vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
+{
+#ifdef KTR
+ VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
+#endif
+}
+
+/*
+ * Refresh the host-cpu specific parts of the current VMCS for 'vcpu'.
+ *
+ * The host cpu the vcpu last ran on is always updated to 'curcpu'. If the
+ * vcpu is still on the same host cpu nothing else is done. Otherwise the
+ * migration is counted, the host TR, GDTR and GS base fields are rewritten
+ * for this cpu and mappings tagged with the vcpu's vpid are invalidated.
+ *
+ * Returns 0 on success or the error from the first failing vmwrite().
+ */
+static int
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
+{
+	int error, prevcpu;
+	struct vmxstate *st;
+	struct invvpid_desc desc = { 0 };
+
+	st = &vmx->state[vcpu];
+	prevcpu = st->lastcpu;
+	st->lastcpu = curcpu;
+
+	/* Still on the same host cpu: the VMCS host state is current */
+	if (prevcpu == curcpu)
+		return (0);
+
+	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+
+	if ((error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase())) != 0)
+		return (error);
+
+	if ((error = vmwrite(VMCS_HOST_GDTR_BASE,
+	    vmm_get_host_gdtrbase())) != 0)
+		return (error);
+
+	if ((error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase())) != 0)
+		return (error);
+
+	/*
+	 * When VPIDs are in use (vpid != 0), drop every mapping tagged with
+	 * this vcpu's vpid on the current host cpu.
+	 *
+	 * The vcpu last executed on a different host cpu and nothing records
+	 * whether mappings for its vpid were invalidated during that run, so
+	 * whatever 'curcpu' still holds for this vpid has to be assumed
+	 * stale.
+	 *
+	 * This cost is paid only when the scheduler moves the thread backing
+	 * this vcpu to another host cpu. The invalidation covers mappings
+	 * tagged with the vpid for "all" EP4TAs.
+	 */
+	if (st->vpid != 0) {
+		desc.vpid = st->vpid;
+		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, desc);
+	}
+
+	return (0);
+}
+
+/*
+ * Advance the guest %rip in the current VMCS past the instruction that
+ * caused the exit, i.e. to 'vmexit->rip + vmexit->inst_length'.
+ *
+ * A vmwrite() failure panics; the panic message is prefixed "vmx_run".
+ */
+static void
+vm_exit_update_rip(struct vm_exit *vmexit)
+{
+	int error;
+
+	if ((error = vmwrite(VMCS_GUEST_RIP,
+	    vmexit->rip + vmexit->inst_length)) != 0)
+		panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+}
+
+/*
+ * Build-time check that PROCBASED_CTLS_ONE_SETTING includes the Interrupt
+ * Window Exiting bit, which 'procbased_ctls' is depended upon to have set.
+ */
+CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
+
+/*
+ * Turn on interrupt-window exiting for 'vcpu': set the bit in the cached
+ * primary processor-based controls and write the new value to the current
+ * VMCS. A vmwrite() failure panics.
+ */
+static void __inline
+vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+	struct vmxcap *cap;
+	int error;
+
+	cap = &vmx->cap[vcpu];
+	cap->proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
+
+	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, cap->proc_ctls);
+	if (error != 0)
+		panic("vmx_set_int_window_exiting: vmwrite error %d", error);
+}
+
+/*
+ * Turn off interrupt-window exiting for 'vcpu': clear the bit in the cached
+ * primary processor-based controls and write the new value to the current
+ * VMCS. A vmwrite() failure panics.
+ */
+static void __inline
+vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+	struct vmxcap *cap;
+	int error;
+
+	cap = &vmx->cap[vcpu];
+	cap->proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
+
+	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, cap->proc_ctls);
+	if (error != 0)
+		panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
+}
+
+/*
+ * Turn on NMI-window exiting for 'vcpu': set the bit in the cached primary
+ * processor-based controls and write the new value to the current VMCS.
+ * A vmwrite() failure panics.
+ */
+static void __inline
+vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+	struct vmxcap *cap;
+	int error;
+
+	cap = &vmx->cap[vcpu];
+	cap->proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
+
+	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, cap->proc_ctls);
+	if (error != 0)
+		panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
+}
+
+/*
+ * Turn off NMI-window exiting for 'vcpu': clear the bit in the cached
+ * primary processor-based controls and write the new value to the current
+ * VMCS. A vmwrite() failure panics.
+ */
+static void __inline
+vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+	struct vmxcap *cap;
+	int error;
+
+	cap = &vmx->cap[vcpu];
+	cap->proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
+
+	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, cap->proc_ctls);
+	if (error != 0)
+		panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
+}
+
+/*
+ * Inject a pending virtual NMI into 'vcpu', or arrange for it to be injected
+ * later.
+ *
+ * Returns 0 if no NMI is pending for the vcpu. Returns 1 if the NMI was
+ * written to the VM-entry interruption-information field (and the request
+ * cleared), and also if injection is currently blocked, in which case NMI
+ * window exiting is enabled and the request stays pending. The caller treats
+ * a non-zero return as "do not inject anything else on this entry".
+ *
+ * VMCS read/write failures panic.
+ */
+static int
+vmx_inject_nmi(struct vmx *vmx, int vcpu)
+{
+ int error;
+ uint64_t info, interruptibility;
+
+ /* Bail out if no NMI requested */
+ if (!vm_nmi_pending(vmx->vm, vcpu))
+ return (0);
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_nmi: vmread(interruptibility) %d",
+ error);
+ }
+ /* 'nmi_blocking_bits' is defined elsewhere in this file */
+ if (interruptibility & nmi_blocking_bits)
+ goto nmiblocked;
+
+ /*
+ * Inject the virtual NMI. The vector must be the NMI IDT entry
+ * or the VMCS entry check will fail.
+ */
+ info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
+ info |= IDT_NMI;
+
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);
+
+ VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");
+
+ /* Clear the request */
+ vm_nmi_clear(vmx->vm, vcpu);
+ return (1);
+
+nmiblocked:
+ /*
+ * Set the NMI Window Exiting execution control so we can inject
+ * the virtual NMI as soon as blocking condition goes away.
+ *
+ * The request is deliberately left pending; 1 is still returned so
+ * that the caller does not inject a lower priority interrupt instead.
+ */
+ vmx_set_nmi_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
+ return (1);
+}
+
+/*
+ * Decide what event, if any, to deliver to 'vcpu' on the next VM entry.
+ *
+ * In order: an event already programmed in the VM-entry
+ * interruption-information field is left alone; then a pending NMI is
+ * injected (or deferred); then the local apic is asked for a pending vector,
+ * which is injected if the guest can take it (RFLAGS.IF set and no
+ * STI/MOV-SS blocking) or deferred by enabling interrupt window exiting.
+ *
+ * VMCS read/write failures and out-of-range vectors panic.
+ */
+static void
+vmx_inject_interrupts(struct vmx *vmx, int vcpu)
+{
+ int error, vector;
+ uint64_t info, rflags, interruptibility;
+
+ /* Interruptibility bits that prevent delivery of a hardware interrupt */
+ const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;
+
+ /*
+ * If there is already an interrupt pending then just return.
+ *
+ * This could happen if an interrupt was injected on a prior
+ * VM entry but the actual entry into guest mode was aborted
+ * because of a pending AST.
+ */
+ error = vmread(VMCS_ENTRY_INTR_INFO, &info);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
+ if (info & VMCS_INTERRUPTION_INFO_VALID)
+ return;
+
+ /*
+ * NMI injection has priority so deal with those first
+ */
+ if (vmx_inject_nmi(vmx, vcpu))
+ return;
+
+ /* Ask the local apic for a vector to inject */
+ vector = lapic_pending_intr(vmx->vm, vcpu);
+ if (vector < 0)
+ return;
+
+ /* Only vectors 32..255 are accepted from the local apic */
+ if (vector < 32 || vector > 255)
+ panic("vmx_inject_interrupts: invalid vector %d\n", vector);
+
+ /* Check RFLAGS.IF and the interruptibility state of the guest */
+ error = vmread(VMCS_GUEST_RFLAGS, &rflags);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(rflags) %d", error);
+
+ if ((rflags & PSL_I) == 0)
+ goto cantinject;
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_interrupts: vmread(interruptibility) %d",
+ error);
+ }
+ if (interruptibility & HWINTR_BLOCKED)
+ goto cantinject;
+
+ /* Inject the interrupt */
+ info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
+ info |= vector;
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);
+
+ /* Update the Local APIC ISR */
+ lapic_intr_accepted(vmx->vm, vcpu, vector);
+
+ VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
+
+ return;
+
+cantinject:
+ /*
+ * Set the Interrupt Window Exiting execution control so we can inject
+ * the interrupt as soon as blocking condition goes away.
+ *
+ * The vector has not been reported as accepted to the local apic on
+ * this path (lapic_intr_accepted() is not called).
+ */
+ vmx_set_int_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
+}
+
+/*
+ * Emulate a guest "mov to %cr0" or "mov to %cr4" that caused a VM exit.
+ *
+ * 'exitqual' is decoded as follows by this function:
+ * bits 3:0 - control register number (only 0 and 4 are handled)
+ * bits 7:4 - must be zero, otherwise the access is not handled
+ * bits 11:8 - index of the general purpose register holding the new value
+ *
+ * The new value has the bits in the cr0/cr4 'ones_mask' forced to 1 and the
+ * bits in the 'zeros_mask' forced to 0 before it is written to the guest
+ * CR0/CR4 field of the current VMCS.
+ *
+ * Returns HANDLED on success and UNHANDLED for any access that is not a
+ * supported mov to %cr0/%cr4. VMCS read/write failures panic.
+ *
+ * NOTE(review): only the guest CR field is written here; no read shadow is
+ * updated in this function - confirm that is intended.
+ */
+static int
+vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+{
+ int error, cr, vmcs_guest_cr;
+ uint64_t regval, ones_mask, zeros_mask;
+ const struct vmxctx *vmxctx;
+
+ /* We only handle mov to %cr0 or %cr4 at this time */
+ if ((exitqual & 0xf0) != 0x00)
+ return (UNHANDLED);
+
+ cr = exitqual & 0xf;
+ if (cr != 0 && cr != 4)
+ return (UNHANDLED);
+
+ vmxctx = &vmx->ctx[vcpu];
+
+ /*
+ * We must use vmwrite() directly here because vmcs_setreg() will
+ * call vmclear(vmcs) as a side-effect which we certainly don't want.
+ *
+ * Fetch the source register value. The selector is 4 bits wide and
+ * every value 0..15 has a case below, so 'regval' is always assigned.
+ * Guest %rsp (index 4) lives in the VMCS rather than in 'vmxctx'.
+ */
+ switch ((exitqual >> 8) & 0xf) {
+ case 0:
+ regval = vmxctx->guest_rax;
+ break;
+ case 1:
+ regval = vmxctx->guest_rcx;
+ break;
+ case 2:
+ regval = vmxctx->guest_rdx;
+ break;
+ case 3:
+ regval = vmxctx->guest_rbx;
+ break;
+ case 4:
+ error = vmread(VMCS_GUEST_RSP, &regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: "
+ "error %d reading guest rsp", error);
+ }
+ break;
+ case 5:
+ regval = vmxctx->guest_rbp;
+ break;
+ case 6:
+ regval = vmxctx->guest_rsi;
+ break;
+ case 7:
+ regval = vmxctx->guest_rdi;
+ break;
+ case 8:
+ regval = vmxctx->guest_r8;
+ break;
+ case 9:
+ regval = vmxctx->guest_r9;
+ break;
+ case 10:
+ regval = vmxctx->guest_r10;
+ break;
+ case 11:
+ regval = vmxctx->guest_r11;
+ break;
+ case 12:
+ regval = vmxctx->guest_r12;
+ break;
+ case 13:
+ regval = vmxctx->guest_r13;
+ break;
+ case 14:
+ regval = vmxctx->guest_r14;
+ break;
+ case 15:
+ regval = vmxctx->guest_r15;
+ break;
+ }
+
+ /* Pick the fixed-bit masks and VMCS field for the target register */
+ if (cr == 0) {
+ ones_mask = cr0_ones_mask;
+ zeros_mask = cr0_zeros_mask;
+ vmcs_guest_cr = VMCS_GUEST_CR0;
+ } else {
+ ones_mask = cr4_ones_mask;
+ zeros_mask = cr4_zeros_mask;
+ vmcs_guest_cr = VMCS_GUEST_CR4;
+ }
+ /* Force the must-be-one bits on and the must-be-zero bits off */
+ regval |= ones_mask;
+ regval &= ~zeros_mask;
+ error = vmwrite(vmcs_guest_cr, regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: error %d writing cr%d",
+ error, cr);
+ }
+
+ return (HANDLED);
+}
+
+/*
+ * Try to handle an EPT violation in the kernel by emulating the faulting
+ * instruction as a local apic MMIO access.
+ *
+ * The fault is handled only if it is a data read or write (not an
+ * instruction fetch), the exit qualification 'ept_qual' reports both the
+ * guest-linear address and its translation as valid, the instruction at
+ * 'rip' can be fetched and decoded into 'vie', and 'gpa' lies within the
+ * page at DEFAULT_APIC_BASE.
+ *
+ * Returns HANDLED if the instruction was emulated successfully, UNHANDLED
+ * otherwise.
+ */
+static int
+vmx_ept_fault(struct vm *vm, int cpu,
+ uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
+ uint64_t cr3, uint64_t ept_qual, struct vie *vie)
+{
+ int read, write, error;
+
+ /* EPT violation on an instruction fetch doesn't make sense here */
+ if (ept_qual & EPT_VIOLATION_INST_FETCH)
+ return (UNHANDLED);
+
+ /* EPT violation must be a read fault or a write fault */
+ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
+ write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
+ if ((read | write) == 0)
+ return (UNHANDLED);
+
+ /*
+ * The EPT violation must have been caused by accessing a
+ * guest-physical address that is a translation of a guest-linear
+ * address.
+ */
+ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
+ (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
+ return (UNHANDLED);
+ }
+
+ /*
+ * Fetch, decode and emulate the faulting instruction
+ *
+ * NOTE(review): this runs before the local apic range check below,
+ * presumably so that 'vie' is populated for the caller even when the
+ * fault ends up UNHANDLED - confirm before reordering.
+ */
+ if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
+ return (UNHANDLED);
+
+ if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
+ return (UNHANDLED);
+
+ /*
+ * Check if this is a local apic access
+ */
+ if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
+ return (UNHANDLED);
+
+ error = vmm_emulate_instruction(vm, cpu, gpa, vie,
+ lapic_mmio_read, lapic_mmio_write, 0);
+
+ return (error ? UNHANDLED : HANDLED);
+}
+
+/*
+ * Process the VM exit described by 'vmexit' for 'vcpu'.
+ *
+ * The exit reason in 'vmexit->u.vmx.exit_reason' selects the handling.
+ * Exits that can be completed in the kernel are marked handled; for those
+ * the guest %rip is advanced past the exiting instruction, both in the VMCS
+ * and in 'vmexit'. Interrupt-window, external-interrupt and NMI-window
+ * exits are also handled but return early without touching %rip.
+ *
+ * For exits that are not handled, 'vmexit->exitcode' and its collateral are
+ * filled in for further processing by the caller; an exit that nobody
+ * claimed is reported as VM_EXITCODE_VMX.
+ *
+ * Returns non-zero if the exit was handled and the guest may be resumed,
+ * and 0 if the exit has to be passed on.
+ */
+static int
+vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+	int error, handled;
+	struct vmxctx *vmxctx;
+	uint32_t eax, ecx, edx;
+	uint64_t qual, gla, gpa, cr3, intr_info;
+
+	handled = 0;
+	vmxctx = &vmx->ctx[vcpu];
+	qual = vmexit->u.vmx.exit_qualification;
+	vmexit->exitcode = VM_EXITCODE_BOGUS;
+
+	switch (vmexit->u.vmx.exit_reason) {
+	case EXIT_REASON_CR_ACCESS:
+		handled = vmx_emulate_cr_access(vmx, vcpu, qual);
+		break;
+	case EXIT_REASON_RDMSR:
+		/* The MSR number is in the low 32 bits of guest %rcx */
+		ecx = vmxctx->guest_rcx;
+		error = emulate_rdmsr(vmx->vm, vcpu, ecx);
+		if (error) {
+			vmexit->exitcode = VM_EXITCODE_RDMSR;
+			vmexit->u.msr.code = ecx;
+		} else
+			handled = 1;
+		break;
+	case EXIT_REASON_WRMSR:
+		/* The value to write is %edx:%eax, the MSR number is %ecx */
+		eax = vmxctx->guest_rax;
+		ecx = vmxctx->guest_rcx;
+		edx = vmxctx->guest_rdx;
+		error = emulate_wrmsr(vmx->vm, vcpu, ecx,
+		    (uint64_t)edx << 32 | eax);
+		if (error) {
+			vmexit->exitcode = VM_EXITCODE_WRMSR;
+			vmexit->u.msr.code = ecx;
+			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
+		} else
+			handled = 1;
+		break;
+	case EXIT_REASON_HLT:
+		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
+		/*
+		 * If there is an event waiting to be injected then there is
+		 * no need to 'hlt'.
+		 */
+		error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
+		if (error)
+			panic("vmx_exit_process: vmread(intrinfo) %d", error);
+
+		if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
+			handled = 1;
+			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
+		} else
+			vmexit->exitcode = VM_EXITCODE_HLT;
+		break;
+	case EXIT_REASON_MTF:
+		vmexit->exitcode = VM_EXITCODE_MTRAP;
+		break;
+	case EXIT_REASON_PAUSE:
+		vmexit->exitcode = VM_EXITCODE_PAUSE;
+		break;
+	case EXIT_REASON_INTR_WINDOW:
+		vmx_clear_int_window_exiting(vmx, vcpu);
+		VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
+		/* FALLTHRU */
+	case EXIT_REASON_EXT_INTR:
+		/*
+		 * External interrupts serve only to cause VM exits and allow
+		 * the host interrupt handler to run.
+		 *
+		 * If this external interrupt triggers a virtual interrupt
+		 * to a VM, then that state will be recorded by the
+		 * host interrupt handler in the VM's softc. We will inject
+		 * this virtual interrupt during the subsequent VM enter.
+		 */
+
+		/*
+		 * This is special. We want to treat this as an 'handled'
+		 * VM-exit but not increment the instruction pointer.
+		 */
+		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
+		return (1);
+	case EXIT_REASON_NMI_WINDOW:
+		/* Exit to allow the pending virtual NMI to be injected */
+		vmx_clear_nmi_window_exiting(vmx, vcpu);
+		VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
+		return (1);
+	case EXIT_REASON_INOUT:
+		/* Decode the I/O instruction from the exit qualification */
+		vmexit->exitcode = VM_EXITCODE_INOUT;
+		vmexit->u.inout.bytes = (qual & 0x7) + 1;
+		vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
+		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
+		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
+		vmexit->u.inout.port = (uint16_t)(qual >> 16);
+		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
+		break;
+	case EXIT_REASON_CPUID:
+		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
+		break;
+	case EXIT_REASON_EPT_FAULT:
+		gla = vmcs_gla();
+		gpa = vmcs_gpa();
+		cr3 = vmcs_guest_cr3();
+		handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
+		    vmexit->rip, vmexit->inst_length,
+		    cr3, qual, &vmexit->u.paging.vie);
+		if (!handled) {
+			vmexit->exitcode = VM_EXITCODE_PAGING;
+			vmexit->u.paging.gpa = gpa;
+		}
+		break;
+	default:
+		break;
+	}
+
+	if (handled) {
+		/*
+		 * It is possible that control is returned to userland
+		 * even though we were able to handle the VM exit in the
+		 * kernel.
+		 *
+		 * In such a case we want to make sure that the userland
+		 * restarts guest execution at the instruction *after*
+		 * the one we just processed. Therefore we update the
+		 * guest rip in the VMCS and in 'vmexit'.
+		 */
+		vm_exit_update_rip(vmexit);
+		vmexit->rip += vmexit->inst_length;
+		vmexit->inst_length = 0;
+
+		/*
+		 * Special case for spinning up an AP - exit to userspace to
+		 * give the controlling process a chance to intercept and
+		 * spin up a thread for the AP.
+		 */
+		if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
+			handled = 0;
+	} else {
+		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
+			/*
+			 * If this VM exit was not claimed by anybody then
+			 * treat it as a generic VMX exit.
+			 */
+			vmexit->exitcode = VM_EXITCODE_VMX;
+			vmexit->u.vmx.error = 0;
+		} else {
+			/*
+			 * The exitcode and collateral have been populated.
+			 * The VM exit will be processed further in userland.
+			 */
+		}
+	}
+	return (handled);
+}
+
+/*
+ * Run 'vcpu' of the virtual machine 'arg' (a 'struct vmx *') starting at
+ * guest 'rip' until a VM exit occurs that cannot be handled here or an AST
+ * is pending.
+ *
+ * The vcpu's VMCS is made current with VMPTRLD on entry and released with
+ * VMCLEAR on every return path. Each iteration of the loop ticks the local
+ * apic timer, sets up event injection, enters the guest through
+ * vmx_setjmp()/vmx_launch()/vmx_resume() and then processes the resulting
+ * exit with vmx_exit_process().
+ *
+ * Returns 0 with the vcpu's 'struct vm_exit' describing why the guest
+ * stopped, or ENOEXEC if vmlaunch/vmresume failed, in which case the exit is
+ * reported as VM_EXITCODE_VMX with the VMCS instruction error.
+ */
+static int
+vmx_run(void *arg, int vcpu, register_t rip)
+{
+ int error, vie, rc, handled, astpending;
+ uint32_t exit_reason;
+ struct vmx *vmx;
+ struct vmxctx *vmxctx;
+ struct vmcs *vmcs;
+ struct vm_exit *vmexit;
+
+ vmx = arg;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ /* Every run starts with a launch; later entries use resume */
+ vmxctx->launched = 0;
+
+ astpending = 0;
+ vmexit = vm_exitinfo(vmx->vm, vcpu);
+
+ /*
+ * XXX Can we avoid doing this every time we do a vm run?
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * XXX
+ * We do this every time because we may setup the virtual machine
+ * from a different process than the one that actually runs it.
+ *
+ * If the life of a virtual machine was spent entirely in the context
+ * of a single process we could do this once in vmcs_set_defaults().
+ */
+ if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
+ panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);
+
+ if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+
+ if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
+ panic("vmx_run: error %d setting up pcpu defaults", error);
+
+ do {
+ lapic_timer_tick(vmx->vm, vcpu);
+ vmx_inject_interrupts(vmx, vcpu);
+ vmx_run_trace(vmx, vcpu);
+ /*
+ * vmx_setjmp() returns VMX_RETURN_DIRECT when called here and
+ * one of the other VMX_RETURN_* codes when control comes back
+ * from guest context or from a failed launch/resume.
+ */
+ rc = vmx_setjmp(vmxctx);
+#ifdef SETJMP_TRACE
+ vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
+#endif
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ if (vmxctx->launched == 0) {
+ vmxctx->launched = 1;
+ vmx_launch(vmxctx);
+ } else
+ vmx_resume(vmxctx);
+ panic("vmx_launch/resume should not return");
+ break;
+ case VMX_RETURN_LONGJMP:
+ break; /* vm exit */
+ case VMX_RETURN_AST:
+ astpending = 1;
+ break;
+ case VMX_RETURN_VMRESUME:
+ /*
+ * vmresume failed. The only failure that is recovered from
+ * is "vmresume with non-launched VMCS", by retrying with
+ * vmlaunch; anything else is reported as an error.
+ */
+ vie = vmcs_instruction_error();
+ if (vmxctx->launch_error == VM_FAIL_INVALID ||
+ vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
+ printf("vmresume error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+ goto err_exit;
+ }
+ vmx_launch(vmxctx); /* try to launch the guest */
+ panic("vmx_launch should not return");
+ break;
+ case VMX_RETURN_VMLAUNCH:
+ vie = vmcs_instruction_error();
+#if 1
+ printf("vmlaunch error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+#endif
+ goto err_exit;
+ default:
+ panic("vmx_setjmp returned %d", rc);
+ }
+
+ /* enable interrupts */
+ enable_intr();
+
+ /* collect some basic information for VM exit processing */
+ vmexit->rip = rip = vmcs_guest_rip();
+ vmexit->inst_length = vmexit_instruction_length();
+ vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
+ vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
+
+ /*
+ * An AST return is reported as a handled exit with a BOGUS
+ * exitcode and no instruction to skip, and ends the loop.
+ */
+ if (astpending) {
+ handled = 1;
+ vmexit->inst_length = 0;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+ vmx_astpending_trace(vmx, vcpu, rip);
+ break;
+ }
+
+ handled = vmx_exit_process(vmx, vcpu, vmexit);
+ vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
+
+ } while (handled);
+
+ /*
+ * If a VM exit has been handled then the exitcode must be BOGUS
+ * If a VM exit is not handled then the exitcode must not be BOGUS
+ */
+ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
+ (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
+ panic("Mismatch between handled (%d) and exitcode (%d)",
+ handled, vmexit->exitcode);
+ }
+
+ VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode);
+
+ /*
+ * XXX
+ * We need to do this to ensure that any VMCS state cached by the
+ * processor is flushed to memory. We need to do this in case the
+ * VM moves to a different cpu the next time it runs.
+ *
+ * Can we avoid doing this?
+ */
+ VMCLEAR(vmcs);
+ return (0);
+
+err_exit:
+ /* 'vie' was set by the failing launch/resume case before the jump */
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.exit_reason = (uint32_t)-1;
+ vmexit->u.vmx.exit_qualification = (uint32_t)-1;
+ vmexit->u.vmx.error = vie;
+ VMCLEAR(vmcs);
+ return (ENOEXEC);
+}
+
+/*
+ * Tear down the per-VM VMX state 'arg' (a 'struct vmx *'): clear the VMCS
+ * of vcpu 0, release the EPT state via ept_vmcleanup() and free the
+ * structure. A vmclear() failure panics.
+ */
+static void
+vmx_vmcleanup(void *arg)
+{
+	struct vmx *vmx;
+	int error;
+
+	vmx = arg;
+
+	/*
+	 * XXXSMP we also need to clear the VMCS active on the other vcpus.
+	 */
+	if ((error = vmclear(&vmx->vmcs[0])) != 0)
+		panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
+
+	ept_vmcleanup(vmx);
+	free(vmx, M_VMX);
+}
+
+/*
+ * Map a VM_REG_GUEST_* identifier to the slot in 'vmxctx' that holds that
+ * guest register.
+ *
+ * Returns a pointer to the slot, or NULL if 'reg' is not one of the
+ * registers kept in 'vmxctx'.
+ */
+static register_t *
+vmxctx_regptr(struct vmxctx *vmxctx, int reg)
+{
+	register_t *slot;
+
+	slot = NULL;
+
+	switch (reg) {
+	case VM_REG_GUEST_RAX:
+		slot = &vmxctx->guest_rax;
+		break;
+	case VM_REG_GUEST_RBX:
+		slot = &vmxctx->guest_rbx;
+		break;
+	case VM_REG_GUEST_RCX:
+		slot = &vmxctx->guest_rcx;
+		break;
+	case VM_REG_GUEST_RDX:
+		slot = &vmxctx->guest_rdx;
+		break;
+	case VM_REG_GUEST_RSI:
+		slot = &vmxctx->guest_rsi;
+		break;
+	case VM_REG_GUEST_RDI:
+		slot = &vmxctx->guest_rdi;
+		break;
+	case VM_REG_GUEST_RBP:
+		slot = &vmxctx->guest_rbp;
+		break;
+	case VM_REG_GUEST_R8:
+		slot = &vmxctx->guest_r8;
+		break;
+	case VM_REG_GUEST_R9:
+		slot = &vmxctx->guest_r9;
+		break;
+	case VM_REG_GUEST_R10:
+		slot = &vmxctx->guest_r10;
+		break;
+	case VM_REG_GUEST_R11:
+		slot = &vmxctx->guest_r11;
+		break;
+	case VM_REG_GUEST_R12:
+		slot = &vmxctx->guest_r12;
+		break;
+	case VM_REG_GUEST_R13:
+		slot = &vmxctx->guest_r13;
+		break;
+	case VM_REG_GUEST_R14:
+		slot = &vmxctx->guest_r14;
+		break;
+	case VM_REG_GUEST_R15:
+		slot = &vmxctx->guest_r15;
+		break;
+	default:
+		break;
+	}
+
+	return (slot);
+}
+
+/*
+ * Read guest register 'reg' from 'vmxctx' into '*retval'.
+ *
+ * Returns 0 on success, or EINVAL (leaving '*retval' untouched) if 'reg' is
+ * not kept in 'vmxctx'.
+ */
+static int
+vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
+{
+	register_t *slot;
+
+	slot = vmxctx_regptr(vmxctx, reg);
+	if (slot == NULL)
+		return (EINVAL);
+
+	*retval = *slot;
+	return (0);
+}
+
+/*
+ * Store 'val' into guest register 'reg' in 'vmxctx'.
+ *
+ * Returns 0 on success, or EINVAL (changing nothing) if 'reg' is not kept
+ * in 'vmxctx'.
+ */
+static int
+vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
+{
+	register_t *slot;
+
+	slot = vmxctx_regptr(vmxctx, reg);
+	if (slot == NULL)
+		return (EINVAL);
+
+	*slot = val;
+	return (0);
+}
+
+/*
+ * Read guest register 'reg' of 'vcpu' into '*retval'.
+ *
+ * Registers kept in the software context ('vmxctx') are read from there.
+ * All others are read from the VMCS through vmcs_getreg(), which is not
+ * permitted while the vcpu is running (panics).
+ *
+ * Returns 0 on success or the error from vmcs_getreg().
+ */
+static int
+vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
+{
+	struct vmx *vmx;
+	int error;
+
+	vmx = arg;
+
+	error = vmxctx_getreg(&vmx->ctx[vcpu], reg, retval);
+	if (error == 0)
+		return (0);
+
+	/*
+	 * Leave the VMCS alone while the vcpu is running.
+	 *
+	 * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause
+	 * the subsequent vmlaunch/vmresume to fail.
+	 */
+	if (vcpu_is_running(vmx->vm, vcpu))
+		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+	error = vmcs_getreg(&vmx->vmcs[vcpu], reg, retval);
+	return (error);
+}
+
+/*
+ * Set guest register 'reg' of 'vcpu' to 'val'.
+ *
+ * Registers kept in the software context ('vmxctx') are written there, even
+ * if the vcpu is running. All others are written to the VMCS through
+ * vmcs_setreg(), which is not permitted while the vcpu is running (panics).
+ *
+ * When EFER is written and the "load EFER" VM-entry control is in use, the
+ * "IA-32e mode guest" VM-entry control is updated to match EFER.LMA. A
+ * failure to read or write the VM-entry controls is reported to the caller
+ * rather than being ignored.
+ *
+ * Returns 0 on success or the error from vmcs_getreg()/vmcs_setreg().
+ */
+static int
+vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
+{
+	int error;
+	uint64_t ctls;
+	struct vmx *vmx = arg;
+
+	/*
+	 * XXX Allow caller to set contents of the guest registers saved in
+	 * the 'vmxctx' even though the vcpu might be running. We need this
+	 * specifically to support the rdmsr emulation that will set the
+	 * %eax and %edx registers during vm exit processing.
+	 */
+	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
+		return (0);
+
+	/*
+	 * If the vcpu is running then don't mess with the VMCS.
+	 *
+	 * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause
+	 * the subsequent vmlaunch/vmresume to fail.
+	 */
+	if (vcpu_is_running(vmx->vm, vcpu))
+		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+	error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * If the "load EFER" VM-entry control is 1 then the
+	 * value of EFER.LMA must be identical to "IA-32e mode guest"
+	 * bit in the VM-entry control.
+	 */
+	if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
+	    (reg == VM_REG_GUEST_EFER)) {
+		/* Do not modify 'ctls' unless it was actually read */
+		error = vmcs_getreg(&vmx->vmcs[vcpu],
+		    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
+		if (error != 0)
+			return (error);
+
+		if (val & EFER_LMA)
+			ctls |= VM_ENTRY_GUEST_LMA;
+		else
+			ctls &= ~VM_ENTRY_GUEST_LMA;
+
+		error = vmcs_setreg(&vmx->vmcs[vcpu],
+		    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
+	}
+
+	return (error);
+}
+
+/*
+ * Read the segment descriptor 'reg' of 'vcpu' into 'desc' by forwarding to
+ * vmcs_getdesc() on the vcpu's VMCS, and return its result.
+ *
+ * NOTE(review): unlike vmx_getreg() there is no vcpu_is_running() check
+ * here - confirm whether vmcs_getdesc() is safe on a running vcpu.
+ */
+static int
+vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+/*
+ * Set the segment descriptor 'reg' of 'vcpu' from 'desc' by forwarding to
+ * vmcs_setdesc() on the vcpu's VMCS, and return its result.
+ *
+ * NOTE(review): unlike vmx_setreg() there is no vcpu_is_running() check
+ * here - confirm whether vmcs_setdesc() is safe on a running vcpu.
+ */
+static int
+vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+/*
+ * Queue an event for delivery to 'vcpu' on its next VM entry.
+ *
+ * 'type' is an event type in the range [0, VM_EVENT_MAX) and is translated
+ * to the interruption type encoded in bits 10:8 of the VM-entry
+ * interruption-information field; 'vector' occupies the low bits. If
+ * 'code_valid' is non-zero, bit 11 is set and 'code' is written as the
+ * exception error code.
+ *
+ * Returns 0 on success, EINVAL if 'type' is out of range, EAGAIN if an
+ * event is already pending in the VMCS, or the error returned by
+ * vmcs_getreg()/vmcs_setreg().
+ */
+static int
+vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
+    int code_valid)
+{
+	int error;
+	uint64_t info;
+	struct vmx *vmx = arg;
+	struct vmcs *vmcs = &vmx->vmcs[vcpu];
+
+	static const uint32_t type_map[VM_EVENT_MAX] = {
+		0x1,		/* VM_EVENT_NONE */
+		0x0,		/* VM_HW_INTR */
+		0x2,		/* VM_NMI */
+		0x3,		/* VM_HW_EXCEPTION */
+		0x4,		/* VM_SW_INTR */
+		0x5,		/* VM_PRIV_SW_EXCEPTION */
+		0x6,		/* VM_SW_EXCEPTION */
+	};
+
+	/* Never index 'type_map' with a value outside the table */
+	if (type < 0 || type >= VM_EVENT_MAX)
+		return (EINVAL);
+
+	/*
+	 * If there is already an exception pending to be delivered to the
+	 * vcpu then report EAGAIN rather than overwriting it.
+	 */
+	error = vmcs_getreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
+	if (error)
+		return (error);
+
+	if (info & VMCS_INTERRUPTION_INFO_VALID)
+		return (EAGAIN);
+
+	info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
+	info |= VMCS_INTERRUPTION_INFO_VALID;
+	error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
+	if (error != 0)
+		return (error);
+
+	if (code_valid) {
+		error = vmcs_setreg(vmcs,
+		    VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
+		    code);
+	}
+	return (error);
+}
+
+/*
+ * Report whether the optional capability 'type' is currently enabled for
+ * 'vcpu'.
+ *
+ * Returns ENOENT, leaving '*retval' untouched, if 'type' is unknown or the
+ * corresponding cap_* flag says it is not available. Otherwise returns 0
+ * and sets '*retval' to 1 if bit 'type' is set in the vcpu's capability
+ * mask and to 0 if it is not.
+ */
+static int
+vmx_getcap(void *arg, int vcpu, int type, int *retval)
+{
+	struct vmx *vmx;
+	int available;
+
+	vmx = arg;
+
+	switch (type) {
+	case VM_CAP_HALT_EXIT:
+		available = (cap_halt_exit != 0);
+		break;
+	case VM_CAP_PAUSE_EXIT:
+		available = (cap_pause_exit != 0);
+		break;
+	case VM_CAP_MTRAP_EXIT:
+		available = (cap_monitor_trap != 0);
+		break;
+	case VM_CAP_UNRESTRICTED_GUEST:
+		available = (cap_unrestricted_guest != 0);
+		break;
+	default:
+		available = 0;
+		break;
+	}
+
+	if (!available)
+		return (ENOENT);
+
+	if (vmx->cap[vcpu].set & (1 << type))
+		*retval = 1;
+	else
+		*retval = 0;
+
+	return (0);
+}
+
+/*
+ * Enable ('val' non-zero) or disable ('val' zero) the optional capability
+ * 'type' for 'vcpu'.
+ *
+ * Each supported capability maps to one control bit ('flag') in either the
+ * primary or the secondary processor-based VM-execution controls ('reg').
+ * The new control value is written to the vcpu's VMCS between a VMPTRLD and
+ * a VMCLEAR. On success the cached primary controls (if that is the
+ * register being changed) and the per-vcpu capability mask are updated.
+ *
+ * Returns 0 on success, ENOENT if 'type' is unknown or not available, or
+ * the error from vmwrite().
+ */
+static int
+vmx_setcap(void *arg, int vcpu, int type, int val)
+{
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+ uint32_t baseval;
+ uint32_t *pptr;
+ int error;
+ int flag;
+ int reg;
+ int retval;
+
+ retval = ENOENT;
+ pptr = NULL;
+
+ /*
+ * 'baseval', 'flag' and 'reg' are assigned only in the cases that set
+ * 'retval' to 0, and are used only when 'retval' is 0.
+ */
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_HLT_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_MTF;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_PAUSE_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ /*
+ * The secondary controls have no per-vcpu cached copy: the
+ * base value is the global 'procbased_ctls2' and 'pptr' stays
+ * NULL.
+ */
+ if (cap_unrestricted_guest) {
+ retval = 0;
+ baseval = procbased_ctls2;
+ flag = PROCBASED2_UNRESTRICTED_GUEST;
+ reg = VMCS_SEC_PROC_BASED_CTLS;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (retval == 0) {
+ if (val) {
+ baseval |= flag;
+ } else {
+ baseval &= ~flag;
+ }
+ /* Make the vcpu's VMCS current just for this one write */
+ VMPTRLD(vmcs);
+ error = vmwrite(reg, baseval);
+ VMCLEAR(vmcs);
+
+ if (error) {
+ retval = error;
+ } else {
+ /*
+ * Update optional stored flags, and record
+ * setting
+ */
+ if (pptr != NULL) {
+ *pptr = baseval;
+ }
+
+ if (val) {
+ vmx->cap[vcpu].set |= (1 << type);
+ } else {
+ vmx->cap[vcpu].set &= ~(1 << type);
+ }
+ }
+ }
+
+ return (retval);
+}
+
+/*
+ * Operations table for the Intel VMX backend.
+ *
+ * The initializer is positional, so the entries below must stay in the
+ * same order as the members of 'struct vmm_ops'.
+ */
+struct vmm_ops vmm_ops_intel = {
+ vmx_init,
+ vmx_cleanup,
+ vmx_vminit,
+ vmx_run,
+ vmx_vmcleanup,
+ ept_vmmmap_set,
+ ept_vmmmap_get,
+ vmx_getreg,
+ vmx_setreg,
+ vmx_getdesc,
+ vmx_setdesc,
+ vmx_inject,
+ vmx_getcap,
+ vmx_setcap
+};
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
new file mode 100644
index 0000000..c7cd567
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -0,0 +1,120 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_H_
+#define _VMX_H_
+
+#include "vmcs.h"
+
+#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
+
+/*
+ * Per-vcpu software context: the guest registers that are kept here rather
+ * than in the VMCS, the host registers saved across guest execution, and
+ * the launch state of the vcpu's VMCS.
+ *
+ * NOTE(review): vmx_genassym.c and vmx_support.S are part of the same
+ * change, so the layout is presumably consumed by assembly code - confirm
+ * before reordering or resizing members.
+ */
+struct vmxctx {
+ register_t tmpstk[32]; /* vmx_return() stack */
+ register_t tmpstktop;
+
+ register_t guest_rdi; /* Guest state */
+ register_t guest_rsi;
+ register_t guest_rdx;
+ register_t guest_rcx;
+ register_t guest_r8;
+ register_t guest_r9;
+ register_t guest_rax;
+ register_t guest_rbx;
+ register_t guest_rbp;
+ register_t guest_r10;
+ register_t guest_r11;
+ register_t guest_r12;
+ register_t guest_r13;
+ register_t guest_r14;
+ register_t guest_r15;
+ register_t guest_cr2;
+
+ register_t host_r15; /* Host state */
+ register_t host_r14;
+ register_t host_r13;
+ register_t host_r12;
+ register_t host_rbp;
+ register_t host_rsp;
+ register_t host_rbx;
+ register_t host_rip;
+ /*
+ * XXX todo debug registers and fpu state
+ */
+
+ int launched; /* vmcs launch state */
+ int launch_error; /* checked after a failed vmlaunch/vmresume */
+};
+
+/* Per-vcpu optional capability state */
+struct vmxcap {
+ int set; /* bit (1 << type) set for each enabled capability */
+ uint32_t proc_ctls; /* cached primary processor-based controls */
+};
+
+/* Per-vcpu bookkeeping that is not part of the register context */
+struct vmxstate {
+ int lastcpu; /* host cpu that this 'vcpu' last ran on */
+ uint16_t vpid; /* vpid assigned to this vcpu; 0 skips invvpid */
+};
+
+/*
+ * virtual machine softc
+ *
+ * The leading members are ordered so that 'pml4ept', 'vmcs' and
+ * 'msr_bitmap' sit at page-aligned offsets and 'guest_msrs' at a 16-byte
+ * aligned offset; the CTASSERTs below enforce this at build time.
+ */
+struct vmx {
+ pml4_entry_t pml4ept[NPML4EPG];
+ struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
+ char msr_bitmap[PAGE_SIZE];
+ struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
+ struct vmxctx ctx[VM_MAXCPU]; /* one register context per vcpu */
+ struct vmxcap cap[VM_MAXCPU]; /* one capability state per vcpu */
+ struct vmxstate state[VM_MAXCPU]; /* one lastcpu/vpid record per vcpu */
+ struct vm *vm; /* the virtual machine this softc belongs to */
+};
+CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
+
+/* Values returned by vmx_setjmp(); each one is described below */
+#define VMX_RETURN_DIRECT 0
+#define VMX_RETURN_LONGJMP 1
+#define VMX_RETURN_VMRESUME 2
+#define VMX_RETURN_VMLAUNCH 3
+#define VMX_RETURN_AST 4
+/*
+ * vmx_setjmp() returns:
+ * - 0 when it returns directly
+ * - 1 when it returns from vmx_longjmp
+ * - 2 when it returns from vmx_resume (which would only be in the error case)
+ * - 3 when it returns from vmx_launch (which would only be in the error case)
+ * - 4 when it returns from vmx_resume or vmx_launch because of AST pending
+ */
+int vmx_setjmp(struct vmxctx *ctx);
+void vmx_longjmp(void); /* returns via vmx_setjmp */
+void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+
+/*
+ * NOTE(review): defined outside this header; presumably these return the
+ * given cr0/cr4 value adjusted for use under VMX - confirm at the definition.
+ */
+u_long vmx_fix_cr0(u_long cr0);
+u_long vmx_fix_cr4(u_long cr4);
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h
new file mode 100644
index 0000000..31f29f8
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_controls.h
@@ -0,0 +1,92 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CONTROLS_H_
+#define _VMX_CONTROLS_H_
+
+/* Pin-Based VM-Execution Controls */
+#define PINBASED_EXTINT_EXITING (1 << 0)
+#define PINBASED_NMI_EXITING (1 << 3)
+#define PINBASED_VIRTUAL_NMI (1 << 5)
+#define PINBASED_PREMPTION_TIMER (1 << 6)
+
+/* Primary Processor-Based VM-Execution Controls */
+#define PROCBASED_INT_WINDOW_EXITING (1 << 2)
+#define PROCBASED_TSC_OFFSET (1 << 3)
+#define PROCBASED_HLT_EXITING (1 << 7)
+#define PROCBASED_INVLPG_EXITING (1 << 9)
+#define PROCBASED_MWAIT_EXITING (1 << 10)
+#define PROCBASED_RDPMC_EXITING (1 << 11)
+#define PROCBASED_RDTSC_EXITING (1 << 12)
+#define PROCBASED_CR3_LOAD_EXITING (1 << 15)
+#define PROCBASED_CR3_STORE_EXITING (1 << 16)
+#define PROCBASED_CR8_LOAD_EXITING (1 << 19)
+#define PROCBASED_CR8_STORE_EXITING (1 << 20)
+#define PROCBASED_USE_TPR_SHADOW (1 << 21)
+#define PROCBASED_NMI_WINDOW_EXITING (1 << 22)
+#define PROCBASED_MOV_DR_EXITING (1 << 23)
+#define PROCBASED_IO_EXITING (1 << 24)
+#define PROCBASED_IO_BITMAPS (1 << 25)
+#define PROCBASED_MTF (1 << 27)
+#define PROCBASED_MSR_BITMAPS (1 << 28)
+#define PROCBASED_MONITOR_EXITING (1 << 29)
+#define PROCBASED_PAUSE_EXITING (1 << 30)
+#define PROCBASED_SECONDARY_CONTROLS (1U << 31) /* unsigned: 1 << 31 overflows int */
+
+/* Secondary Processor-Based VM-Execution Controls */
+#define PROCBASED2_VIRTUALIZE_APIC (1 << 0)
+#define PROCBASED2_ENABLE_EPT (1 << 1)
+#define PROCBASED2_DESC_TABLE_EXITING (1 << 2)
+#define PROCBASED2_ENABLE_RDTSCP (1 << 3)
+#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4)
+#define PROCBASED2_ENABLE_VPID (1 << 5)
+#define PROCBASED2_WBINVD_EXITING (1 << 6)
+#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7)
+#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10)
+
+/* VM Exit Controls */
+#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2)
+#define VM_EXIT_HOST_LMA (1 << 9)
+#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12)
+#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15)
+#define VM_EXIT_SAVE_PAT (1 << 18)
+#define VM_EXIT_LOAD_PAT (1 << 19)
+#define VM_EXIT_SAVE_EFER (1 << 20)
+#define VM_EXIT_LOAD_EFER (1 << 21)
+#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22)
+
+/* VM Entry Controls */
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2)
+#define VM_ENTRY_GUEST_LMA (1 << 9)
+#define VM_ENTRY_INTO_SMM (1 << 10)
+#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11)
+#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13)
+#define VM_ENTRY_LOAD_PAT (1 << 14)
+#define VM_ENTRY_LOAD_EFER (1 << 15)
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h
new file mode 100644
index 0000000..2e66443
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_cpufunc.h
@@ -0,0 +1,218 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CPUFUNC_H_
+#define _VMX_CPUFUNC_H_
+
+struct vmcs;
+
+/*
+ * Section 5.2 "Conventions" from Intel Architecture Manual 2B.
+ *
+ * error
+ * VMsucceed 0
+ * VMFailInvalid 1
+ * VMFailValid 2 see also VMCS VM-Instruction Error Field
+ *
+ * VMX_SET_ERROR_CODE is an inline-assembly fragment meant to be pasted
+ * immediately after a VMX instruction. It inspects the flags left by that
+ * instruction and stores the matching code in the operand named [error],
+ * which the enclosing asm statement has to declare:
+ * CF set -> 1, otherwise ZF set -> 2, otherwise 0.
+ * It defines the local labels 1, 2 and 3.
+ */
+#define VM_SUCCESS 0
+#define VM_FAIL_INVALID 1
+#define VM_FAIL_VALID 2
+#define VMX_SET_ERROR_CODE \
+ " jnc 1f;" \
+ " mov $1, %[error];" /* CF: error = 1 */ \
+ " jmp 3f;" \
+ "1: jnz 2f;" \
+ " mov $2, %[error];" /* ZF: error = 2 */ \
+ " jmp 3f;" \
+ "2: mov $0, %[error];" \
+ "3:"
+
+/*
+ * Execute 'vmxon' with the physical address of 'region' as its operand.
+ *
+ * returns 0 on success and non-zero on failure
+ * (the value is the one computed by VMX_SET_ERROR_CODE)
+ */
+static __inline int
+vmxon(char *region)
+{
+ int error;
+ uint64_t addr;
+
+ /* the instruction takes the physical address from memory ("m") */
+ addr = vtophys(region);
+ __asm __volatile("vmxon %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+
+ return (error);
+}
+
+/*
+ * Execute 'vmclear' with the physical address of 'vmcs' as its operand.
+ *
+ * returns 0 on success and non-zero on failure
+ * (the value is the one computed by VMX_SET_ERROR_CODE)
+ */
+static __inline int
+vmclear(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ /* the instruction takes the physical address from memory ("m") */
+ addr = vtophys(vmcs);
+ __asm __volatile("vmclear %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+/* Execute 'vmxoff'. The flags are not examined, so no status is returned. */
+static __inline void
+vmxoff(void)
+{
+
+ __asm __volatile("vmxoff");
+}
+
+/*
+ * Execute 'vmptrst' with '*addr' as its memory operand.
+ *
+ * '*addr' is listed as an "m" input even though the instruction stores to
+ * it; the "memory" clobber is what tells the compiler about that store.
+ * The flags are not examined, so no status is returned.
+ */
+static __inline void
+vmptrst(uint64_t *addr)
+{
+
+ __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory");
+}
+
+/*
+ * Execute 'vmptrld' with the physical address of 'vmcs' as its operand.
+ *
+ * Returns 0 on success and non-zero on failure (the value is the one
+ * computed by VMX_SET_ERROR_CODE).
+ */
+static __inline int
+vmptrld(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ /* the instruction takes the physical address from memory ("m") */
+ addr = vtophys(vmcs);
+ __asm __volatile("vmptrld %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+/*
+ * Execute 'vmwrite' to store 'val' in the VMCS field whose encoding is
+ * 'reg'. Both operands are passed in registers.
+ *
+ * Returns 0 on success and non-zero on failure (the value is the one
+ * computed by VMX_SET_ERROR_CODE).
+ */
+static __inline int
+vmwrite(uint64_t reg, uint64_t val)
+{
+ int error;
+
+ __asm __volatile("vmwrite %[val], %[reg];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [val] "r" (val), [reg] "r" (reg)
+ : "memory");
+
+ return (error);
+}
+
+/*
+ * Execute 'vmread' to fetch the VMCS field whose encoding is 'r' into
+ * '*addr'.
+ *
+ * '*addr' is listed as an "m" input even though the instruction stores to
+ * it; the "memory" clobber is what tells the compiler about that store.
+ *
+ * Returns 0 on success and non-zero on failure (the value is the one
+ * computed by VMX_SET_ERROR_CODE).
+ */
+static __inline int
+vmread(uint64_t r, uint64_t *addr)
+{
+ int error;
+
+ __asm __volatile("vmread %[r], %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [r] "r" (r), [addr] "m" (*addr)
+ : "memory");
+
+ return (error);
+}
+
+/*
+ * vmclear() wrapper that panics if the instruction fails.
+ *
+ * On success it also calls critical_exit(). Nothing in this function
+ * enters a critical section, so this presumably balances the
+ * critical_enter() performed by VMPTRLD() - NOTE(review): confirm that
+ * callers always pair the two.
+ */
+static void __inline
+VMCLEAR(struct vmcs *vmcs)
+{
+ int err;
+
+ err = vmclear(vmcs);
+ if (err != 0)
+ panic("%s: vmclear(%p) error %d", __func__, vmcs, err);
+
+ critical_exit();
+}
+
+/*
+ * vmptrld() wrapper that panics if the instruction fails.
+ *
+ * It calls critical_enter() first and returns with the critical section
+ * still held; the matching critical_exit() is presumably the one in
+ * VMCLEAR() - NOTE(review): confirm that callers always pair the two.
+ */
+static void __inline
+VMPTRLD(struct vmcs *vmcs)
+{
+ int err;
+
+ critical_enter();
+
+ err = vmptrld(vmcs);
+ if (err != 0)
+ panic("%s: vmptrld(%p) error %d", __func__, vmcs, err);
+}
+
+#define INVVPID_TYPE_ADDRESS 0UL
+#define INVVPID_TYPE_SINGLE_CONTEXT 1UL
+#define INVVPID_TYPE_ALL_CONTEXTS 2UL
+
+/*
+ * Descriptor passed by invvpid() as the memory operand of the 'invvpid'
+ * instruction. The CTASSERT checks that it is exactly 16 bytes.
+ */
+struct invvpid_desc {
+ uint16_t vpid;
+ uint16_t _res1;
+ uint32_t _res2;
+ uint64_t linear_addr;
+};
+CTASSERT(sizeof(struct invvpid_desc) == 16);
+
+/*
+ * Execute 'invvpid' with invalidation type 'type' (one of the
+ * INVVPID_TYPE_* values) and descriptor 'desc'.
+ *
+ * 'desc' is received by value, so the memory operand refers to this
+ * function's own copy of the descriptor. Panics if the instruction
+ * reports a failure.
+ */
+static void __inline
+invvpid(uint64_t type, struct invvpid_desc desc)
+{
+ int error;
+
+ __asm __volatile("invvpid %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invvpid error %d", error);
+}
+
+#define INVEPT_TYPE_SINGLE_CONTEXT 1UL
+#define INVEPT_TYPE_ALL_CONTEXTS 2UL
+/*
+ * Descriptor passed by invept() as the memory operand of the 'invept'
+ * instruction. The CTASSERT checks that it is exactly 16 bytes.
+ */
+struct invept_desc {
+ uint64_t eptp;
+ uint64_t _res;
+};
+CTASSERT(sizeof(struct invept_desc) == 16);
+
+/*
+ * Execute 'invept' with invalidation type 'type' (one of the
+ * INVEPT_TYPE_* values) and descriptor 'desc'.
+ *
+ * 'desc' is received by value, so the memory operand refers to this
+ * function's own copy of the descriptor. Panics if the instruction
+ * reports a failure.
+ */
+static void __inline
+invept(uint64_t type, struct invept_desc desc)
+{
+ int error;
+
+ __asm __volatile("invept %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invept error %d", error);
+}
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c
new file mode 100644
index 0000000..823a05d
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_genassym.c
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/assym.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmx.h"
+#include "vmx_cpufunc.h"
+
+/*
+ * Constants made available to assembly code. vmx_support.S includes
+ * "vmx_assym.s" and refers to these names for the 'struct vmxctx' field
+ * offsets, the VMX instruction status codes, the vmx_setjmp() return
+ * values and the thread flag bits examined by VMX_CHECK_AST.
+ */
+ASSYM(VMXCTX_TMPSTKTOP, offsetof(struct vmxctx, tmpstktop));
+ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi));
+ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi));
+ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx));
+ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx));
+ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8));
+ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9));
+ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax));
+ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx));
+ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp));
+ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10));
+ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11));
+ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12));
+ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13));
+ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14));
+ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15));
+ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2));
+
+ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15));
+ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14));
+ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13));
+ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12));
+ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp));
+ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp));
+ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
+ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
+
+ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));
+
+ASSYM(VM_SUCCESS, VM_SUCCESS);
+ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
+ASSYM(VM_FAIL_VALID, VM_FAIL_VALID);
+
+ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT);
+ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
+ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
+ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);
+ASSYM(VMX_RETURN_AST, VMX_RETURN_AST);
+
+ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
+ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
+ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
+ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
new file mode 100644
index 0000000..2aba63c
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -0,0 +1,172 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/cpufunc.h>
+
+#include "vmx_msr.h"
+
+/*
+ * Tell whether the capability MSR value 'msr_val' permits control bit
+ * 'bitpos' to be set to 1. The answer is taken from bit 'bitpos + 32',
+ * i.e. from the upper half of the MSR value.
+ */
+static boolean_t
+vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
+{
+	const uint64_t allowed1 = 1UL << (bitpos + 32);
+
+	return ((msr_val & allowed1) != 0 ? TRUE : FALSE);
+}
+
+/*
+ * Tell whether the capability MSR value 'msr_val' permits control bit
+ * 'bitpos' to be cleared to 0. The answer is taken from bit 'bitpos' in
+ * the lower half of the MSR value: a clear bit means 0 is allowed.
+ */
+static boolean_t
+vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
+{
+	const uint64_t fixed1 = msr_val & (1UL << bitpos);
+
+	return (fixed1 == 0 ? TRUE : FALSE);
+}
+
+/* Return the low 32 bits of the MSR_VMX_BASIC register. */
+uint32_t
+vmx_revision(void)
+{
+	uint64_t basic;
+
+	basic = rdmsr(MSR_VMX_BASIC);
+	return ((uint32_t)(basic & 0xffffffff));
+}
+
+/*
+ * Generate a bitmask to be used for the VMCS execution control fields.
+ *
+ * The caller specifies what bits should be set to one in 'ones_mask'
+ * and what bits should be set to zero in 'zeros_mask'. The don't-care
+ * bits are set to the default value. The default values are obtained
+ * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
+ * VMX Capabilities".
+ *
+ * 'ctl_reg' is the capability MSR for the control field and 'true_ctl_reg'
+ * its "true" counterpart; the latter is read only when bit 55 of
+ * MSR_VMX_BASIC is set.
+ *
+ * Returns zero on success and non-zero on error. EINVAL is returned when
+ * the two masks overlap or when a requested setting is not permitted by
+ * the capability MSR. '*retval' is written only on success.
+ */
+int
+vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+	       uint32_t zeros_mask, uint32_t *retval)
+{
+	int i;
+	uint32_t bit, ctl;
+	uint64_t val, trueval;
+	boolean_t true_ctls_avail, one_allowed, zero_allowed;
+
+	/* We cannot ask the same bit to be set to both '1' and '0' */
+	if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
+		return (EINVAL);
+
+	if (rdmsr(MSR_VMX_BASIC) & (1UL << 55))
+		true_ctls_avail = TRUE;
+	else
+		true_ctls_avail = FALSE;
+
+	val = rdmsr(ctl_reg);
+	if (true_ctls_avail)
+		trueval = rdmsr(true_ctl_reg);		/* step c */
+	else
+		trueval = val;				/* step a */
+
+	/*
+	 * Accumulate the result locally so that the caller's word is never
+	 * read before it has a value and is not left half-updated when we
+	 * bail out with EINVAL. Every one of the 32 bits is decided below.
+	 */
+	ctl = 0;
+	for (i = 0; i < 32; i++) {
+		/* unsigned on purpose: '1 << 31' overflows a signed int */
+		bit = 1U << i;
+
+		one_allowed = vmx_ctl_allows_one_setting(trueval, i);
+		zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
+
+		KASSERT(one_allowed || zero_allowed,
+			("invalid zero/one setting for bit %d of ctl 0x%0x, "
+			 "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
+
+		if (zero_allowed && !one_allowed) {		/* b(i),c(i) */
+			if (ones_mask & bit)
+				return (EINVAL);
+			ctl &= ~bit;
+		} else if (one_allowed && !zero_allowed) {	/* b(i),c(i) */
+			if (zeros_mask & bit)
+				return (EINVAL);
+			ctl |= bit;
+		} else {
+			if (zeros_mask & bit)	/* b(ii),c(ii) */
+				ctl &= ~bit;
+			else if (ones_mask & bit) /* b(ii), c(ii) */
+				ctl |= bit;
+			else if (!true_ctls_avail)
+				ctl &= ~bit;	/* b(iii) */
+			else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
+				ctl &= ~bit;
+			else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
+				ctl |= bit;
+			else {
+				panic("vmx_set_ctlreg: unable to determine "
+				      "correct value of ctl bit %d for msr "
+				      "0x%0x and true msr 0x%0x", i, ctl_reg,
+				      true_ctl_reg);
+			}
+		}
+	}
+
+	*retval = ctl;
+	return (0);
+}
+
+/*
+ * Fill the PAGE_SIZE byte MSR bitmap with ones. A set bit is the state
+ * msr_bitmap_change_access() uses for an access that has not been
+ * granted, so after this call no MSR access is granted.
+ */
+void
+msr_bitmap_initialize(char *bitmap)
+{
+
+ memset(bitmap, 0xff, PAGE_SIZE);
+}
+
+/*
+ * Update the read and write permission bits of 'msr' in 'bitmap'.
+ *
+ * 'access' is a combination of MSR_BITMAP_ACCESS_READ and
+ * MSR_BITMAP_ACCESS_WRITE. A granted access clears the corresponding bit
+ * and a withheld one sets it. The read bit lives at byte offset
+ * 'msr / 8' for MSRs 0 - 0x1FFF and at 1024 + '(msr - 0xC0000000) / 8'
+ * for MSRs 0xC0000000 - 0xC0001FFF; the write bit lives 2048 bytes
+ * after the read bit.
+ *
+ * Returns 0 on success and EINVAL if 'msr' is outside both ranges.
+ */
+int
+msr_bitmap_change_access(char *bitmap, u_int msr, int access)
+{
+	int rd_byte, wr_byte, mask;
+
+	if (msr <= 0x00001FFF) {
+		rd_byte = msr / 8;
+	} else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) {
+		rd_byte = 1024 + (msr - 0xC0000000) / 8;
+	} else {
+		return (EINVAL);
+	}
+
+	wr_byte = rd_byte + 2048;
+	mask = 1 << (msr & 0x7);
+
+	if ((access & MSR_BITMAP_ACCESS_READ) != 0)
+		bitmap[rd_byte] &= ~mask;
+	else
+		bitmap[rd_byte] |= mask;
+
+	if ((access & MSR_BITMAP_ACCESS_WRITE) != 0)
+		bitmap[wr_byte] &= ~mask;
+	else
+		bitmap[wr_byte] |= mask;
+
+	return (0);
+}
diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h
new file mode 100644
index 0000000..e6379a9
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.h
@@ -0,0 +1,78 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_MSR_H_
+#define _VMX_MSR_H_
+
+#define MSR_VMX_BASIC 0x480
+#define MSR_VMX_EPT_VPID_CAP 0x48C
+
+#define MSR_VMX_PROCBASED_CTLS 0x482
+#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E
+
+#define MSR_VMX_PINBASED_CTLS 0x481
+#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D
+
+#define MSR_VMX_PROCBASED_CTLS2 0x48B
+
+#define MSR_VMX_EXIT_CTLS 0x483
+#define MSR_VMX_TRUE_EXIT_CTLS 0x48f
+
+#define MSR_VMX_ENTRY_CTLS 0x484
+#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
+
+#define MSR_VMX_CR0_FIXED0 0x486
+#define MSR_VMX_CR0_FIXED1 0x487
+
+#define MSR_VMX_CR4_FIXED0 0x488
+#define MSR_VMX_CR4_FIXED1 0x489
+
+uint32_t vmx_revision(void);
+
+int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval);
+
+/*
+ * According to Section 21.10.4 "Software Access to Related Structures",
+ * changes to data structures pointed to by the VMCS must be made only when
+ * there is no logical processor with a current VMCS that points to the
+ * data structure.
+ *
+ * This pretty much limits us to configuring the MSR bitmap before VMCS
+ * initialization for SMP VMs. Unless of course we do it the hard way - which
+ * would involve some form of synchronization between the vcpus to vmclear
+ * all VMCSs that point to the bitmap.
+ */
+#define MSR_BITMAP_ACCESS_NONE 0x0
+#define MSR_BITMAP_ACCESS_READ 0x1
+#define MSR_BITMAP_ACCESS_WRITE 0x2
+#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE)
+void msr_bitmap_initialize(char *bitmap);
+int msr_bitmap_change_access(char *bitmap, u_int msr, int access);
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
new file mode 100644
index 0000000..4ba582a
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -0,0 +1,246 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+
+#include "vmx_assym.s"
+
+/*
+ * Disable interrupts before updating %rsp in VMX_CHECK_AST or
+ * VMX_GUEST_RESTORE.
+ *
+ * The location that %rsp points to is a 'vmxctx' and not a
+ * real stack so we don't want an interrupt handler to trash it
+ */
+#define VMX_DISABLE_INTERRUPTS cli
+
+/*
+ * If the thread hosting the vcpu has an ast pending then take care of it
+ * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST.
+ *
+ * Either TDF_ASTPENDING or TDF_NEEDRESCHED in the current thread's
+ * td_flags triggers the early return. %rax is used as scratch.
+ *
+ * Before calling vmx_return the stack pointer is moved to offset
+ * VMXCTX_TMPSTKTOP inside the 'vmxctx'. vmx_return reloads the saved host
+ * %rsp and return address, so control does not come back to the
+ * instruction after the 'callq'.
+ *
+ * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts
+ * are disabled.
+ */
+#define VMX_CHECK_AST \
+ movq PCPU(CURTHREAD),%rax; \
+ testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \
+ je 9f; \
+ movq $VMX_RETURN_AST,%rsi; \
+ movq %rdi,%rsp; \
+ addq $VMXCTX_TMPSTKTOP,%rsp; \
+ callq vmx_return; \
+9:
+
+/*
+ * Assumes that %rdi holds a pointer to the 'vmxctx'.
+ *
+ * On "return" all registers are updated to reflect guest state. The two
+ * exceptions are %rip and %rsp. These registers are atomically switched
+ * by hardware from the guest area of the vmcs.
+ *
+ * We modify %rsp to point to the 'vmxctx' so we can use it to restore
+ * host context in case of an error with 'vmlaunch' or 'vmresume'.
+ *
+ * %rsi doubles as scratch for loading the guest %cr2 and is reloaded with
+ * its guest value right afterwards. %rdi is the base register for every
+ * load, which is why it has to be restored last.
+ */
+#define VMX_GUEST_RESTORE \
+ movq %rdi,%rsp; \
+ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \
+ movq %rsi,%cr2; \
+ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \
+ movq VMXCTX_GUEST_RDX(%rdi),%rdx; \
+ movq VMXCTX_GUEST_RCX(%rdi),%rcx; \
+ movq VMXCTX_GUEST_R8(%rdi),%r8; \
+ movq VMXCTX_GUEST_R9(%rdi),%r9; \
+ movq VMXCTX_GUEST_RAX(%rdi),%rax; \
+ movq VMXCTX_GUEST_RBX(%rdi),%rbx; \
+ movq VMXCTX_GUEST_RBP(%rdi),%rbp; \
+ movq VMXCTX_GUEST_R10(%rdi),%r10; \
+ movq VMXCTX_GUEST_R11(%rdi),%r11; \
+ movq VMXCTX_GUEST_R12(%rdi),%r12; \
+ movq VMXCTX_GUEST_R13(%rdi),%r13; \
+ movq VMXCTX_GUEST_R14(%rdi),%r14; \
+ movq VMXCTX_GUEST_R15(%rdi),%r15; \
+ movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */
+
+/*
+ * Translate the flags left by a 'vmlaunch' or 'vmresume' that fell
+ * through into VM_FAIL_INVALID (CF set), VM_FAIL_VALID (ZF set) or
+ * VM_SUCCESS, using 'reg' as a 32-bit scratch register, and store the
+ * result in the 'launch_error' field of the 'vmxctx' that %rsp points to.
+ *
+ * Defines the local labels 1, 2 and 3.
+ */
+#define VM_INSTRUCTION_ERROR(reg) \
+ jnc 1f; \
+ movl $VM_FAIL_INVALID,reg; /* CF is set */ \
+ jmp 3f; \
+1: jnz 2f; \
+ movl $VM_FAIL_VALID,reg; /* ZF is set */ \
+ jmp 3f; \
+2: movl $VM_SUCCESS,reg; \
+3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
+
+ .text
+/*
+ * int vmx_setjmp(ctxp)
+ * %rdi = ctxp
+ *
+ * Saves %r12-%r15, %rbp, %rbx, %rsp and the caller's return address in
+ * the host area of the 'vmxctx' so that vmx_return can later resume
+ * execution at the caller.
+ *
+ * Return value is '0' (VMX_RETURN_DIRECT) when it returns directly from
+ * here.
+ * Return value is '1' (VMX_RETURN_LONGJMP) when it returns after a vm exit
+ * through vmx_longjmp.
+ * Return value is VMX_RETURN_VMRESUME or VMX_RETURN_VMLAUNCH when
+ * vmx_resume or vmx_launch falls through its VMX instruction, and
+ * VMX_RETURN_AST when VMX_CHECK_AST finds an ast pending.
+ */
+ENTRY(vmx_setjmp)
+ movq (%rsp),%rax /* return address */
+ movq %r15,VMXCTX_HOST_R15(%rdi)
+ movq %r14,VMXCTX_HOST_R14(%rdi)
+ movq %r13,VMXCTX_HOST_R13(%rdi)
+ movq %r12,VMXCTX_HOST_R12(%rdi)
+ movq %rbp,VMXCTX_HOST_RBP(%rdi)
+ movq %rsp,VMXCTX_HOST_RSP(%rdi)
+ movq %rbx,VMXCTX_HOST_RBX(%rdi)
+ movq %rax,VMXCTX_HOST_RIP(%rdi)
+
+ /*
+ * XXX save host debug registers
+ */
+ movl $VMX_RETURN_DIRECT,%eax
+ ret
+END(vmx_setjmp)
+
+/*
+ * void vmx_return(struct vmxctx *ctxp, int retval)
+ * %rdi = ctxp
+ * %rsi = retval
+ * Return to vmm context through vmx_setjmp() with a value of 'retval'.
+ *
+ * The registers saved by vmx_setjmp are reloaded, including %rsp, and the
+ * saved host %rip is written to the top of that stack so that the final
+ * 'ret' resumes at vmx_setjmp's caller with 'retval' in %eax. Control
+ * never goes back to the code that called vmx_return.
+ */
+ENTRY(vmx_return)
+ /* Restore host context. */
+ movq VMXCTX_HOST_R15(%rdi),%r15
+ movq VMXCTX_HOST_R14(%rdi),%r14
+ movq VMXCTX_HOST_R13(%rdi),%r13
+ movq VMXCTX_HOST_R12(%rdi),%r12
+ movq VMXCTX_HOST_RBP(%rdi),%rbp
+ movq VMXCTX_HOST_RSP(%rdi),%rsp
+ movq VMXCTX_HOST_RBX(%rdi),%rbx
+ movq VMXCTX_HOST_RIP(%rdi),%rax
+ movq %rax,(%rsp) /* return address */
+
+ /*
+ * XXX restore host debug registers
+ */
+ movl %esi,%eax
+ ret
+END(vmx_return)
+
+/*
+ * void vmx_longjmp(void)
+ * %rsp points to the struct vmxctx
+ *
+ * Stores the guest general purpose registers and %cr2 in the 'vmxctx'
+ * and then returns through vmx_setjmp() with VMX_RETURN_LONGJMP.
+ * NOTE(review): presumably entered on vm exit via the host %rip
+ * programmed into the vmcs - confirm in vmx.c.
+ */
+ENTRY(vmx_longjmp)
+ /*
+ * Save guest state that is not automatically saved in the vmcs.
+ */
+ movq %rdi,VMXCTX_GUEST_RDI(%rsp)
+ movq %rsi,VMXCTX_GUEST_RSI(%rsp)
+ movq %rdx,VMXCTX_GUEST_RDX(%rsp)
+ movq %rcx,VMXCTX_GUEST_RCX(%rsp)
+ movq %r8,VMXCTX_GUEST_R8(%rsp)
+ movq %r9,VMXCTX_GUEST_R9(%rsp)
+ movq %rax,VMXCTX_GUEST_RAX(%rsp)
+ movq %rbx,VMXCTX_GUEST_RBX(%rsp)
+ movq %rbp,VMXCTX_GUEST_RBP(%rsp)
+ movq %r10,VMXCTX_GUEST_R10(%rsp)
+ movq %r11,VMXCTX_GUEST_R11(%rsp)
+ movq %r12,VMXCTX_GUEST_R12(%rsp)
+ movq %r13,VMXCTX_GUEST_R13(%rsp)
+ movq %r14,VMXCTX_GUEST_R14(%rsp)
+ movq %r15,VMXCTX_GUEST_R15(%rsp)
+
+ /* %rdi has been saved above and is free to use as scratch */
+ movq %cr2,%rdi
+ movq %rdi,VMXCTX_GUEST_CR2(%rsp)
+
+ /* arguments for vmx_return: ctxp and retval */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_LONGJMP,%rsi
+
+ /* switch to the temporary stack inside the 'vmxctx' for the call */
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_longjmp)
+
+/*
+ * void vmx_resume(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is a 'void' this function may return indirectly
+ * through vmx_setjmp() with a return value of 2 (VMX_RETURN_VMRESUME) if
+ * 'vmresume' falls through, or with VMX_RETURN_AST if VMX_CHECK_AST finds
+ * an ast pending.
+ */
+ENTRY(vmx_resume)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmresume
+
+ /*
+ * Capture the reason why vmresume failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMRESUME,%rsi
+
+ /* switch to the temporary stack inside the 'vmxctx' for the call */
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_resume)
+
+/*
+ * void vmx_launch(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is a 'void' this function may return indirectly
+ * through vmx_setjmp() with a return value of 3 (VMX_RETURN_VMLAUNCH) if
+ * 'vmlaunch' falls through, or with VMX_RETURN_AST if VMX_CHECK_AST finds
+ * an ast pending.
+ */
+ENTRY(vmx_launch)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmlaunch
+
+ /*
+ * Capture the reason why vmlaunch failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMLAUNCH,%rsi
+
+ /* switch to the temporary stack inside the 'vmxctx' for the call */
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_launch)
diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c
new file mode 100644
index 0000000..ef0e9bc
--- /dev/null
+++ b/sys/amd64/vmm/intel/vtd.c
@@ -0,0 +1,677 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+#include <machine/pci_cfgreg.h>
+
+#include "io/iommu.h"
+
+/*
+ * Documented in the "Intel Virtualization Technology for Directed I/O",
+ * Architecture Spec, September 2008.
+ */
+
+/*
+ * Section 10.4 "Register Descriptions"
+ *
+ * Overlay for the first registers of a remapping hardware unit. A pointer
+ * to this structure is obtained by mapping the unit's register base, so
+ * every field is volatile and every access is a device access.
+ * The offsets below follow from the field sizes.
+ */
+struct vtdmap {
+ volatile uint32_t version; /* 0x00: version */
+ volatile uint32_t res0; /* 0x04: reserved */
+ volatile uint64_t cap; /* 0x08: capabilities (VTD_CAP_*) */
+ volatile uint64_t ext_cap; /* 0x10: extended capabilities (VTD_ECAP_*) */
+ volatile uint32_t gcr; /* 0x18: global command (VTD_GCR_*) */
+ volatile uint32_t gsr; /* 0x1c: global status (VTD_GSR_*) */
+ volatile uint64_t rta; /* 0x20: root table address */
+ volatile uint64_t ccr; /* 0x28: context command (VTD_CCR_*) */
+};
+
+/*
+ * Field extractors for the capability register ('cap').
+ * SAGAW: bitmask of supported adjusted guest address widths
+ * ND: encoded number of supported domains (see vtd_max_domains())
+ * CM: caching mode
+ * SPS: bitmask of supported super page sizes
+ * RWBF: a write buffer flush is required (see vtd_wbflush())
+ */
+#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F)
+#define VTD_CAP_ND(cap) ((cap) & 0x7)
+#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1)
+#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF)
+#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1)
+
+/*
+ * Field extractors for the extended capability register ('ext_cap').
+ * DI: tested by vtd_add_device() to decide whether VTD_CTX_TT_ALL is
+ * set in a context entry
+ * COHERENCY: when clear, vtd_wbflush() flushes the CPU caches
+ * IRO: offset of the IOTLB registers in units of 16 bytes
+ */
+#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1)
+#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
+#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF)
+
+/*
+ * Bits in the 32-bit global command register ('gcr').
+ *
+ * The constants are unsigned: shifting a signed 1 into bit 31 is undefined
+ * behaviour, and these values are only ever combined with uint32_t
+ * registers.
+ */
+#define	VTD_GCR_WBF		(1U << 27)	/* write buffer flush */
+#define	VTD_GCR_SRTP		(1U << 30)	/* set root table pointer */
+#define	VTD_GCR_TE		(1U << 31)	/* translation enable */
+
+/*
+ * Bits in the 32-bit global status register ('gsr'); each one reports the
+ * state of the command bit at the same position in 'gcr'.
+ */
+#define	VTD_GSR_WBFS		(1U << 27)	/* write buffer flush status */
+#define	VTD_GSR_RTPS		(1U << 30)	/* root table pointer status */
+#define	VTD_GSR_TES		(1U << 31)	/* translation enable status */
+
+/* Context command register ('ccr') bits. */
+#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */
+#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */
+
+/*
+ * IOTLB invalidate register bits. The register is located via
+ * VTD_ECAP_IRO(); see vtd_iotlb_global_invalidate().
+ */
+#define VTD_IIR_IVT (1UL << 63) /* invalidate IOTLB */
+#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */
+#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */
+#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */
+#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */
+#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */
+#define VTD_IIR_DOMAIN_P 32
+
+/* Flags in the low 64 bits of a root entry and of a context entry. */
+#define VTD_ROOT_PRESENT 0x1
+#define VTD_CTX_PRESENT 0x1
+#define VTD_CTX_TT_ALL (1UL << 2)
+
+/* Page table entry permission/size bits and the address field mask. */
+#define VTD_PTE_RD (1UL << 0)
+#define VTD_PTE_WR (1UL << 1)
+#define VTD_PTE_SUPERPAGE (1UL << 7)
+#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL)
+
+/*
+ * Software state for one DMA remapping domain. Instances are allocated by
+ * vtd_create_domain(), linked on 'domhead' and released by
+ * vtd_destroy_domain().
+ */
+struct domain {
+ uint64_t *ptp; /* first level page table page */
+ int pt_levels; /* number of page table levels */
+ int addrwidth; /* 'AW' field in context entry */
+ int spsmask; /* supported super page sizes */
+ u_int id; /* domain id */
+ vm_paddr_t maxaddr; /* highest address to be mapped */
+ SLIST_ENTRY(domain) next; /* linkage on 'domhead' */
+};
+
+/* List of all domains created by vtd_create_domain(). */
+static SLIST_HEAD(, domain) domhead;
+
+/* Upper bound on the number of remapping units tracked in 'vtdmaps'. */
+#define DRHD_MAX_UNITS 8
+static int drhd_num; /* number of valid entries in 'vtdmaps' */
+static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; /* register blocks per unit */
+static int max_domains; /* domain id limit, set by vtd_init() */
+
+/*
+ * Chipset specific probe routine: fills in 'vtdmaps' and returns the number
+ * of remapping units it found.
+ */
+typedef int (*drhd_ident_func_t)(void);
+
+/*
+ * Statically allocated, page aligned translation structures.
+ *
+ * vtd_init() fills 'root_table' so that the entry for bus 'i' (two 64-bit
+ * words at index i * 2) points at ctx_tables[i]. Within a context table
+ * each slot/function pair also occupies two 64-bit words.
+ */
+static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+
+/* Malloc type for domains and their page table pages. */
+static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
+
+/*
+ * Probe for the VT-d units of an Intel 5520/5500 ("Tylersburg") IOH.
+ *
+ * Config space register definitions from the "Intel 5520 and 5500" datasheet.
+ *
+ * The device at 0/20/0 is checked for vendor 0x8086, device 0x342E. Bit 0 of
+ * config register 0x180 indicates whether the unit is enabled; the rest of
+ * the register holds the register base. When bit 25 of register 0x9C
+ * reports a dual IOH configuration the same check is repeated on the bus
+ * number read from register 0x160.
+ *
+ * Each enabled unit is appended to 'vtdmaps'. Returns the number of units
+ * recorded (0, 1 or 2).
+ */
+static int
+tylersburg_vtd_ident(void)
+{
+	const int bus = 0;
+	const int slot = 20;
+	const int func = 0;
+	uint32_t miscsts, vtbar;
+	uint16_t vid, did;
+	int nlbus, units;
+
+	units = 0;
+
+	vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2);
+	did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2);
+	if (!(vid == 0x8086 && did == 0x342E))
+		return (units);
+
+	/*
+	 * Check if this is a dual IOH configuration.
+	 */
+	miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4);
+	if ((miscsts & (1 << 25)) == 0)
+		nlbus = -1;
+	else
+		nlbus = pci_cfgregread(bus, slot, func, 0x160, 1);
+
+	/* Legacy IOH */
+	vtbar = pci_cfgregread(bus, slot, func, 0x180, 4);
+	if ((vtbar & 0x1) == 0) {
+		if (bootverbose)
+			printf("VT-d unit in legacy IOH is disabled!\n");
+	} else {
+		vtdmaps[units] = (struct vtdmap *)
+		    PHYS_TO_DMAP(vtbar & 0xffffe000);
+		units++;
+	}
+
+	if (nlbus == -1)
+		return (units);
+
+	/* Non-legacy IOH */
+	vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4);
+	if ((vtbar & 0x1) == 0) {
+		if (bootverbose)
+			printf("VT-d unit in non-legacy IOH is disabled!\n");
+	} else {
+		vtdmaps[units] = (struct vtdmap *)
+		    PHYS_TO_DMAP(vtbar & 0xffffe000);
+		units++;
+	}
+
+	return (units);
+}
+
+/*
+ * NULL terminated list of chipset probe routines. vtd_init() calls them in
+ * order and stops at the first one that reports at least one unit.
+ */
+static drhd_ident_func_t drhd_ident_funcs[] = {
+ tylersburg_vtd_ident,
+ NULL
+};
+
+/*
+ * Decode the 'ND' field of the capability register into the number of
+ * domain ids supported by the unit.
+ *
+ * Encodings 0 through 6 map to 16, 64, 256, 1024, 4096, 16384 and 65536
+ * respectively, i.e. 16 * 4^nd. Any other encoding causes a panic.
+ */
+static int
+vtd_max_domains(struct vtdmap *vtdmap)
+{
+	int nd;
+
+	nd = VTD_CAP_ND(vtdmap->cap);
+	if (nd > 6)
+		panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
+
+	/* Each step of 'nd' multiplies the domain count by four. */
+	return (16 << (2 * nd));
+}
+
+/*
+ * Return the lowest domain id that is not used by any domain on 'domhead'.
+ *
+ * Ids are searched in the range [1, max_domains); the function panics when
+ * every id in that range is taken.
+ */
+static u_int
+domain_id(void)
+{
+	struct domain *cur;
+	u_int candidate;
+	int in_use;
+
+	/* Skip domain id 0 - it is reserved when Caching Mode field is set */
+	candidate = 1;
+	while (candidate < max_domains) {
+		in_use = 0;
+		SLIST_FOREACH(cur, &domhead, next) {
+			if (cur->id == candidate) {
+				in_use = 1;
+				break;
+			}
+		}
+		if (!in_use)
+			break;		/* found it */
+		candidate++;
+	}
+
+	if (candidate >= max_domains)
+		panic("domain ids exhausted");
+
+	return (candidate);
+}
+
+/*
+ * Make updates to the in-memory translation structures visible to the
+ * remapping unit 'vtdmap'.
+ *
+ * - If the unit does not report coherent accesses the CPU caches are
+ *   flushed.
+ * - If the unit reports that write buffer flushing is required, a flush is
+ *   requested through the global command register and the function spins
+ *   until the status register shows it has completed.
+ *
+ * This function is reached from vtd_iotlb_global_invalidate(), which runs
+ * while translation is enabled. The command register holds persistent
+ * enable bits (such as VTD_GCR_TE) next to one-shot command bits, so
+ * writing only VTD_GCR_WBF would switch translation off. The current state
+ * is therefore read back from the status register, the one-shot bits are
+ * masked out and the flush command is OR'ed in, which is the update
+ * sequence given in the VT-d specification for the Global Command Register.
+ */
+static void
+vtd_wbflush(struct vtdmap *vtdmap)
+{
+	uint32_t cmd;
+
+	if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
+		pmap_invalidate_cache();
+
+	if (VTD_CAP_RWBF(vtdmap->cap)) {
+		/*
+		 * 0x96FFFFFF clears the one-shot bits 30, 29, 27 and 24 and
+		 * keeps the persistent enables.
+		 */
+		cmd = vtdmap->gsr & 0x96FFFFFFU;
+		vtdmap->gcr = cmd | VTD_GCR_WBF;
+		while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
+			;
+	}
+}
+
+/*
+ * Request a global invalidation of the context cache of unit 'vtdmap' and
+ * spin until the VTD_CCR_ICC bit reads back as zero.
+ */
+static void
+vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
+{
+
+ vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
+ while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
+ ;
+}
+
+/*
+ * Request a global IOTLB invalidation on unit 'vtdmap', draining pending
+ * DMA reads and writes, and spin until the VTD_IIR_IVT bit reads back as
+ * zero.
+ *
+ * The IOTLB invalidate register is not part of 'struct vtdmap'; it lives
+ * 8 bytes past the offset advertised by VTD_ECAP_IRO() (which is in units
+ * of 16 bytes) from the start of the register block.
+ */
+static void
+vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
+{
+	volatile uint64_t *iotlb_reg, val;
+	int offset;
+
+	vtd_wbflush(vtdmap);
+
+	offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
+	iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
+
+	*iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
+	    VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
+
+	/* The IVT bit is polled until it is observed clear. */
+	do {
+		val = *iotlb_reg;
+	} while ((val & VTD_IIR_IVT) != 0);
+}
+
+/*
+ * Turn on DMA remapping on unit 'vtdmap' and spin until the status register
+ * reports that translation is enabled.
+ *
+ * NOTE(review): the command register is written with VTD_GCR_TE only, so
+ * any other bit in it is written as zero.
+ */
+static void
+vtd_translation_enable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = VTD_GCR_TE;
+ while ((vtdmap->gsr & VTD_GSR_TES) == 0)
+ ;
+}
+
+/*
+ * Turn off DMA remapping on unit 'vtdmap' by writing zero to the command
+ * register, then spin until the status register reports that translation
+ * is disabled.
+ */
+static void
+vtd_translation_disable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = 0;
+ while ((vtdmap->gsr & VTD_GSR_TES) != 0)
+ ;
+}
+
+/*
+ * Locate the remapping hardware and prepare the static translation tables.
+ *
+ * The chipset probe routines in 'drhd_ident_funcs' are tried in order until
+ * one of them reports at least one unit. 'max_domains' is derived from the
+ * capabilities of the first unit only, and the function panics if that unit
+ * has the Caching Mode capability set. Finally every root table entry is
+ * pointed at the context table of the corresponding bus.
+ *
+ * Returns 0 on success or ENXIO if no remapping unit was found.
+ */
+static int
+vtd_init(void)
+{
+	int i, units;
+	struct vtdmap *vtdmap;
+	vm_paddr_t ctx_paddr;
+
+	/*
+	 * Start from a defined value so that the check below is well
+	 * formed even if the probe table has no entries.
+	 */
+	units = 0;
+	for (i = 0; drhd_ident_funcs[i] != NULL; i++) {
+		units = (*drhd_ident_funcs[i])();
+		if (units > 0)
+			break;
+	}
+
+	if (units <= 0)
+		return (ENXIO);
+
+	drhd_num = units;
+	vtdmap = vtdmaps[0];
+
+	if (VTD_CAP_CM(vtdmap->cap) != 0)
+		panic("vtd_init: invalid caching mode");
+
+	max_domains = vtd_max_domains(vtdmap);
+
+	/*
+	 * Set up the root-table to point to the context-entry tables
+	 */
+	for (i = 0; i < 256; i++) {
+		ctx_paddr = vtophys(ctx_tables[i]);
+		if (ctx_paddr & PAGE_MASK)
+			panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
+
+		/* A root entry is two 64-bit words; only the low one is set. */
+		root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
+	}
+
+	return (0);
+}
+
+/*
+ * Counterpart of vtd_init(). Intentionally empty: vtd_init() only fills in
+ * statically allocated tables, so there is nothing to release.
+ */
+static void
+vtd_cleanup(void)
+{
+}
+
+/*
+ * Enable DMA remapping on every unit found by vtd_init().
+ *
+ * For each unit, in this order: flush pending updates to the in-memory
+ * tables, program the root table address and wait for the hardware to
+ * latch it, invalidate the context cache and the IOTLB, and finally turn
+ * translation on.
+ */
+static void
+vtd_enable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_wbflush(vtdmap);
+
+ /* Update the root table address */
+ vtdmap->rta = vtophys(root_table);
+ vtdmap->gcr = VTD_GCR_SRTP;
+ while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
+ ;
+
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+
+ vtd_translation_enable(vtdmap);
+ }
+}
+
+/*
+ * Disable DMA remapping on every unit found by vtd_init().
+ */
+static void
+vtd_disable(void)
+{
+	int unit;
+
+	for (unit = 0; unit < drhd_num; unit++)
+		vtd_translation_disable(vtdmaps[unit]);
+}
+
+/*
+ * Assign the PCI device bus/slot/func to the domain 'arg' (a 'struct
+ * domain *' returned by vtd_create_domain()).
+ *
+ * The context entry of the device is located in ctx_tables[bus] at index
+ * (slot << 3 | func) * 2; each entry is two 64-bit words. The high word
+ * receives the address width and the domain id, the low word the physical
+ * address of the domain's top level page table, the translation type and
+ * the present bit.
+ *
+ * Panics if the bus/slot/func is out of range or if the context entry is
+ * already marked present.
+ */
+static void
+vtd_add_device(void *arg, int bus, int slot, int func)
+{
+ int idx;
+ uint64_t *ctxp;
+ struct domain *dom = arg;
+ vm_paddr_t pt_paddr;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+ panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ /* Only the capabilities of the first unit are consulted below. */
+ vtdmap = vtdmaps[0];
+ ctxp = ctx_tables[bus];
+ pt_paddr = vtophys(dom->ptp);
+ idx = (slot << 3 | func) * 2;
+
+ if (ctxp[idx] & VTD_CTX_PRESENT) {
+ panic("vtd_add_device: device %d/%d/%d is already owned by "
+ "domain %d", bus, slot, func,
+ (uint16_t)(ctxp[idx + 1] >> 8));
+ }
+
+ /*
+ * Order is important. The 'present' bit is set only after all fields
+ * of the context pointer are initialized.
+ */
+ ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
+
+ if (VTD_ECAP_DI(vtdmap->ext_cap))
+ ctxp[idx] = VTD_CTX_TT_ALL;
+ else
+ ctxp[idx] = 0;
+
+ ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
+
+ /*
+ * 'Not Present' entries are not cached in either the Context Cache
+ * or in the IOTLB, so there is no need to invalidate either of them.
+ */
+}
+
+/*
+ * Detach the PCI device bus/slot/func from whatever domain it belongs to.
+ *
+ * The context entry of the device is cleared and the context cache and
+ * IOTLB of every unit are globally invalidated. 'arg' is the domain handle
+ * passed by the caller; it is not needed to locate the context entry and
+ * is unused.
+ *
+ * Panics if the bus/slot/func is out of range.
+ */
+static void
+vtd_remove_device(void *arg, int bus, int slot, int func)
+{
+	int i, idx;
+	uint64_t *ctxp;
+	struct vtdmap *vtdmap;
+
+	if (bus < 0 || bus > PCI_BUSMAX ||
+	    slot < 0 || slot > PCI_SLOTMAX ||
+	    func < 0 || func > PCI_FUNCMAX)
+		panic("vtd_remove_device: invalid bsf %d/%d/%d",
+		    bus, slot, func);
+
+	ctxp = ctx_tables[bus];
+	idx = (slot << 3 | func) * 2;
+
+	/*
+	 * Order is important. The 'present' bit must be cleared first.
+	 */
+	ctxp[idx] = 0;
+	ctxp[idx + 1] = 0;
+
+	/*
+	 * Invalidate the Context Cache and the IOTLB.
+	 *
+	 * XXX use device-selective invalidation for Context Cache
+	 * XXX use domain-selective invalidation for IOTLB
+	 */
+	for (i = 0; i < drhd_num; i++) {
+		vtdmap = vtdmaps[i];
+		vtd_ctx_global_invalidate(vtdmap);
+		vtd_iotlb_global_invalidate(vtdmap);
+	}
+}
+
+/* Values for the 'remove' argument of vtd_update_mapping(). */
+#define CREATE_MAPPING 0
+#define REMOVE_MAPPING 1
+
+/*
+ * Create or remove a single translation in the page tables of the domain
+ * 'arg', starting at guest physical address 'gpa'.
+ *
+ * One call installs (or clears) exactly one leaf entry. The size of that
+ * entry is the largest page size that is supported by the domain, to which
+ * both 'gpa' and 'hpa' are aligned and which does not exceed 'len'.
+ * Intermediate page table pages are allocated on demand.
+ *
+ * 'gpa', 'hpa' and 'len' must be page aligned or the function panics.
+ * NOTE(review): the panic messages say "vtd_create_mapping" on the removal
+ * path as well.
+ *
+ * Returns the number of bytes covered by the entry that was updated, so the
+ * caller can advance and call again for the remainder of the region.
+ * The IOTLB is not invalidated here.
+ */
+static uint64_t
+vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
+ int remove)
+{
+ struct domain *dom;
+ int i, spshift, ptpshift, ptpindex, nlevels;
+ uint64_t spsize, *ptp;
+
+ dom = arg;
+ ptpindex = 0;
+ ptpshift = 0;
+
+ if (gpa & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
+
+ if (hpa & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);
+
+ if (len & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned len 0x%0lx", len);
+
+ /*
+ * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - supported super page size
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'len'
+ *
+ * Bit 'i' of 'spsmask' corresponds to a page shift of 21 + 9 * i, so
+ * the loop tries shifts 48, 39, 30 and 21 in that order. If none of
+ * them qualifies 'spshift' ends up as 12, i.e. a base page.
+ */
+ spshift = 48;
+ for (i = 3; i >= 0; i--) {
+ spsize = 1UL << spshift;
+ if ((dom->spsmask & (1 << i)) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ (len >= spsize)) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ /*
+ * Walk down from the top level table. At each level 'ptpshift' is the
+ * shift of the region covered by one entry and 'ptpindex' selects the
+ * entry for 'gpa' (9 bits per level).
+ */
+ ptp = dom->ptp;
+ nlevels = dom->pt_levels;
+ while (--nlevels >= 0) {
+ ptpshift = 12 + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift) {
+ break;
+ }
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create a downstream page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
+ ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR;
+ }
+
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
+ }
+
+ /* Sanity check: 'gpa' must be aligned to the size of the leaf entry. */
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0)
+ panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);
+
+ /*
+ * Update the 'gpa' -> 'hpa' mapping
+ */
+ if (remove) {
+ ptp[ptpindex] = 0;
+ } else {
+ ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;
+
+ /* A leaf above the last level is a super page. */
+ if (nlevels > 0)
+ ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
+ }
+
+ return (1UL << ptpshift);
+}
+
+/*
+ * Map 'gpa' to 'hpa' in the domain 'arg'. At most 'len' bytes are mapped by
+ * one call; the return value is the number of bytes actually mapped.
+ * See vtd_update_mapping().
+ */
+static uint64_t
+vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
+{
+
+ return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
+}
+
+/*
+ * Remove the mapping at 'gpa' from the domain 'arg'. At most 'len' bytes
+ * are unmapped by one call; the return value is the number of bytes covered
+ * by the entry that was cleared. See vtd_update_mapping().
+ */
+static uint64_t
+vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
+{
+
+ return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
+}
+
+/*
+ * Invalidate the IOTLB of every remapping unit.
+ *
+ * The domain handle 'dom' is currently ignored because a global
+ * invalidation is issued on each unit.
+ */
+static void
+vtd_invalidate_tlb(void *dom)
+{
+	int unit;
+
+	/*
+	 * Invalidate the IOTLB.
+	 * XXX use domain-selective invalidation for IOTLB
+	 */
+	for (unit = 0; unit < drhd_num; unit++)
+		vtd_iotlb_global_invalidate(vtdmaps[unit]);
+}
+
+/*
+ * Create a new DMA remapping domain able to map guest physical addresses
+ * up to 'maxaddr'.
+ *
+ * The function derives the address width needed for 'maxaddr', picks the
+ * smallest width supported by the first remapping unit that is large
+ * enough, allocates an empty top level page table page and links the new
+ * domain on 'domhead'.
+ *
+ * Panics if no remapping hardware is available, if the hardware does not
+ * support a sufficient address width or if the page table page is not page
+ * aligned.
+ *
+ * Returns an opaque handle (a 'struct domain *') for use with the other
+ * functions in this file.
+ */
+static void *
+vtd_create_domain(vm_paddr_t maxaddr)
+{
+ struct domain *dom;
+ vm_paddr_t addr;
+ int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
+ struct vtdmap *vtdmap;
+
+ if (drhd_num <= 0)
+ panic("vtd_create_domain: no dma remapping hardware available");
+
+ /* Only the capabilities of the first unit are consulted. */
+ vtdmap = vtdmaps[0];
+
+ /*
+ * Calculate AGAW.
+ * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
+ *
+ * NOTE(review): on exit from the loop 'gaw' is one more than the
+ * exponent of the last 'addr' computed, and a 'maxaddr' above 2^63
+ * would lead to a shift count of 64 - confirm callers never pass it.
+ */
+ addr = 0;
+ for (gaw = 0; addr < maxaddr; gaw++)
+ addr = 1ULL << gaw;
+
+ /* Round up so that (agaw - 12) is a multiple of 9; cap at 64. */
+ res = (gaw - 12) % 9;
+ if (res == 0)
+ agaw = gaw;
+ else
+ agaw = gaw + 9 - res;
+
+ if (agaw > 64)
+ agaw = 64;
+
+ /*
+ * Select the smallest Supported AGAW and the corresponding number
+ * of page table levels.
+ *
+ * Bit 'i' of the SAGAW field stands for a width of 30 + 9 * i bits
+ * (capped at 64), 2 + i page table levels and an 'AW' value of i.
+ */
+ pt_levels = 2;
+ sagaw = 30;
+ addrwidth = 0;
+ tmp = VTD_CAP_SAGAW(vtdmap->cap);
+ for (i = 0; i < 5; i++) {
+ if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
+ break;
+ pt_levels++;
+ addrwidth++;
+ sagaw += 9;
+ if (sagaw > 64)
+ sagaw = 64;
+ }
+
+ if (i >= 5) {
+ panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d",
+ VTD_CAP_SAGAW(vtdmap->cap), agaw);
+ }
+
+ dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
+ dom->pt_levels = pt_levels;
+ dom->addrwidth = addrwidth;
+ dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
+ dom->id = domain_id();
+ dom->maxaddr = maxaddr;
+ dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
+ if ((uintptr_t)dom->ptp & PAGE_MASK)
+ panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
+
+ SLIST_INSERT_HEAD(&domhead, dom, next);
+
+ return (dom);
+}
+
+/*
+ * Recursively free the page table page 'ptp' and every page table page
+ * reachable from it.
+ *
+ * 'level' is the number of levels at and below 'ptp'; pages at level 1
+ * contain only leaf entries and are not descended into. Entries without
+ * read or write permission, and super page entries, do not reference a
+ * lower level table and are skipped. The page is zeroed before it is
+ * handed back to the allocator.
+ */
+static void
+vtd_free_ptp(uint64_t *ptp, int level)
+{
+	uint64_t entry, *child;
+	int slot;
+
+	if (level > 1) {
+		for (slot = 0; slot < 512; slot++) {
+			entry = ptp[slot];
+			if ((entry & (VTD_PTE_RD | VTD_PTE_WR)) != 0 &&
+			    (entry & VTD_PTE_SUPERPAGE) == 0) {
+				child = (uint64_t *)
+				    PHYS_TO_DMAP(entry & VTD_PTE_ADDR_M);
+				vtd_free_ptp(child, level - 1);
+			}
+		}
+	}
+
+	bzero(ptp, PAGE_SIZE);
+	free(ptp, M_VTD);
+}
+
+/*
+ * Destroy the domain 'arg' created by vtd_create_domain(): unlink it from
+ * 'domhead', free its whole page table hierarchy and then the domain
+ * structure itself.
+ */
+static void
+vtd_destroy_domain(void *arg)
+{
+	struct domain *dom = arg;
+
+	SLIST_REMOVE(&domhead, dom, domain, next);
+	vtd_free_ptp(dom->ptp, dom->pt_levels);
+	free(dom, M_VTD);
+}
+
+/*
+ * Intel VT-d implementation of the iommu interface.
+ *
+ * The initializer is positional, so the order of the entries must match
+ * the member order of 'struct iommu_ops' (declared in "io/iommu.h").
+ */
+struct iommu_ops iommu_ops_intel = {
+ vtd_init,
+ vtd_cleanup,
+ vtd_enable,
+ vtd_disable,
+ vtd_create_domain,
+ vtd_destroy_domain,
+ vtd_create_mapping,
+ vtd_remove_mapping,
+ vtd_add_device,
+ vtd_remove_device,
+ vtd_invalidate_tlb,
+};
OpenPOWER on IntegriCloud