path: root/sys/amd64
author     neel <neel@FreeBSD.org>  2013-01-19 04:18:52 +0000
committer  neel <neel@FreeBSD.org>  2013-01-19 04:18:52 +0000
commit     363335d53e3c955602378aa434d0054c48a6e0d6 (patch)
tree       5af6fe77acd4da3002c907484fd64133deb95c8d /sys/amd64
parent     3600c83b820e00959d61600e67e9dcb32ef6b518 (diff)
parent     dde8bf641fc7c8e9541167cd7c01523973d0b569 (diff)
Merge projects/bhyve to head.
'bhyve' was developed by grehan@ and myself at NetApp (thanks!).

Special thanks to Peter Snyder, Joe Caradonna and Michael Dexter for
their support and encouragement.

Obtained from: NetApp
Diffstat (limited to 'sys/amd64')
-rw-r--r--  sys/amd64/include/vmm.h                       293
-rw-r--r--  sys/amd64/include/vmm_dev.h                   215
-rw-r--r--  sys/amd64/include/vmm_instruction_emul.h      113
-rw-r--r--  sys/amd64/vmm/amd/amdv.c                      265
-rw-r--r--  sys/amd64/vmm/intel/ept.c                     392
-rw-r--r--  sys/amd64/vmm/intel/ept.h                      43
-rw-r--r--  sys/amd64/vmm/intel/vmcs.c                    551
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h                    338
-rw-r--r--  sys/amd64/vmm/intel/vmx.c                    1845
-rw-r--r--  sys/amd64/vmm/intel/vmx.h                     120
-rw-r--r--  sys/amd64/vmm/intel/vmx_controls.h             92
-rw-r--r--  sys/amd64/vmm/intel/vmx_cpufunc.h             218
-rw-r--r--  sys/amd64/vmm/intel/vmx_genassym.c             89
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.c                 172
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.h                  78
-rw-r--r--  sys/amd64/vmm/intel/vmx_support.S             246
-rw-r--r--  sys/amd64/vmm/intel/vtd.c                     677
-rw-r--r--  sys/amd64/vmm/io/iommu.c                      277
-rw-r--r--  sys/amd64/vmm/io/iommu.h                       75
-rw-r--r--  sys/amd64/vmm/io/ppt.c                        610
-rw-r--r--  sys/amd64/vmm/io/ppt.h                         41
-rw-r--r--  sys/amd64/vmm/io/vdev.c                       270
-rw-r--r--  sys/amd64/vmm/io/vdev.h                        84
-rw-r--r--  sys/amd64/vmm/io/vlapic.c                     901
-rw-r--r--  sys/amd64/vmm/io/vlapic.h                     111
-rw-r--r--  sys/amd64/vmm/vmm.c                          1022
-rw-r--r--  sys/amd64/vmm/vmm_dev.c                       538
-rw-r--r--  sys/amd64/vmm/vmm_host.c                      124
-rw-r--r--  sys/amd64/vmm/vmm_host.h                       75
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c          810
-rw-r--r--  sys/amd64/vmm/vmm_ipi.c                        93
-rw-r--r--  sys/amd64/vmm/vmm_ipi.h                        39
-rw-r--r--  sys/amd64/vmm/vmm_ktr.h                        51
-rw-r--r--  sys/amd64/vmm/vmm_lapic.c                     201
-rw-r--r--  sys/amd64/vmm/vmm_lapic.h                      71
-rw-r--r--  sys/amd64/vmm/vmm_mem.c                       135
-rw-r--r--  sys/amd64/vmm/vmm_mem.h                        37
-rw-r--r--  sys/amd64/vmm/vmm_msr.c                       254
-rw-r--r--  sys/amd64/vmm/vmm_msr.h                        43
-rw-r--r--  sys/amd64/vmm/vmm_stat.c                      104
-rw-r--r--  sys/amd64/vmm/vmm_stat.h                       71
-rw-r--r--  sys/amd64/vmm/vmm_support.S                    42
-rw-r--r--  sys/amd64/vmm/vmm_util.c                      111
-rw-r--r--  sys/amd64/vmm/vmm_util.h                       40
-rw-r--r--  sys/amd64/vmm/x86.c                           202
-rw-r--r--  sys/amd64/vmm/x86.h                            64
46 files changed, 12243 insertions, 0 deletions
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
new file mode 100644
index 0000000..024c30e
--- /dev/null
+++ b/sys/amd64/include/vmm.h
@@ -0,0 +1,293 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $
+ */
+
+#ifndef _VMM_H_
+#define _VMM_H_
+
+#ifdef _KERNEL
+
+#define VM_MAX_NAMELEN 32
+
+struct vm;
+struct vm_memory_segment;
+struct seg_desc;
+struct vm_exit;
+struct vm_run;
+struct vlapic;
+
+enum x2apic_state;
+
+typedef int (*vmm_init_func_t)(void);
+typedef int (*vmm_cleanup_func_t)(void);
+typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
+typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip);
+typedef void (*vmi_cleanup_func_t)(void *vmi);
+typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa,
+ vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot,
+ boolean_t superpages_ok);
+typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa);
+typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
+ uint64_t *retval);
+typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
+ uint64_t val);
+typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
+ int type, int vector,
+ uint32_t code, int code_valid);
+typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
+typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
+
+struct vmm_ops {
+ vmm_init_func_t init; /* module wide initialization */
+ vmm_cleanup_func_t cleanup;
+
+ vmi_init_func_t vminit; /* vm-specific initialization */
+ vmi_run_func_t vmrun;
+ vmi_cleanup_func_t vmcleanup;
+ vmi_mmap_set_func_t vmmmap_set;
+ vmi_mmap_get_func_t vmmmap_get;
+ vmi_get_register_t vmgetreg;
+ vmi_set_register_t vmsetreg;
+ vmi_get_desc_t vmgetdesc;
+ vmi_set_desc_t vmsetdesc;
+ vmi_inject_event_t vminject;
+ vmi_get_cap_t vmgetcap;
+ vmi_set_cap_t vmsetcap;
+};
+
+extern struct vmm_ops vmm_ops_intel;
+extern struct vmm_ops vmm_ops_amd;
+
+struct vm *vm_create(const char *name);
+void vm_destroy(struct vm *vm);
+const char *vm_name(struct vm *vm);
+int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
+int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
+vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
+int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg);
+int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
+int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
+int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *ret_desc);
+int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc);
+int vm_get_pinning(struct vm *vm, int vcpu, int *cpuid);
+int vm_set_pinning(struct vm *vm, int vcpu, int cpuid);
+int vm_run(struct vm *vm, struct vm_run *vmrun);
+int vm_inject_event(struct vm *vm, int vcpu, int type,
+ int vector, uint32_t error_code, int error_code_valid);
+int vm_inject_nmi(struct vm *vm, int vcpu);
+int vm_nmi_pending(struct vm *vm, int vcpuid);
+void vm_nmi_clear(struct vm *vm, int vcpuid);
+uint64_t *vm_guest_msrs(struct vm *vm, int cpu);
+struct vlapic *vm_lapic(struct vm *vm, int cpu);
+int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
+int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
+int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
+int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
+void vm_activate_cpu(struct vm *vm, int vcpu);
+cpuset_t vm_active_cpus(struct vm *vm);
+struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
+
+/*
+ * Return 1 if device indicated by bus/slot/func is supposed to be a
+ * pci passthrough device.
+ *
+ * Return 0 otherwise.
+ */
+int vmm_is_pptdev(int bus, int slot, int func);
+
+void *vm_iommu_domain(struct vm *vm);
+
+enum vcpu_state {
+ VCPU_IDLE,
+ VCPU_RUNNING,
+ VCPU_CANNOT_RUN,
+};
+
+int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state);
+enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu);
+
+static int __inline
+vcpu_is_running(struct vm *vm, int vcpu)
+{
+ return (vcpu_get_state(vm, vcpu) == VCPU_RUNNING);
+}
+
+void *vcpu_stats(struct vm *vm, int vcpu);
+void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
+
+#endif /* _KERNEL */
+
+#include <machine/vmm_instruction_emul.h>
+
+#define VM_MAXCPU 8 /* maximum virtual cpus */
+
+/*
+ * Identifiers for events that can be injected into the VM
+ */
+enum vm_event_type {
+ VM_EVENT_NONE,
+ VM_HW_INTR,
+ VM_NMI,
+ VM_HW_EXCEPTION,
+ VM_SW_INTR,
+ VM_PRIV_SW_EXCEPTION,
+ VM_SW_EXCEPTION,
+ VM_EVENT_MAX
+};
+
+/*
+ * Identifiers for architecturally defined registers.
+ */
+enum vm_reg_name {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15,
+ VM_REG_GUEST_CR0,
+ VM_REG_GUEST_CR3,
+ VM_REG_GUEST_CR4,
+ VM_REG_GUEST_DR7,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RIP,
+ VM_REG_GUEST_RFLAGS,
+ VM_REG_GUEST_ES,
+ VM_REG_GUEST_CS,
+ VM_REG_GUEST_SS,
+ VM_REG_GUEST_DS,
+ VM_REG_GUEST_FS,
+ VM_REG_GUEST_GS,
+ VM_REG_GUEST_LDTR,
+ VM_REG_GUEST_TR,
+ VM_REG_GUEST_IDTR,
+ VM_REG_GUEST_GDTR,
+ VM_REG_GUEST_EFER,
+ VM_REG_LAST
+};
+
+/*
+ * Identifiers for optional vmm capabilities
+ */
+enum vm_cap_type {
+ VM_CAP_HALT_EXIT,
+ VM_CAP_MTRAP_EXIT,
+ VM_CAP_PAUSE_EXIT,
+ VM_CAP_UNRESTRICTED_GUEST,
+ VM_CAP_MAX
+};
+
+enum x2apic_state {
+ X2APIC_ENABLED,
+ X2APIC_AVAILABLE,
+ X2APIC_DISABLED,
+ X2APIC_STATE_LAST
+};
+
+/*
+ * The 'access' field has the format specified in Table 21-2 of the Intel
+ * Architecture Manual vol 3b.
+ *
+ * XXX The contents of the 'access' field are architecturally defined except
+ * bit 16 - Segment Unusable.
+ */
+struct seg_desc {
+ uint64_t base;
+ uint32_t limit;
+ uint32_t access;
+};
+
+enum vm_exitcode {
+ VM_EXITCODE_INOUT,
+ VM_EXITCODE_VMX,
+ VM_EXITCODE_BOGUS,
+ VM_EXITCODE_RDMSR,
+ VM_EXITCODE_WRMSR,
+ VM_EXITCODE_HLT,
+ VM_EXITCODE_MTRAP,
+ VM_EXITCODE_PAUSE,
+ VM_EXITCODE_PAGING,
+ VM_EXITCODE_SPINUP_AP,
+ VM_EXITCODE_MAX
+};
+
+struct vm_exit {
+ enum vm_exitcode exitcode;
+ int inst_length; /* 0 means unknown */
+ uint64_t rip;
+ union {
+ struct {
+ uint16_t bytes:3; /* 1 or 2 or 4 */
+ uint16_t in:1; /* out is 0, in is 1 */
+ uint16_t string:1;
+ uint16_t rep:1;
+ uint16_t port;
+ uint32_t eax; /* valid for out */
+ } inout;
+ struct {
+ uint64_t gpa;
+ struct vie vie;
+ } paging;
+ /*
+ * VMX specific payload. Used when there is no "better"
+ * exitcode to represent the VM-exit.
+ */
+ struct {
+ int error; /* vmx inst error */
+ uint32_t exit_reason;
+ uint64_t exit_qualification;
+ } vmx;
+ struct {
+ uint32_t code; /* ecx value */
+ uint64_t wval;
+ } msr;
+ struct {
+ int vcpu;
+ uint64_t rip;
+ } spinup_ap;
+ } u;
+};
+
+#endif /* _VMM_H_ */
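
The vmm_ops table above is the boundary between the hardware-independent vmm
code and a CPU-vendor backend (vmm_ops_intel or vmm_ops_amd). As a rough
sketch of how a backend plugs in (the null_* names are invented for
illustration and are not part of this commit; they simply mirror the
positional-initializer style that amdv.c uses further down):

static int
null_init(void)
{
	/* module-wide setup; return 0 on success */
	return (0);
}

static void *
null_vminit(struct vm *vm)
{
	/* allocate and return backend-private per-VM state */
	return (NULL);
}

static int
null_vmrun(void *vmi, int vcpu, register_t rip)
{
	/* enter the guest at 'rip' and report the reason for exiting */
	return (0);
}

struct vmm_ops vmm_ops_null = {
	null_init,		/* init */
	NULL,			/* cleanup */
	null_vminit,		/* vminit */
	null_vmrun,		/* vmrun */
	/* remaining members are zero-filled in this sketch */
};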
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
new file mode 100644
index 0000000..79f893d
--- /dev/null
+++ b/sys/amd64/include/vmm_dev.h
@@ -0,0 +1,215 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $
+ */
+
+#ifndef _VMM_DEV_H_
+#define _VMM_DEV_H_
+
+#ifdef _KERNEL
+void vmmdev_init(void);
+int vmmdev_cleanup(void);
+#endif
+
+struct vm_memory_segment {
+ vm_paddr_t gpa; /* in */
+ size_t len; /* in */
+};
+
+struct vm_register {
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ uint64_t regval;
+};
+
+struct vm_seg_desc { /* data or code segment */
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ struct seg_desc desc;
+};
+
+struct vm_pin {
+ int vm_cpuid;
+ int host_cpuid; /* -1 to unpin */
+};
+
+struct vm_run {
+ int cpuid;
+ uint64_t rip; /* start running here */
+ struct vm_exit vm_exit;
+};
+
+struct vm_event {
+ int cpuid;
+ enum vm_event_type type;
+ int vector;
+ uint32_t error_code;
+ int error_code_valid;
+};
+
+struct vm_lapic_irq {
+ int cpuid;
+ int vector;
+};
+
+struct vm_capability {
+ int cpuid;
+ enum vm_cap_type captype;
+ int capval;
+ int allcpus;
+};
+
+struct vm_pptdev {
+ int bus;
+ int slot;
+ int func;
+};
+
+struct vm_pptdev_mmio {
+ int bus;
+ int slot;
+ int func;
+ vm_paddr_t gpa;
+ vm_paddr_t hpa;
+ size_t len;
+};
+
+struct vm_pptdev_msi {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int numvec; /* 0 means disabled */
+ int vector;
+ int destcpu;
+};
+
+struct vm_pptdev_msix {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int idx;
+ uint32_t msg;
+ uint32_t vector_control;
+ uint64_t addr;
+};
+
+struct vm_nmi {
+ int cpuid;
+};
+
+#define MAX_VM_STATS 64
+struct vm_stats {
+ int cpuid; /* in */
+ int num_entries; /* out */
+ struct timeval tv;
+ uint64_t statbuf[MAX_VM_STATS];
+};
+
+struct vm_stat_desc {
+ int index; /* in */
+ char desc[128]; /* out */
+};
+
+struct vm_x2apic {
+ int cpuid;
+ enum x2apic_state state;
+};
+
+enum {
+ IOCNUM_RUN,
+ IOCNUM_SET_PINNING,
+ IOCNUM_GET_PINNING,
+ IOCNUM_MAP_MEMORY,
+ IOCNUM_GET_MEMORY_SEG,
+ IOCNUM_SET_REGISTER,
+ IOCNUM_GET_REGISTER,
+ IOCNUM_SET_SEGMENT_DESCRIPTOR,
+ IOCNUM_GET_SEGMENT_DESCRIPTOR,
+ IOCNUM_INJECT_EVENT,
+ IOCNUM_LAPIC_IRQ,
+ IOCNUM_SET_CAPABILITY,
+ IOCNUM_GET_CAPABILITY,
+ IOCNUM_BIND_PPTDEV,
+ IOCNUM_UNBIND_PPTDEV,
+ IOCNUM_MAP_PPTDEV_MMIO,
+ IOCNUM_PPTDEV_MSI,
+ IOCNUM_PPTDEV_MSIX,
+ IOCNUM_INJECT_NMI,
+ IOCNUM_VM_STATS,
+ IOCNUM_VM_STAT_DESC,
+ IOCNUM_SET_X2APIC_STATE,
+ IOCNUM_GET_X2APIC_STATE,
+};
+
+#define VM_RUN \
+ _IOWR('v', IOCNUM_RUN, struct vm_run)
+#define VM_SET_PINNING \
+ _IOW('v', IOCNUM_SET_PINNING, struct vm_pin)
+#define VM_GET_PINNING \
+ _IOWR('v', IOCNUM_GET_PINNING, struct vm_pin)
+#define VM_MAP_MEMORY \
+ _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment)
+#define VM_GET_MEMORY_SEG \
+ _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment)
+#define VM_SET_REGISTER \
+ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
+#define VM_GET_REGISTER \
+ _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register)
+#define VM_SET_SEGMENT_DESCRIPTOR \
+ _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_GET_SEGMENT_DESCRIPTOR \
+ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_INJECT_EVENT \
+ _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event)
+#define VM_LAPIC_IRQ \
+ _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
+#define VM_SET_CAPABILITY \
+ _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability)
+#define VM_GET_CAPABILITY \
+ _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability)
+#define VM_BIND_PPTDEV \
+ _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev)
+#define VM_UNBIND_PPTDEV \
+ _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev)
+#define VM_MAP_PPTDEV_MMIO \
+ _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
+#define VM_PPTDEV_MSI \
+ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
+#define VM_PPTDEV_MSIX \
+ _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix)
+#define VM_INJECT_NMI \
+ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
+#define VM_STATS \
+ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats)
+#define VM_STAT_DESC \
+ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc)
+#define VM_SET_X2APIC_STATE \
+ _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
+#define VM_GET_X2APIC_STATE \
+ _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
+#endif
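
To make the ioctl protocol concrete, here is a hedged userland sketch of
driving a single vcpu with VM_RUN. The device path /dev/vmm/<name> comes from
vmm_dev.c below; the VM name "testvm", the run_once helper, and the bare-bones
error handling are assumptions of this sketch, and real consumers typically go
through bhyve's libvmmapi rather than raw ioctls:

#include <sys/types.h>
#include <sys/ioctl.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include <fcntl.h>
#include <stdio.h>

/*
 * Sketch only: run vcpu 0 until its first VM-exit.  Assumes a VM named
 * "testvm" already exists and has memory mapped for the code at 'rip'.
 */
int
run_once(uint64_t rip)
{
	struct vm_run vmrun;
	int fd;

	fd = open("/dev/vmm/testvm", O_RDWR);
	if (fd < 0)
		return (-1);

	vmrun.cpuid = 0;
	vmrun.rip = rip;		/* start running here */
	if (ioctl(fd, VM_RUN, &vmrun) != 0)
		return (-1);

	printf("vcpu 0 exited with code %d at rip 0x%lx\n",
	    (int)vmrun.vm_exit.exitcode, vmrun.vm_exit.rip);
	return (0);
}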
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
new file mode 100644
index 0000000..4cc494b
--- /dev/null
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -0,0 +1,113 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_INSTRUCTION_EMUL_H_
+#define _VMM_INSTRUCTION_EMUL_H_
+
+/*
+ * The data structures 'vie' and 'vie_op' are meant to be opaque to the
+ * consumers of instruction decoding. The only reason why their contents
+ * need to be exposed is because they are part of the 'vm_exit' structure.
+ */
+struct vie_op {
+ uint8_t op_byte; /* actual opcode byte */
+ uint8_t op_type; /* type of operation (e.g. MOV) */
+ uint16_t op_flags;
+};
+
+#define VIE_INST_SIZE 15
+struct vie {
+ uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
+ uint8_t num_valid; /* size of the instruction */
+ uint8_t num_processed;
+
+ uint8_t rex_w:1, /* REX prefix */
+ rex_r:1,
+ rex_x:1,
+ rex_b:1;
+
+ uint8_t mod:2, /* ModRM byte */
+ reg:4,
+ rm:4;
+
+ uint8_t ss:2, /* SIB byte */
+ index:4,
+ base:4;
+
+ uint8_t disp_bytes;
+ uint8_t imm_bytes;
+
+ uint8_t scale;
+ int base_register; /* VM_REG_GUEST_xyz */
+ int index_register; /* VM_REG_GUEST_xyz */
+
+ int64_t displacement; /* optional addr displacement */
+ int64_t immediate; /* optional immediate operand */
+
+ uint8_t decoded; /* set to 1 if successfully decoded */
+
+ struct vie_op op; /* opcode description */
+};
+
+/*
+ * Callback functions to read and write memory regions.
+ */
+typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t *rval, int rsize, void *arg);
+
+typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t wval, int wsize, void *arg);
+
+/*
+ * Emulate the decoded 'vie' instruction.
+ *
+ * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
+ * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
+ * callback functions.
+ *
+ * 'void *vm' should be 'struct vm *' when called from kernel context and
+ * 'struct vmctx *' when called from user context.
+ */
+int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t mrr, mem_region_write_t mrw,
+ void *mrarg);
+
+#ifdef _KERNEL
+/*
+ * APIs to fetch and decode the instruction from nested page fault handler.
+ */
+int vmm_fetch_instruction(struct vm *vm, int cpuid,
+ uint64_t rip, int inst_length, uint64_t cr3,
+ struct vie *vie);
+
+int vmm_decode_instruction(struct vm *vm, int cpuid,
+ uint64_t gla, struct vie *vie);
+#endif /* _KERNEL */
+
+#endif /* _VMM_INSTRUCTION_EMUL_H_ */
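
For a concrete picture of the callback contract, a hypothetical device model
might back a 4KB MMIO page with an array of 32-bit registers as below. The
fake_* names are invented; only the mem_region_read_t/mem_region_write_t
signatures come from the header above, and the sketch assumes 4-byte accesses:

/* Illustrative register file backing one 4KB page of fake MMIO. */
static uint32_t fake_regs[1024];

static int
fake_mmio_read(void *vm, int cpuid, uint64_t gpa, uint64_t *rval,
    int rsize, void *arg)
{
	/* assumes rsize == 4; a real model would switch on rsize */
	*rval = fake_regs[(gpa & 0xfff) >> 2];
	return (0);
}

static int
fake_mmio_write(void *vm, int cpuid, uint64_t gpa, uint64_t wval,
    int wsize, void *arg)
{
	fake_regs[(gpa & 0xfff) >> 2] = (uint32_t)wval;
	return (0);
}

/*
 * Once vmm_fetch_instruction() and vmm_decode_instruction() have
 * populated 'vie', the faulting access is replayed through the
 * callbacks:
 *
 *	error = vmm_emulate_instruction(vm, cpuid, gpa, vie,
 *	    fake_mmio_read, fake_mmio_write, NULL);
 */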
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
new file mode 100644
index 0000000..dc071d3
--- /dev/null
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -0,0 +1,265 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/smp.h>
+
+#include <machine/vmm.h>
+#include "io/iommu.h"
+
+static int
+amdv_init(void)
+{
+
+ printf("amdv_init: not implemented\n");
+ return (ENXIO);
+}
+
+static int
+amdv_cleanup(void)
+{
+
+ printf("amdv_cleanup: not implemented\n");
+ return (ENXIO);
+}
+
+static void *
+amdv_vminit(struct vm *vm)
+{
+
+ printf("amdv_vminit: not implemented\n");
+ return (NULL);
+}
+
+static int
+amdv_vmrun(void *arg, int vcpu, register_t rip)
+{
+
+ printf("amdv_vmrun: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amdv_vmcleanup(void *arg)
+{
+
+ printf("amdv_vmcleanup: not implemented\n");
+ return;
+}
+
+static int
+amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+
+ printf("amdv_vmmmap_set: not implemented\n");
+ return (EINVAL);
+}
+
+static vm_paddr_t
+amdv_vmmmap_get(void *arg, vm_paddr_t gpa)
+{
+
+ printf("amdv_vmmmap_get: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
+{
+
+ printf("amdv_getreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val)
+{
+
+ printf("amdv_setreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_inject_event(void *vmi, int vcpu, int type, int vector,
+ uint32_t error_code, int error_code_valid)
+{
+
+ printf("amdv_inject_event: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getcap(void *arg, int vcpu, int type, int *retval)
+{
+
+ printf("amdv_getcap: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setcap(void *arg, int vcpu, int type, int val)
+{
+
+ printf("amdv_setcap: not implemented\n");
+ return (EINVAL);
+}
+
+struct vmm_ops vmm_ops_amd = {
+ amdv_init,
+ amdv_cleanup,
+ amdv_vminit,
+ amdv_vmrun,
+ amdv_vmcleanup,
+ amdv_vmmmap_set,
+ amdv_vmmmap_get,
+ amdv_getreg,
+ amdv_setreg,
+ amdv_getdesc,
+ amdv_setdesc,
+ amdv_inject_event,
+ amdv_getcap,
+ amdv_setcap
+};
+
+static int
+amd_iommu_init(void)
+{
+
+ printf("amd_iommu_init: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amd_iommu_cleanup(void)
+{
+
+ printf("amd_iommu_cleanup: not implemented\n");
+}
+
+static void
+amd_iommu_enable(void)
+{
+
+ printf("amd_iommu_enable: not implemented\n");
+}
+
+static void
+amd_iommu_disable(void)
+{
+
+ printf("amd_iommu_disable: not implemented\n");
+}
+
+static void *
+amd_iommu_create_domain(vm_paddr_t maxaddr)
+{
+
+ printf("amd_iommu_create_domain: not implemented\n");
+ return (NULL);
+}
+
+static void
+amd_iommu_destroy_domain(void *domain)
+{
+
+ printf("amd_iommu_destroy_domain: not implemented\n");
+}
+
+static uint64_t
+amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa,
+ uint64_t len)
+{
+
+ printf("amd_iommu_create_mapping: not implemented\n");
+ return (0);
+}
+
+static uint64_t
+amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len)
+{
+
+ printf("amd_iommu_remove_mapping: not implemented\n");
+ return (0);
+}
+
+static void
+amd_iommu_add_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_add_device: not implemented\n");
+}
+
+static void
+amd_iommu_remove_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_remove_device: not implemented\n");
+}
+
+static void
+amd_iommu_invalidate_tlb(void *domain)
+{
+
+ printf("amd_iommu_invalidate_tlb: not implemented\n");
+}
+
+struct iommu_ops iommu_ops_amd = {
+ amd_iommu_init,
+ amd_iommu_cleanup,
+ amd_iommu_enable,
+ amd_iommu_disable,
+ amd_iommu_create_domain,
+ amd_iommu_destroy_domain,
+ amd_iommu_create_mapping,
+ amd_iommu_remove_mapping,
+ amd_iommu_add_device,
+ amd_iommu_remove_device,
+ amd_iommu_invalidate_tlb,
+};
diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c
new file mode 100644
index 0000000..4f91601
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.c
@@ -0,0 +1,392 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/param.h>
+#include <machine/cpufunc.h>
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmx_cpufunc.h"
+#include "vmx_msr.h"
+#include "vmx.h"
+#include "ept.h"
+
+#define EPT_PWL4(cap) ((cap) & (1UL << 6))
+#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
+#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
+#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
+#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
+#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
+
+#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
+#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
+
+#define INVEPT_ALL_TYPES_MASK 0x6000000UL
+#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
+
+#define EPT_PG_RD (1 << 0)
+#define EPT_PG_WR (1 << 1)
+#define EPT_PG_EX (1 << 2)
+#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
+#define EPT_PG_IGNORE_PAT (1 << 6)
+#define EPT_PG_SUPERPAGE (1 << 7)
+
+#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
+
+MALLOC_DECLARE(M_VMX);
+
+static uint64_t page_sizes_mask;
+
+int
+ept_init(void)
+{
+ int page_shift;
+ uint64_t cap;
+
+ cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
+
+ /*
+ * Verify that:
+ * - page walk length is 4 steps
+ * - extended page tables can be laid out in write-back memory
+ * - invvpid instruction with all possible types is supported
+ * - invept instruction with all possible types is supported
+ */
+ if (!EPT_PWL4(cap) ||
+ !EPT_MEMORY_TYPE_WB(cap) ||
+ !INVVPID_SUPPORTED(cap) ||
+ !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
+ !INVEPT_SUPPORTED(cap) ||
+ !INVEPT_ALL_TYPES_SUPPORTED(cap))
+ return (EINVAL);
+
+ /* Set bits in 'page_sizes_mask' for each valid page size */
+ page_shift = PAGE_SHIFT;
+ page_sizes_mask = 1UL << page_shift; /* 4KB page */
+
+ page_shift += 9;
+ if (EPT_PDE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
+
+ page_shift += 9;
+ if (EPT_PDPTE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
+
+ return (0);
+}
+
+#if 0
+static void
+ept_dump(uint64_t *ptp, int nlevels)
+{
+ int i, t, tabs;
+ uint64_t *ptpnext, ptpval;
+
+ if (--nlevels < 0)
+ return;
+
+ tabs = 3 - nlevels;
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("PTP = %p\n", ptp);
+
+ for (i = 0; i < 512; i++) {
+ ptpval = ptp[i];
+
+ if (ptpval == 0)
+ continue;
+
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("%3d 0x%016lx\n", i, ptpval);
+
+ if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
+ ptpnext = (uint64_t *)
+ PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+ ept_dump(ptpnext, nlevels);
+ }
+ }
+}
+#endif
+
+static size_t
+ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
+{
+ int spshift, ptpshift, ptpindex, nlevels;
+
+ /*
+ * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - super page sizes supported by the processor
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'len'
+ */
+ spshift = PAGE_SHIFT;
+ if (spok)
+ spshift += (EPT_PWLEVELS - 1) * 9;
+ while (spshift >= PAGE_SHIFT) {
+ uint64_t spsize = 1UL << spshift;
+ if ((page_sizes_mask & spsize) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ length >= spsize) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ if (spshift < PAGE_SHIFT) {
+ panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
+ "length 0x%016lx, page_sizes_mask 0x%016lx",
+ gpa, hpa, length, page_sizes_mask);
+ }
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) {
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift)
+ break;
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create the next level page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
+ ptp[ptpindex] = vtophys(nlp);
+ ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
+ }
+
+ /* Work our way down to the next level page table page */
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
+ panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
+ "mismatch\n", gpa, ptpshift);
+ }
+
+ if (prot != VM_PROT_NONE) {
+ /* Do the mapping */
+ ptp[ptpindex] = hpa;
+
+ /* Apply the access controls */
+ if (prot & VM_PROT_READ)
+ ptp[ptpindex] |= EPT_PG_RD;
+ if (prot & VM_PROT_WRITE)
+ ptp[ptpindex] |= EPT_PG_WR;
+ if (prot & VM_PROT_EXECUTE)
+ ptp[ptpindex] |= EPT_PG_EX;
+
+ /*
+ * XXX should we enforce this memory type by setting the
+ * ignore PAT bit to 1.
+ */
+ ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
+
+ if (nlevels > 0)
+ ptp[ptpindex] |= EPT_PG_SUPERPAGE;
+ } else {
+ /* Remove the mapping */
+ ptp[ptpindex] = 0;
+ }
+
+ return (1UL << ptpshift);
+}
+
+static vm_paddr_t
+ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
+{
+ int nlevels, ptpshift, ptpindex;
+ uint64_t ptpval, hpabase, pgmask;
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) {
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ ptpval = ptp[ptpindex];
+
+ /* Cannot make progress beyond this point */
+ if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
+ break;
+
+ if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
+ pgmask = (1UL << ptpshift) - 1;
+ hpabase = ptpval & ~pgmask;
+ return (hpabase | (gpa & pgmask));
+ }
+
+ /* Work our way down to the next level page table page */
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+ }
+
+ return ((vm_paddr_t)-1);
+}
+
+static void
+ept_free_pt_entry(pt_entry_t pte)
+{
+ if (pte == 0)
+ return;
+
+ /* sanity check */
+ if ((pte & EPT_PG_SUPERPAGE) != 0)
+ panic("ept_free_pt_entry: pte cannot have superpage bit");
+
+ return;
+}
+
+static void
+ept_free_pd_entry(pd_entry_t pde)
+{
+ pt_entry_t *pt;
+ int i;
+
+ if (pde == 0)
+ return;
+
+ if ((pde & EPT_PG_SUPERPAGE) == 0) {
+ pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
+ for (i = 0; i < NPTEPG; i++)
+ ept_free_pt_entry(pt[i]);
+ free(pt, M_VMX); /* free the page table page */
+ }
+}
+
+static void
+ept_free_pdp_entry(pdp_entry_t pdpe)
+{
+ pd_entry_t *pd;
+ int i;
+
+ if (pdpe == 0)
+ return;
+
+ if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
+ pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
+ for (i = 0; i < NPDEPG; i++)
+ ept_free_pd_entry(pd[i]);
+ free(pd, M_VMX); /* free the page directory page */
+ }
+}
+
+static void
+ept_free_pml4_entry(pml4_entry_t pml4e)
+{
+ pdp_entry_t *pdp;
+ int i;
+
+ if (pml4e == 0)
+ return;
+
+ if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
+ for (i = 0; i < NPDPEPG; i++)
+ ept_free_pdp_entry(pdp[i]);
+ free(pdp, M_VMX); /* free the page directory ptr page */
+ }
+}
+
+void
+ept_vmcleanup(struct vmx *vmx)
+{
+ int i;
+
+ for (i = 0; i < NPML4EPG; i++)
+ ept_free_pml4_entry(vmx->pml4ept[i]);
+}
+
+int
+ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+ size_t n;
+ struct vmx *vmx = arg;
+
+ while (len > 0) {
+ n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
+ prot, spok);
+ len -= n;
+ gpa += n;
+ hpa += n;
+ }
+
+ return (0);
+}
+
+vm_paddr_t
+ept_vmmmap_get(void *arg, vm_paddr_t gpa)
+{
+ vm_paddr_t hpa;
+ struct vmx *vmx;
+
+ vmx = arg;
+ hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
+ return (hpa);
+}
+
+static void
+invept_single_context(void *arg)
+{
+ struct invept_desc desc = *(struct invept_desc *)arg;
+
+ invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
+}
+
+void
+ept_invalidate_mappings(u_long pml4ept)
+{
+ struct invept_desc invept_desc = { 0 };
+
+ invept_desc.eptp = EPTP(pml4ept);
+
+ smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
+}
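
The shift arithmetic in ept_create_mapping() and ept_lookup_mapping() is
easier to follow with concrete numbers: each of the four levels consumes 9
index bits above the 12-bit page offset. A standalone sketch of the same walk,
runnable in userland (PAGE_SHIFT and EPT_PWLEVELS are copied here only so the
example is self-contained):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define EPT_PWLEVELS	4	/* page walk levels, as in ept.h */

int
main(void)
{
	uint64_t gpa = 0x12345678000UL;	/* arbitrary guest-physical addr */
	int nlevels, ptpshift, ptpindex;

	/* Same loop structure as ept_create_mapping(). */
	for (nlevels = EPT_PWLEVELS - 1; nlevels >= 0; nlevels--) {
		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gpa >> ptpshift) & 0x1FF;
		printf("level %d: shift %2d, index %3d\n",
		    nlevels, ptpshift, ptpindex);
	}
	return (0);
}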
diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h
new file mode 100644
index 0000000..2d7258d
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _EPT_H_
+#define _EPT_H_
+
+struct vmx;
+
+#define EPT_PWLEVELS 4 /* page walk levels */
+#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
+
+int ept_init(void);
+int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
+vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa);
+void ept_invalidate_mappings(u_long ept_pml4);
+void ept_vmcleanup(struct vmx *vmx);
+#endif
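
For a concrete value of the EPTP() macro: bits 2:0 hold the memory type of
the paging structures and bits 5:3 hold the walk length minus one, so with a
4-level walk and write-back structures (PAT_WRITE_BACK is 0x06 in
machine/specialreg.h) EPTP(pml4) evaluates to pml4 | (3 << 3) | 6, i.e.
pml4 | 0x1e.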
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
new file mode 100644
index 0000000..a5784dd
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -0,0 +1,551 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/pcpu.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/segments.h>
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmcs.h"
+#include "vmx_cpufunc.h"
+#include "ept.h"
+#include "vmx.h"
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+static uint64_t
+vmcs_fix_regval(uint32_t encoding, uint64_t val)
+{
+
+ switch (encoding) {
+ case VMCS_GUEST_CR0:
+ val = vmx_fix_cr0(val);
+ break;
+ case VMCS_GUEST_CR4:
+ val = vmx_fix_cr4(val);
+ break;
+ default:
+ break;
+ }
+ return (val);
+}
+
+static uint32_t
+vmcs_field_encoding(int ident)
+{
+ switch (ident) {
+ case VM_REG_GUEST_CR0:
+ return (VMCS_GUEST_CR0);
+ case VM_REG_GUEST_CR3:
+ return (VMCS_GUEST_CR3);
+ case VM_REG_GUEST_CR4:
+ return (VMCS_GUEST_CR4);
+ case VM_REG_GUEST_DR7:
+ return (VMCS_GUEST_DR7);
+ case VM_REG_GUEST_RSP:
+ return (VMCS_GUEST_RSP);
+ case VM_REG_GUEST_RIP:
+ return (VMCS_GUEST_RIP);
+ case VM_REG_GUEST_RFLAGS:
+ return (VMCS_GUEST_RFLAGS);
+ case VM_REG_GUEST_ES:
+ return (VMCS_GUEST_ES_SELECTOR);
+ case VM_REG_GUEST_CS:
+ return (VMCS_GUEST_CS_SELECTOR);
+ case VM_REG_GUEST_SS:
+ return (VMCS_GUEST_SS_SELECTOR);
+ case VM_REG_GUEST_DS:
+ return (VMCS_GUEST_DS_SELECTOR);
+ case VM_REG_GUEST_FS:
+ return (VMCS_GUEST_FS_SELECTOR);
+ case VM_REG_GUEST_GS:
+ return (VMCS_GUEST_GS_SELECTOR);
+ case VM_REG_GUEST_TR:
+ return (VMCS_GUEST_TR_SELECTOR);
+ case VM_REG_GUEST_LDTR:
+ return (VMCS_GUEST_LDTR_SELECTOR);
+ case VM_REG_GUEST_EFER:
+ return (VMCS_GUEST_IA32_EFER);
+ default:
+ return (-1);
+ }
+
+}
+
+static int
+vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc)
+{
+
+ switch (seg) {
+ case VM_REG_GUEST_ES:
+ *base = VMCS_GUEST_ES_BASE;
+ *lim = VMCS_GUEST_ES_LIMIT;
+ *acc = VMCS_GUEST_ES_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_CS:
+ *base = VMCS_GUEST_CS_BASE;
+ *lim = VMCS_GUEST_CS_LIMIT;
+ *acc = VMCS_GUEST_CS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_SS:
+ *base = VMCS_GUEST_SS_BASE;
+ *lim = VMCS_GUEST_SS_LIMIT;
+ *acc = VMCS_GUEST_SS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_DS:
+ *base = VMCS_GUEST_DS_BASE;
+ *lim = VMCS_GUEST_DS_LIMIT;
+ *acc = VMCS_GUEST_DS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_FS:
+ *base = VMCS_GUEST_FS_BASE;
+ *lim = VMCS_GUEST_FS_LIMIT;
+ *acc = VMCS_GUEST_FS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_GS:
+ *base = VMCS_GUEST_GS_BASE;
+ *lim = VMCS_GUEST_GS_LIMIT;
+ *acc = VMCS_GUEST_GS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_TR:
+ *base = VMCS_GUEST_TR_BASE;
+ *lim = VMCS_GUEST_TR_LIMIT;
+ *acc = VMCS_GUEST_TR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_LDTR:
+ *base = VMCS_GUEST_LDTR_BASE;
+ *lim = VMCS_GUEST_LDTR_LIMIT;
+ *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_IDTR:
+ *base = VMCS_GUEST_IDTR_BASE;
+ *lim = VMCS_GUEST_IDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ case VM_REG_GUEST_GDTR:
+ *base = VMCS_GUEST_GDTR_BASE;
+ *lim = VMCS_GUEST_GDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+int
+vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval)
+{
+ int error;
+ uint32_t encoding;
+
+ /*
+ * If we need to get at vmx-specific state in the VMCS we can bypass
+ * the translation of 'ident' to 'encoding' by simply setting the
+ * sign bit. As it so happens the upper 16 bits are reserved (i.e.
+ * set to 0) in the encodings for the VMCS so we are free to use the
+ * sign bit.
+ */
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ VMPTRLD(vmcs);
+ error = vmread(encoding, retval);
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val)
+{
+ int error;
+ uint32_t encoding;
+
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ val = vmcs_fix_regval(encoding, val);
+
+ VMPTRLD(vmcs);
+ error = vmwrite(encoding, val);
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_setdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs);
+ if ((error = vmwrite(base, desc->base)) != 0)
+ goto done;
+
+ if ((error = vmwrite(limit, desc->limit)) != 0)
+ goto done;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmwrite(access, desc->access)) != 0)
+ goto done;
+ }
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+ uint64_t u64;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_getdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs);
+ if ((error = vmread(base, &u64)) != 0)
+ goto done;
+ desc->base = u64;
+
+ if ((error = vmread(limit, &u64)) != 0)
+ goto done;
+ desc->limit = u64;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmread(access, &u64)) != 0)
+ goto done;
+ desc->access = u64;
+ }
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
+{
+ int error;
+
+ VMPTRLD(vmcs);
+
+ /*
+ * Guest MSRs are saved in the VM-exit MSR-store area.
+ * Guest MSRs are loaded from the VM-entry MSR-load area.
+ * Both areas point to the same location in memory.
+ */
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0)
+ goto done;
+
+ error = 0;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_set_defaults(struct vmcs *vmcs,
+ u_long host_rip, u_long host_rsp, u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
+{
+ int error, codesel, datasel, tsssel;
+ u_long cr0, cr4, efer;
+ uint64_t eptp, pat, fsbase, idtrbase;
+ uint32_t exc_bitmap;
+
+ codesel = vmm_get_host_codesel();
+ datasel = vmm_get_host_datasel();
+ tsssel = vmm_get_host_tsssel();
+
+ /*
+ * Make sure we have a "current" VMCS to work with.
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * Load the VMX controls
+ */
+ if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0)
+ goto done;
+
+ /* Guest state */
+
+ /* Initialize guest IA32_PAT MSR with the default value */
+ pat = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+ if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Host state */
+
+ /* Initialize host IA32_PAT MSR */
+ pat = vmm_get_host_pat();
+ if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Load the IA32_EFER MSR */
+ efer = vmm_get_host_efer();
+ if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0)
+ goto done;
+
+ /* Load the control registers */
+
+ cr0 = vmm_get_host_cr0();
+ if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0)
+ goto done;
+
+ cr4 = vmm_get_host_cr4() | CR4_VMXE;
+ if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0)
+ goto done;
+
+ /* Load the segment selectors */
+ if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0)
+ goto done;
+
+ /*
+ * Load the Base-Address for %fs and idtr.
+ *
+ * Note that we exclude %gs, tss and gdtr here because their base
+ * address is pcpu specific.
+ */
+ fsbase = vmm_get_host_fsbase();
+ if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0)
+ goto done;
+
+ idtrbase = vmm_get_host_idtrbase();
+ if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0)
+ goto done;
+
+ /* instruction pointer */
+ if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0)
+ goto done;
+
+ /* stack pointer */
+ if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0)
+ goto done;
+
+ /* eptp */
+ eptp = EPTP(ept_pml4);
+ if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
+ goto done;
+
+ /* vpid */
+ if ((error = vmwrite(VMCS_VPID, vpid)) != 0)
+ goto done;
+
+ /* msr bitmap */
+ if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0)
+ goto done;
+
+ /* exception bitmap */
+ exc_bitmap = 1 << IDT_MC;
+ if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0)
+ goto done;
+
+ /* link pointer */
+ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0)
+ goto done;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+uint64_t
+vmcs_read(uint32_t encoding)
+{
+ int error;
+ uint64_t val;
+
+ error = vmread(encoding, &val);
+ if (error != 0)
+ panic("vmcs_read(%u) error %d", encoding, error);
+
+ return (val);
+}
+
+#ifdef DDB
+extern int vmxon_enabled[];
+
+DB_SHOW_COMMAND(vmcs, db_show_vmcs)
+{
+ uint64_t cur_vmcs, val;
+ uint32_t exit;
+
+ if (!vmxon_enabled[curcpu]) {
+ db_printf("VMX not enabled\n");
+ return;
+ }
+
+ if (have_addr) {
+ db_printf("Only current VMCS supported\n");
+ return;
+ }
+
+ vmptrst(&cur_vmcs);
+ if (cur_vmcs == VMCS_INITIAL) {
+ db_printf("No current VM context\n");
+ return;
+ }
+ db_printf("VMCS: %jx\n", cur_vmcs);
+ db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID));
+ db_printf("Activity: ");
+ val = vmcs_read(VMCS_GUEST_ACTIVITY);
+ switch (val) {
+ case 0:
+ db_printf("Active");
+ break;
+ case 1:
+ db_printf("HLT");
+ break;
+ case 2:
+ db_printf("Shutdown");
+ break;
+ case 3:
+ db_printf("Wait for SIPI");
+ break;
+ default:
+ db_printf("Unknown: %#lx", val);
+ }
+ db_printf("\n");
+ exit = vmcs_read(VMCS_EXIT_REASON);
+ if (exit & 0x80000000)
+ db_printf("Entry Failure Reason: %u\n", exit & 0xffff);
+ else
+ db_printf("Exit Reason: %u\n", exit & 0xffff);
+ db_printf("Qualification: %#lx\n", vmcs_exit_qualification());
+ db_printf("Guest Linear Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_LINEAR_ADDRESS));
+ switch (exit & 0x8000ffff) {
+ case EXIT_REASON_EXCEPTION:
+ case EXIT_REASON_EXT_INTR:
+ val = vmcs_read(VMCS_EXIT_INTERRUPTION_INFO);
+ db_printf("Interrupt Type: ");
+ switch (val >> 8 & 0x7) {
+ case 0:
+ db_printf("external");
+ break;
+ case 2:
+ db_printf("NMI");
+ break;
+ case 3:
+ db_printf("HW exception");
+ break;
+ case 4:
+ db_printf("SW exception");
+ break;
+ default:
+ db_printf("?? %lu", val >> 8 & 0x7);
+ break;
+ }
+ db_printf(" Vector: %lu", val & 0xff);
+ if (val & 0x800)
+ db_printf(" Error Code: %lx",
+ vmcs_read(VMCS_EXIT_INTERRUPTION_ERROR));
+ db_printf("\n");
+ break;
+ case EXIT_REASON_EPT_FAULT:
+ case EXIT_REASON_EPT_MISCONFIG:
+ db_printf("Guest Physical Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS));
+ break;
+ }
+ db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error());
+}
+#endif
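
The getreg/setreg pair above brackets every access with VMPTRLD/VMCLEAR, so
callers can treat the VMCS as passive state. A hedged sketch of the usual
pattern, advancing the guest %rip past an instruction that was emulated
(skip_emulated_instruction is an invented name; 'inst_length' would come from
the vm_exit):

static int
skip_emulated_instruction(struct vmcs *vmcs, int inst_length)
{
	uint64_t rip;
	int error;

	/* vmcs_getreg() maps VM_REG_GUEST_RIP to VMCS_GUEST_RIP */
	error = vmcs_getreg(vmcs, VM_REG_GUEST_RIP, &rip);
	if (error == 0)
		error = vmcs_setreg(vmcs, VM_REG_GUEST_RIP,
		    rip + inst_length);
	return (error);
}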
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
new file mode 100644
index 0000000..f39eed2
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -0,0 +1,338 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMCS_H_
+#define _VMCS_H_
+
+#ifdef _KERNEL
+struct vmcs {
+ uint32_t identifier;
+ uint32_t abort_code;
+ char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2];
+};
+CTASSERT(sizeof(struct vmcs) == PAGE_SIZE);
+
+/* MSR save region is composed of an array of 'struct msr_entry' */
+struct msr_entry {
+ uint32_t index;
+ uint32_t reserved;
+ uint64_t val;
+};
+
+int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
+int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
+ u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap,
+ uint16_t vpid);
+int vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval);
+int vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val);
+int vmcs_getdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+int vmcs_setdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+uint64_t vmcs_read(uint32_t encoding);
+
+#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH)
+#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP)
+#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR)
+#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff)
+#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
+#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
+#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
+#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
+
+#endif /* _KERNEL */
+
+#define VMCS_INITIAL 0xffffffffffffffff
+
+#define VMCS_IDENT(encoding) ((encoding) | 0x80000000)
+/*
+ * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B.
+ */
+#define VMCS_INVALID_ENCODING 0xffffffff
+
+/* 16-bit control fields */
+#define VMCS_VPID 0x00000000
+
+/* 16-bit guest-state fields */
+#define VMCS_GUEST_ES_SELECTOR 0x00000800
+#define VMCS_GUEST_CS_SELECTOR 0x00000802
+#define VMCS_GUEST_SS_SELECTOR 0x00000804
+#define VMCS_GUEST_DS_SELECTOR 0x00000806
+#define VMCS_GUEST_FS_SELECTOR 0x00000808
+#define VMCS_GUEST_GS_SELECTOR 0x0000080A
+#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C
+#define VMCS_GUEST_TR_SELECTOR 0x0000080E
+
+/* 16-bit host-state fields */
+#define VMCS_HOST_ES_SELECTOR 0x00000C00
+#define VMCS_HOST_CS_SELECTOR 0x00000C02
+#define VMCS_HOST_SS_SELECTOR 0x00000C04
+#define VMCS_HOST_DS_SELECTOR 0x00000C06
+#define VMCS_HOST_FS_SELECTOR 0x00000C08
+#define VMCS_HOST_GS_SELECTOR 0x00000C0A
+#define VMCS_HOST_TR_SELECTOR 0x00000C0C
+
+/* 64-bit control fields */
+#define VMCS_IO_BITMAP_A 0x00002000
+#define VMCS_IO_BITMAP_B 0x00002002
+#define VMCS_MSR_BITMAP 0x00002004
+#define VMCS_EXIT_MSR_STORE 0x00002006
+#define VMCS_EXIT_MSR_LOAD 0x00002008
+#define VMCS_ENTRY_MSR_LOAD 0x0000200A
+#define VMCS_EXECUTIVE_VMCS 0x0000200C
+#define VMCS_TSC_OFFSET 0x00002010
+#define VMCS_VIRTUAL_APIC 0x00002012
+#define VMCS_APIC_ACCESS 0x00002014
+#define VMCS_EPTP 0x0000201A
+
+/* 64-bit read-only fields */
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
+
+/* 64-bit guest-state fields */
+#define VMCS_LINK_POINTER 0x00002800
+#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802
+#define VMCS_GUEST_IA32_PAT 0x00002804
+#define VMCS_GUEST_IA32_EFER 0x00002806
+#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808
+#define VMCS_GUEST_PDPTE0 0x0000280A
+#define VMCS_GUEST_PDPTE1 0x0000280C
+#define VMCS_GUEST_PDPTE2 0x0000280E
+#define VMCS_GUEST_PDPTE3 0x00002810
+
+/* 64-bit host-state fields */
+#define VMCS_HOST_IA32_PAT 0x00002C00
+#define VMCS_HOST_IA32_EFER 0x00002C02
+#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04
+
+/* 32-bit control fields */
+#define VMCS_PIN_BASED_CTLS 0x00004000
+#define VMCS_PRI_PROC_BASED_CTLS 0x00004002
+#define VMCS_EXCEPTION_BITMAP 0x00004004
+#define VMCS_PF_ERROR_MASK 0x00004006
+#define VMCS_PF_ERROR_MATCH 0x00004008
+#define VMCS_CR3_TARGET_COUNT 0x0000400A
+#define VMCS_EXIT_CTLS 0x0000400C
+#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E
+#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010
+#define VMCS_ENTRY_CTLS 0x00004012
+#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014
+#define VMCS_ENTRY_INTR_INFO 0x00004016
+#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018
+#define VMCS_ENTRY_INST_LENGTH 0x0000401A
+#define VMCS_TPR_THRESHOLD 0x0000401C
+#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E
+#define VMCS_PLE_GAP 0x00004020
+#define VMCS_PLE_WINDOW 0x00004022
+
+/* 32-bit read-only data fields */
+#define VMCS_INSTRUCTION_ERROR 0x00004400
+#define VMCS_EXIT_REASON 0x00004402
+#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404
+#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406
+#define VMCS_IDT_VECTORING_INFO 0x00004408
+#define VMCS_IDT_VECTORING_ERROR 0x0000440A
+#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C
+#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E
+
+/* 32-bit guest-state fields */
+#define VMCS_GUEST_ES_LIMIT 0x00004800
+#define VMCS_GUEST_CS_LIMIT 0x00004802
+#define VMCS_GUEST_SS_LIMIT 0x00004804
+#define VMCS_GUEST_DS_LIMIT 0x00004806
+#define VMCS_GUEST_FS_LIMIT 0x00004808
+#define VMCS_GUEST_GS_LIMIT 0x0000480A
+#define VMCS_GUEST_LDTR_LIMIT 0x0000480C
+#define VMCS_GUEST_TR_LIMIT 0x0000480E
+#define VMCS_GUEST_GDTR_LIMIT 0x00004810
+#define VMCS_GUEST_IDTR_LIMIT 0x00004812
+#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814
+#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816
+#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818
+#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A
+#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C
+#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E
+#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820
+#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822
+#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824
+#define VMCS_GUEST_ACTIVITY 0x00004826
+#define VMCS_GUEST_SMBASE 0x00004828
+#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A
+#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E
+
+/* 32-bit host state fields */
+#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00
+
+/* Natural Width control fields */
+#define VMCS_CR0_MASK 0x00006000
+#define VMCS_CR4_MASK 0x00006002
+#define VMCS_CR0_SHADOW 0x00006004
+#define VMCS_CR4_SHADOW 0x00006006
+#define VMCS_CR3_TARGET0 0x00006008
+#define VMCS_CR3_TARGET1 0x0000600A
+#define VMCS_CR3_TARGET2 0x0000600C
+#define VMCS_CR3_TARGET3 0x0000600E
+
+/* Natural Width read-only fields */
+#define VMCS_EXIT_QUALIFICATION 0x00006400
+#define VMCS_IO_RCX 0x00006402
+#define VMCS_IO_RSI 0x00006404
+#define VMCS_IO_RDI 0x00006406
+#define VMCS_IO_RIP 0x00006408
+#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A
+
+/* Natural Width guest-state fields */
+#define VMCS_GUEST_CR0 0x00006800
+#define VMCS_GUEST_CR3 0x00006802
+#define VMCS_GUEST_CR4 0x00006804
+#define VMCS_GUEST_ES_BASE 0x00006806
+#define VMCS_GUEST_CS_BASE 0x00006808
+#define VMCS_GUEST_SS_BASE 0x0000680A
+#define VMCS_GUEST_DS_BASE 0x0000680C
+#define VMCS_GUEST_FS_BASE 0x0000680E
+#define VMCS_GUEST_GS_BASE 0x00006810
+#define VMCS_GUEST_LDTR_BASE 0x00006812
+#define VMCS_GUEST_TR_BASE 0x00006814
+#define VMCS_GUEST_GDTR_BASE 0x00006816
+#define VMCS_GUEST_IDTR_BASE 0x00006818
+#define VMCS_GUEST_DR7 0x0000681A
+#define VMCS_GUEST_RSP 0x0000681C
+#define VMCS_GUEST_RIP 0x0000681E
+#define VMCS_GUEST_RFLAGS 0x00006820
+#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822
+#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824
+#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826
+
+/* Natural Width host-state fields */
+#define VMCS_HOST_CR0 0x00006C00
+#define VMCS_HOST_CR3 0x00006C02
+#define VMCS_HOST_CR4 0x00006C04
+#define VMCS_HOST_FS_BASE 0x00006C06
+#define VMCS_HOST_GS_BASE 0x00006C08
+#define VMCS_HOST_TR_BASE 0x00006C0A
+#define VMCS_HOST_GDTR_BASE 0x00006C0C
+#define VMCS_HOST_IDTR_BASE 0x00006C0E
+#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10
+#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12
+#define VMCS_HOST_RSP 0x00006C14
+#define VMCS_HOST_RIP 0x00006C16
+
+/*
+ * VM instruction error numbers
+ */
+#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5
+
+/*
+ * VMCS exit reasons
+ */
+#define EXIT_REASON_EXCEPTION 0
+#define EXIT_REASON_EXT_INTR 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+#define EXIT_REASON_INIT 3
+#define EXIT_REASON_SIPI 4
+#define EXIT_REASON_IO_SMI 5
+#define EXIT_REASON_SMI 6
+#define EXIT_REASON_INTR_WINDOW 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_GETSEC 11
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_RSM 17
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMXOFF 26
+#define EXIT_REASON_VMXON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_INOUT 30
+#define EXIT_REASON_RDMSR 31
+#define EXIT_REASON_WRMSR 32
+#define EXIT_REASON_INVAL_VMCS 33
+#define EXIT_REASON_INVAL_MSR 34
+#define EXIT_REASON_MWAIT 36
+#define EXIT_REASON_MTF 37
+#define EXIT_REASON_MONITOR 39
+#define EXIT_REASON_PAUSE 40
+#define EXIT_REASON_MCE 41
+#define EXIT_REASON_TPR 43
+#define EXIT_REASON_APIC 44
+#define EXIT_REASON_GDTR_IDTR 46
+#define EXIT_REASON_LDTR_TR 47
+#define EXIT_REASON_EPT_FAULT 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_VMX_PREEMPT 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+
+/*
+ * VMCS interrupt information fields
+ */
+#define VMCS_INTERRUPTION_INFO_VALID (1U << 31)
+#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
+#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
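+/*
+ * Illustrative usage, as in vmx_inject_nmi(): an event is injected by
+ * programming the VM-entry interruption information field, e.g.
+ *
+ * info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID | IDT_NMI;
+ * vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ */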
+
+/*
+ * VMCS Guest interruptibility field
+ */
+#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0)
+#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1)
+#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2)
+#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3)
+
+/*
+ * Exit qualification for EXIT_REASON_INVAL_VMCS
+ */
+#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3
+
+/*
+ * Exit qualification for EPT violation
+ */
+#define EPT_VIOLATION_DATA_READ (1UL << 0)
+#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
+#define EPT_VIOLATION_INST_FETCH (1UL << 2)
+#define EPT_VIOLATION_GLA_VALID (1UL << 7)
+#define EPT_VIOLATION_XLAT_VALID (1UL << 8)
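+/*
+ * Illustrative usage, as in vmx_ept_fault(): the fault direction is
+ * recovered from the exit qualification, e.g.
+ *
+ * read = (ept_qual & EPT_VIOLATION_DATA_READ) ? 1 : 0;
+ * write = (ept_qual & EPT_VIOLATION_DATA_WRITE) ? 1 : 0;
+ *
+ * The guest-linear address is meaningful only if EPT_VIOLATION_GLA_VALID
+ * is also set in the qualification.
+ */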
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
new file mode 100644
index 0000000..4f267bb
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -0,0 +1,1845 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/psl.h>
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/pmap.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+#include <machine/vmparam.h>
+
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmm_lapic.h"
+#include "vmm_msr.h"
+#include "vmm_ktr.h"
+#include "vmm_stat.h"
+
+#include "vmx_msr.h"
+#include "ept.h"
+#include "vmx_cpufunc.h"
+#include "vmx.h"
+#include "x86.h"
+#include "vmx_controls.h"
+
+#define PINBASED_CTLS_ONE_SETTING \
+ (PINBASED_EXTINT_EXITING | \
+ PINBASED_NMI_EXITING | \
+ PINBASED_VIRTUAL_NMI)
+#define PINBASED_CTLS_ZERO_SETTING 0
+
+#define PROCBASED_CTLS_WINDOW_SETTING \
+ (PROCBASED_INT_WINDOW_EXITING | \
+ PROCBASED_NMI_WINDOW_EXITING)
+
+#define PROCBASED_CTLS_ONE_SETTING \
+ (PROCBASED_SECONDARY_CONTROLS | \
+ PROCBASED_IO_EXITING | \
+ PROCBASED_MSR_BITMAPS | \
+ PROCBASED_CTLS_WINDOW_SETTING)
+#define PROCBASED_CTLS_ZERO_SETTING \
+ (PROCBASED_CR3_LOAD_EXITING | \
+ PROCBASED_CR3_STORE_EXITING | \
+ PROCBASED_IO_BITMAPS)
+
+#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT
+#define PROCBASED_CTLS2_ZERO_SETTING 0
+
+#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \
+ (VM_EXIT_HOST_LMA | \
+ VM_EXIT_SAVE_EFER | \
+ VM_EXIT_LOAD_EFER)
+
+#define VM_EXIT_CTLS_ONE_SETTING \
+ (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \
+ VM_EXIT_SAVE_PAT | \
+ VM_EXIT_LOAD_PAT)
+#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS
+
+#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER
+
+#define VM_ENTRY_CTLS_ONE_SETTING \
+ (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \
+ VM_ENTRY_LOAD_PAT)
+#define VM_ENTRY_CTLS_ZERO_SETTING \
+ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_INTO_SMM | \
+ VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
+
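+/*
+ * Give the guest direct read/write access to 'msr' via the VM's MSR bitmap.
+ */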
+#define guest_msr_rw(vmx, msr) \
+ msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
+
+#define HANDLED 1
+#define UNHANDLED 0
+
+MALLOC_DEFINE(M_VMX, "vmx", "vmx");
+
+int vmxon_enabled[MAXCPU];
+static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
+
+static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
+static uint32_t exit_ctls, entry_ctls;
+
+static uint64_t cr0_ones_mask, cr0_zeros_mask;
+static uint64_t cr4_ones_mask, cr4_zeros_mask;
+
+static volatile u_int nextvpid;
+
+static int vmx_no_patmsr;
+
+/*
+ * Virtual NMI blocking conditions.
+ *
+ * Some processor implementations also require NMIs to be blocked if
+ * the STI_BLOCKING bit is set. It is possible to detect this at runtime
+ * based on the (exit_reason,exit_qual) tuple being set to
+ * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
+ *
+ * We take the easy way out and also include STI_BLOCKING as one of the
+ * gating items for vNMI injection.
+ */
+static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
+ VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_STI_BLOCKING;
+
+/*
+ * Optional capabilities
+ */
+static int cap_halt_exit;
+static int cap_pause_exit;
+static int cap_unrestricted_guest;
+static int cap_monitor_trap;
+
+/* statistics */
+static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus");
+static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt");
+static VMM_STAT_DEFINE(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");
+static VMM_STAT_DEFINE(VMEXIT_HLT, "number of times hlt was intercepted");
+
+#ifdef KTR
+static const char *
+exit_reason_to_str(int reason)
+{
+ static char reasonbuf[32];
+
+ switch (reason) {
+ case EXIT_REASON_EXCEPTION:
+ return "exception";
+ case EXIT_REASON_EXT_INTR:
+ return "extint";
+ case EXIT_REASON_TRIPLE_FAULT:
+ return "triplefault";
+ case EXIT_REASON_INIT:
+ return "init";
+ case EXIT_REASON_SIPI:
+ return "sipi";
+ case EXIT_REASON_IO_SMI:
+ return "iosmi";
+ case EXIT_REASON_SMI:
+ return "smi";
+ case EXIT_REASON_INTR_WINDOW:
+ return "intrwindow";
+ case EXIT_REASON_NMI_WINDOW:
+ return "nmiwindow";
+ case EXIT_REASON_TASK_SWITCH:
+ return "taskswitch";
+ case EXIT_REASON_CPUID:
+ return "cpuid";
+ case EXIT_REASON_GETSEC:
+ return "getsec";
+ case EXIT_REASON_HLT:
+ return "hlt";
+ case EXIT_REASON_INVD:
+ return "invd";
+ case EXIT_REASON_INVLPG:
+ return "invlpg";
+ case EXIT_REASON_RDPMC:
+ return "rdpmc";
+ case EXIT_REASON_RDTSC:
+ return "rdtsc";
+ case EXIT_REASON_RSM:
+ return "rsm";
+ case EXIT_REASON_VMCALL:
+ return "vmcall";
+ case EXIT_REASON_VMCLEAR:
+ return "vmclear";
+ case EXIT_REASON_VMLAUNCH:
+ return "vmlaunch";
+ case EXIT_REASON_VMPTRLD:
+ return "vmptrld";
+ case EXIT_REASON_VMPTRST:
+ return "vmptrst";
+ case EXIT_REASON_VMREAD:
+ return "vmread";
+ case EXIT_REASON_VMRESUME:
+ return "vmresume";
+ case EXIT_REASON_VMWRITE:
+ return "vmwrite";
+ case EXIT_REASON_VMXOFF:
+ return "vmxoff";
+ case EXIT_REASON_VMXON:
+ return "vmxon";
+ case EXIT_REASON_CR_ACCESS:
+ return "craccess";
+ case EXIT_REASON_DR_ACCESS:
+ return "draccess";
+ case EXIT_REASON_INOUT:
+ return "inout";
+ case EXIT_REASON_RDMSR:
+ return "rdmsr";
+ case EXIT_REASON_WRMSR:
+ return "wrmsr";
+ case EXIT_REASON_INVAL_VMCS:
+ return "invalvmcs";
+ case EXIT_REASON_INVAL_MSR:
+ return "invalmsr";
+ case EXIT_REASON_MWAIT:
+ return "mwait";
+ case EXIT_REASON_MTF:
+ return "mtf";
+ case EXIT_REASON_MONITOR:
+ return "monitor";
+ case EXIT_REASON_PAUSE:
+ return "pause";
+ case EXIT_REASON_MCE:
+ return "mce";
+ case EXIT_REASON_TPR:
+ return "tpr";
+ case EXIT_REASON_APIC:
+ return "apic";
+ case EXIT_REASON_GDTR_IDTR:
+ return "gdtridtr";
+ case EXIT_REASON_LDTR_TR:
+ return "ldtrtr";
+ case EXIT_REASON_EPT_FAULT:
+ return "eptfault";
+ case EXIT_REASON_EPT_MISCONFIG:
+ return "eptmisconfig";
+ case EXIT_REASON_INVEPT:
+ return "invept";
+ case EXIT_REASON_RDTSCP:
+ return "rdtscp";
+ case EXIT_REASON_VMX_PREEMPT:
+ return "vmxpreempt";
+ case EXIT_REASON_INVVPID:
+ return "invvpid";
+ case EXIT_REASON_WBINVD:
+ return "wbinvd";
+ case EXIT_REASON_XSETBV:
+ return "xsetbv";
+ default:
+ snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
+ return (reasonbuf);
+ }
+}
+
+#ifdef SETJMP_TRACE
+static const char *
+vmx_setjmp_rc2str(int rc)
+{
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ return "direct";
+ case VMX_RETURN_LONGJMP:
+ return "longjmp";
+ case VMX_RETURN_VMRESUME:
+ return "vmresume";
+ case VMX_RETURN_VMLAUNCH:
+ return "vmlaunch";
+ case VMX_RETURN_AST:
+ return "ast";
+ default:
+ return "unknown";
+ }
+}
+
+#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \
+ VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
+ (vmxctx)->regname)
+
+static void
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ uint64_t host_rip, host_rsp;
+
+ if (vmxctx != &vmx->ctx[vcpu])
+ panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
+ vmxctx, &vmx->ctx[vcpu]);
+
+ VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
+ VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
+ vmx_setjmp_rc2str(rc), rc);
+
+ host_rsp = host_rip = ~0;
+ vmread(VMCS_HOST_RIP, &host_rip);
+ vmread(VMCS_HOST_RSP, &host_rsp);
+ VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
+ host_rip, host_rsp);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
+}
+#endif
+#else
+static __inline void
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ return;
+}
+#endif /* KTR */
+
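+/*
+ * Force the bits in 'cr0' that VMX operation requires to be fixed to 0
+ * or 1, as derived from the MSR_VMX_CR0_FIXED[01] MSRs in vmx_init().
+ * vmx_fix_cr4() does the same for 'cr4'.
+ */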
+u_long
+vmx_fix_cr0(u_long cr0)
+{
+
+ return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
+}
+
+u_long
+vmx_fix_cr4(u_long cr4)
+{
+
+ return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
+}
+
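+/*
+ * Populate the guest MSR save area that is saved on VM exit and restored
+ * on VM entry (currently just MSR_KGSBASE).
+ */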
+static void
+msr_save_area_init(struct msr_entry *g_area, int *g_count)
+{
+ int cnt;
+
+ static struct msr_entry guest_msrs[] = {
+ { MSR_KGSBASE, 0, 0 },
+ };
+
+ cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
+ if (cnt > GUEST_MSR_MAX_ENTRIES)
+ panic("guest msr save area overrun");
+ bcopy(guest_msrs, g_area, sizeof(guest_msrs));
+ *g_count = cnt;
+}
+
+static void
+vmx_disable(void *arg __unused)
+{
+ struct invvpid_desc invvpid_desc = { 0 };
+ struct invept_desc invept_desc = { 0 };
+
+ if (vmxon_enabled[curcpu]) {
+ /*
+ * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
+ *
+ * VMXON and VMXOFF are not required to invalidate any TLB
+ * caching structures, so invalidate all EPT- and VPID-tagged
+ * mappings explicitly here to avoid retaining stale cached
+ * information between distinct VMX episodes.
+ */
+ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
+ invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
+ vmxoff();
+ }
+ load_cr4(rcr4() & ~CR4_VMXE);
+}
+
+static int
+vmx_cleanup(void)
+{
+
+ smp_rendezvous(NULL, vmx_disable, NULL, NULL);
+
+ return (0);
+}
+
+static void
+vmx_enable(void *arg __unused)
+{
+ int error;
+
+ load_cr4(rcr4() | CR4_VMXE);
+
+ *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
+ error = vmxon(vmxon_region[curcpu]);
+ if (error == 0)
+ vmxon_enabled[curcpu] = 1;
+}
+
+static int
+vmx_init(void)
+{
+ int error;
+ uint64_t fixed0, fixed1, feature_control;
+ uint32_t tmp;
+
+ /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
+ if (!(cpu_feature2 & CPUID2_VMX)) {
+ printf("vmx_init: processor does not support VMX operation\n");
+ return (ENXIO);
+ }
+
+ /*
+ * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
+ * are set (bits 0 and 2 respectively).
+ */
+ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if ((feature_control & 0x5) != 0x5) {
+ printf("vmx_init: VMX operation disabled by BIOS\n");
+ return (ENXIO);
+ }
+
+ /* Check support for primary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_CTLS_ONE_SETTING,
+ PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired primary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Clear the processor-based ctl bits that are set on demand */
+ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
+
+ /* Check support for secondary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED_CTLS2_ONE_SETTING,
+ PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
+ if (error) {
+ printf("vmx_init: processor does not support desired secondary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VPID */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_ENABLE_VPID, 0, &tmp);
+ if (error == 0)
+ procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
+
+ /* Check support for pin-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
+ MSR_VMX_TRUE_PINBASED_CTLS,
+ PINBASED_CTLS_ONE_SETTING,
+ PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "pin-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VM-exit controls */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ /* Try again without the PAT MSR bits */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
+ MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "exit controls\n");
+ return (error);
+ } else {
+ if (bootverbose)
+ printf("vmm: PAT MSR access not supported\n");
+ guest_msr_valid(MSR_PAT);
+ vmx_no_patmsr = 1;
+ }
+ }
+
+ /* Check support for VM-entry controls */
+ if (!vmx_no_patmsr) {
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
+ MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING,
+ VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ } else {
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
+ MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
+ VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ }
+
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "entry controls\n");
+ return (error);
+ }
+
+ /*
+ * Check support for optional features by testing them
+ * as individual bits
+ */
+ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_HLT_EXITING, 0,
+ &tmp) == 0);
+
+ cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_PROCBASED_CTLS,
+ PROCBASED_MTF, 0,
+ &tmp) == 0);
+
+ cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_PAUSE_EXITING, 0,
+ &tmp) == 0);
+
+ cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_UNRESTRICTED_GUEST, 0,
+ &tmp) == 0);
+
+ /* Initialize EPT */
+ error = ept_init();
+ if (error) {
+ printf("vmx_init: ept initialization failed (%d)\n", error);
+ return (error);
+ }
+
+ /*
+ * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
+ */
+ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
+ cr0_ones_mask = fixed0 & fixed1;
+ cr0_zeros_mask = ~fixed0 & ~fixed1;
+
+ /*
+ * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
+ * if unrestricted guest execution is allowed.
+ */
+ if (cap_unrestricted_guest)
+ cr0_ones_mask &= ~(CR0_PG | CR0_PE);
+
+ /*
+ * Do not allow the guest to set CR0_NW or CR0_CD.
+ */
+ cr0_zeros_mask |= (CR0_NW | CR0_CD);
+
+ fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
+ cr4_ones_mask = fixed0 & fixed1;
+ cr4_zeros_mask = ~fixed0 & ~fixed1;
+
+ /* enable VMX operation */
+ smp_rendezvous(NULL, vmx_enable, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * If this processor does not support VPIDs then simply return 0.
+ *
+ * Otherwise generate the next value of VPID to use. Any value is alright
+ * as long as it is non-zero.
+ *
+ * We always execute in VMX non-root context with EPT enabled. Thus all
+ * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This
+ * in turn means that multiple VMs can share the same VPID as long as
+ * they have distinct EPT page tables.
+ *
+ * XXX
+ * We should optimize this so that it returns VPIDs that are not in
+ * use. Then we will not unnecessarily invalidate mappings in
+ * vmx_set_pcpu_defaults() just because two or more vcpus happen to
+ * use the same 'vpid'.
+ */
+static uint16_t
+vmx_vpid(void)
+{
+ uint16_t vpid = 0;
+
+ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) {
+ do {
+ vpid = atomic_fetchadd_int(&nextvpid, 1);
+ } while (vpid == 0);
+ }
+
+ return (vpid);
+}
+
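+/*
+ * Program the CR0/CR4 guest/host mask and read shadow. Guest reads of
+ * the masked bits return the shadow value; guest attempts to change a
+ * masked bit cause a VM exit.
+ */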
+static int
+vmx_setup_cr_shadow(int which, struct vmcs *vmcs)
+{
+ int error, mask_ident, shadow_ident;
+ uint64_t mask_value, shadow_value;
+
+ if (which != 0 && which != 4)
+ panic("vmx_setup_cr_shadow: unknown cr%d", which);
+
+ if (which == 0) {
+ mask_ident = VMCS_CR0_MASK;
+ mask_value = cr0_ones_mask | cr0_zeros_mask;
+ shadow_ident = VMCS_CR0_SHADOW;
+ shadow_value = cr0_ones_mask;
+ } else {
+ mask_ident = VMCS_CR4_MASK;
+ mask_value = cr4_ones_mask | cr4_zeros_mask;
+ shadow_ident = VMCS_CR4_SHADOW;
+ shadow_value = cr4_ones_mask;
+ }
+
+ error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value);
+ if (error)
+ return (error);
+
+ error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value);
+ if (error)
+ return (error);
+
+ return (0);
+}
+#define vmx_setup_cr0_shadow(vmcs) vmx_setup_cr_shadow(0, (vmcs))
+#define vmx_setup_cr4_shadow(vmcs) vmx_setup_cr_shadow(4, (vmcs))
+
+static void *
+vmx_vminit(struct vm *vm)
+{
+ uint16_t vpid;
+ int i, error, guest_msr_count;
+ struct vmx *vmx;
+
+ vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
+ if ((uintptr_t)vmx & PAGE_MASK) {
+ panic("malloc of struct vmx not aligned on %d byte boundary",
+ PAGE_SIZE);
+ }
+ vmx->vm = vm;
+
+ /*
+ * Clean up EPTP-tagged guest physical and combined mappings
+ *
+ * VMX transitions are not required to invalidate any guest physical
+ * mappings. So, it may be possible for stale guest physical mappings
+ * to be present in the processor TLBs.
+ *
+ * Combined mappings for this EP4TA are also invalidated for all VPIDs.
+ */
+ ept_invalidate_mappings(vtophys(vmx->pml4ept));
+
+ msr_bitmap_initialize(vmx->msr_bitmap);
+
+ /*
+ * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
+ * The guest FSBASE and GSBASE are saved and restored during
+ * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
+ * always restored from the vmcs host state area on vm-exit.
+ *
+ * Guest KGSBASE is saved and restored in the guest MSR save area.
+ * Host KGSBASE is restored before returning to userland from the pcb.
+ * There will be a window of time when we are executing in the host
+ * kernel context with a value of KGSBASE from the guest. This is ok
+ * because the value of KGSBASE is inconsequential in kernel context.
+ *
+ * MSR_EFER is saved and restored in the guest VMCS area on a
+ * VM exit and entry respectively. It is also restored from the
+ * host VMCS area on a VM exit.
+ */
+ if (guest_msr_rw(vmx, MSR_GSBASE) ||
+ guest_msr_rw(vmx, MSR_FSBASE) ||
+ guest_msr_rw(vmx, MSR_KGSBASE) ||
+ guest_msr_rw(vmx, MSR_EFER))
+ panic("vmx_vminit: error setting guest msr access");
+
+ /*
+ * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
+ * and entry respectively. It is also restored from the host VMCS
+ * area on a VM exit. However, if running on a system with no
+ * MSR_PAT save/restore support, leave access disabled so accesses
+ * will be trapped.
+ */
+ if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
+ panic("vmx_vminit: error setting guest pat msr access");
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vmx->vmcs[i].identifier = vmx_revision();
+ error = vmclear(&vmx->vmcs[i]);
+ if (error != 0) {
+ panic("vmx_vminit: vmclear error %d on vcpu %d\n",
+ error, i);
+ }
+
+ vpid = vmx_vpid();
+
+ error = vmcs_set_defaults(&vmx->vmcs[i],
+ (u_long)vmx_longjmp,
+ (u_long)&vmx->ctx[i],
+ vtophys(vmx->pml4ept),
+ pinbased_ctls,
+ procbased_ctls,
+ procbased_ctls2,
+ exit_ctls, entry_ctls,
+ vtophys(vmx->msr_bitmap),
+ vpid);
+
+ if (error != 0)
+ panic("vmx_vminit: vmcs_set_defaults error %d", error);
+
+ vmx->cap[i].set = 0;
+ vmx->cap[i].proc_ctls = procbased_ctls;
+
+ vmx->state[i].lastcpu = -1;
+ vmx->state[i].vpid = vpid;
+
+ msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
+
+ error = vmcs_set_msr_save(&vmx->vmcs[i],
+ vtophys(vmx->guest_msrs[i]),
+ guest_msr_count);
+ if (error != 0)
+ panic("vmcs_set_msr_save error %d", error);
+
+ error = vmx_setup_cr0_shadow(&vmx->vmcs[i]);
+ if (error != 0)
+ panic("vmx_setup_cr0_shadow %d", error);
+
+ error = vmx_setup_cr4_shadow(&vmx->vmcs[i]);
+ if (error != 0)
+ panic("vmx_setup_cr4_shadow %d", error);
+ }
+
+ return (vmx);
+}
+
+static int
+vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
+{
+ int handled, func;
+
+ func = vmxctx->guest_rax;
+
+ handled = x86_emulate_cpuid(vm, vcpu,
+ (uint32_t*)(&vmxctx->guest_rax),
+ (uint32_t*)(&vmxctx->guest_rbx),
+ (uint32_t*)(&vmxctx->guest_rcx),
+ (uint32_t*)(&vmxctx->guest_rdx));
+ return (handled);
+}
+
+static __inline void
+vmx_run_trace(struct vmx *vmx, int vcpu)
+{
+#ifdef KTR
+ VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
+#endif
+}
+
+static __inline void
+vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
+ int handled)
+{
+#ifdef KTR
+ VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
+ handled ? "handled" : "unhandled",
+ exit_reason_to_str(exit_reason), rip);
+#endif
+}
+
+static __inline void
+vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
+{
+#ifdef KTR
+ VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
+#endif
+}
+
+static int
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
+{
+ int error, lastcpu;
+ struct vmxstate *vmxstate;
+ struct invvpid_desc invvpid_desc = { 0 };
+
+ vmxstate = &vmx->state[vcpu];
+ lastcpu = vmxstate->lastcpu;
+ vmxstate->lastcpu = curcpu;
+
+ if (lastcpu == curcpu) {
+ error = 0;
+ goto done;
+ }
+
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+
+ error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
+ if (error != 0)
+ goto done;
+
+ error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
+ if (error != 0)
+ goto done;
+
+ error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+ if (error != 0)
+ goto done;
+
+ /*
+ * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
+ *
+ * We do this because this vcpu was executing on a different host
+ * cpu when it last ran. We do not track whether it invalidated
+ * mappings associated with its 'vpid' during that run. So we must
+ * assume that the mappings associated with 'vpid' on 'curcpu' are
+ * stale and invalidate them.
+ *
+ * Note that we incur this penalty only when the scheduler chooses to
+ * move the thread associated with this vcpu between host cpus.
+ *
+ * Note also that this will invalidate mappings tagged with 'vpid'
+ * for "all" EP4TAs.
+ */
+ if (vmxstate->vpid != 0) {
+ invvpid_desc.vpid = vmxstate->vpid;
+ invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+ }
+done:
+ return (error);
+}
+
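+/*
+ * Advance the guest %rip in the VMCS past the instruction that caused
+ * the VM exit.
+ */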
+static void
+vm_exit_update_rip(struct vm_exit *vmexit)
+{
+ int error;
+
+ error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
+ if (error)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+}
+
+/*
+ * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
+ */
+CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
+
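+/*
+ * The helpers below toggle the interrupt-window and NMI-window exiting
+ * bits in the vcpu's primary processor-based controls so that the guest
+ * exits as soon as it can accept a pending event.
+ */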
+static __inline void
+vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_set_int_window_exiting: vmwrite error %d", error);
+}
+
+static __inline void
+vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
+}
+
+static __inline void
+vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
+}
+
+static __inline void
+vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
+}
+
+static int
+vmx_inject_nmi(struct vmx *vmx, int vcpu)
+{
+ int error;
+ uint64_t info, interruptibility;
+
+ /* Bail out if no NMI requested */
+ if (!vm_nmi_pending(vmx->vm, vcpu))
+ return (0);
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_nmi: vmread(interruptibility) %d",
+ error);
+ }
+ if (interruptibility & nmi_blocking_bits)
+ goto nmiblocked;
+
+ /*
+ * Inject the virtual NMI. The vector must be the NMI IDT entry
+ * or the VMCS entry check will fail.
+ */
+ info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
+ info |= IDT_NMI;
+
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);
+
+ VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");
+
+ /* Clear the request */
+ vm_nmi_clear(vmx->vm, vcpu);
+ return (1);
+
+nmiblocked:
+ /*
+ * Set the NMI Window Exiting execution control so we can inject
+ * the virtual NMI as soon as the blocking condition goes away.
+ */
+ vmx_set_nmi_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
+ return (1);
+}
+
+static void
+vmx_inject_interrupts(struct vmx *vmx, int vcpu)
+{
+ int error, vector;
+ uint64_t info, rflags, interruptibility;
+
+ const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;
+
+ /*
+ * If there is already an interrupt pending then just return.
+ *
+ * This could happen if an interrupt was injected on a prior
+ * VM entry but the actual entry into guest mode was aborted
+ * because of a pending AST.
+ */
+ error = vmread(VMCS_ENTRY_INTR_INFO, &info);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
+ if (info & VMCS_INTERRUPTION_INFO_VALID)
+ return;
+
+ /*
+ * NMI injection has priority so deal with those first
+ */
+ if (vmx_inject_nmi(vmx, vcpu))
+ return;
+
+ /* Ask the local apic for a vector to inject */
+ vector = lapic_pending_intr(vmx->vm, vcpu);
+ if (vector < 0)
+ return;
+
+ if (vector < 32 || vector > 255)
+ panic("vmx_inject_interrupts: invalid vector %d\n", vector);
+
+ /* Check RFLAGS.IF and the interruptibility state of the guest */
+ error = vmread(VMCS_GUEST_RFLAGS, &rflags);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(rflags) %d", error);
+
+ if ((rflags & PSL_I) == 0)
+ goto cantinject;
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_interrupts: vmread(interruptibility) %d",
+ error);
+ }
+ if (interruptibility & HWINTR_BLOCKED)
+ goto cantinject;
+
+ /* Inject the interrupt */
+ info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
+ info |= vector;
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);
+
+ /* Update the Local APIC ISR */
+ lapic_intr_accepted(vmx->vm, vcpu, vector);
+
+ VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
+
+ return;
+
+cantinject:
+ /*
+ * Set the Interrupt Window Exiting execution control so we can inject
+ * the interrupt as soon as the blocking condition goes away.
+ */
+ vmx_set_int_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
+}
+
+static int
+vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+{
+ int error, cr, vmcs_guest_cr;
+ uint64_t regval, ones_mask, zeros_mask;
+ const struct vmxctx *vmxctx;
+
+ /* We only handle mov to %cr0 or %cr4 at this time */
+ if ((exitqual & 0xf0) != 0x00)
+ return (UNHANDLED);
+
+ cr = exitqual & 0xf;
+ if (cr != 0 && cr != 4)
+ return (UNHANDLED);
+
+ vmxctx = &vmx->ctx[vcpu];
+
+ /*
+ * We must use vmwrite() directly here because vmcs_setreg() will
+ * call vmclear(vmcs) as a side-effect, which we certainly don't want.
+ */
+ switch ((exitqual >> 8) & 0xf) {
+ case 0:
+ regval = vmxctx->guest_rax;
+ break;
+ case 1:
+ regval = vmxctx->guest_rcx;
+ break;
+ case 2:
+ regval = vmxctx->guest_rdx;
+ break;
+ case 3:
+ regval = vmxctx->guest_rbx;
+ break;
+ case 4:
+ error = vmread(VMCS_GUEST_RSP, &regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: "
+ "error %d reading guest rsp", error);
+ }
+ break;
+ case 5:
+ regval = vmxctx->guest_rbp;
+ break;
+ case 6:
+ regval = vmxctx->guest_rsi;
+ break;
+ case 7:
+ regval = vmxctx->guest_rdi;
+ break;
+ case 8:
+ regval = vmxctx->guest_r8;
+ break;
+ case 9:
+ regval = vmxctx->guest_r9;
+ break;
+ case 10:
+ regval = vmxctx->guest_r10;
+ break;
+ case 11:
+ regval = vmxctx->guest_r11;
+ break;
+ case 12:
+ regval = vmxctx->guest_r12;
+ break;
+ case 13:
+ regval = vmxctx->guest_r13;
+ break;
+ case 14:
+ regval = vmxctx->guest_r14;
+ break;
+ case 15:
+ regval = vmxctx->guest_r15;
+ break;
+ }
+
+ if (cr == 0) {
+ ones_mask = cr0_ones_mask;
+ zeros_mask = cr0_zeros_mask;
+ vmcs_guest_cr = VMCS_GUEST_CR0;
+ } else {
+ ones_mask = cr4_ones_mask;
+ zeros_mask = cr4_zeros_mask;
+ vmcs_guest_cr = VMCS_GUEST_CR4;
+ }
+ regval |= ones_mask;
+ regval &= ~zeros_mask;
+ error = vmwrite(vmcs_guest_cr, regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: error %d writing cr%d",
+ error, cr);
+ }
+
+ return (HANDLED);
+}
+
+static int
+vmx_ept_fault(struct vm *vm, int cpu,
+ uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
+ uint64_t cr3, uint64_t ept_qual, struct vie *vie)
+{
+ int read, write, error;
+
+ /* EPT violation on an instruction fetch doesn't make sense here */
+ if (ept_qual & EPT_VIOLATION_INST_FETCH)
+ return (UNHANDLED);
+
+ /* EPT violation must be a read fault or a write fault */
+ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
+ write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
+ if ((read | write) == 0)
+ return (UNHANDLED);
+
+ /*
+ * The EPT violation must have been caused by accessing a
+ * guest-physical address that is a translation of a guest-linear
+ * address.
+ */
+ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
+ (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
+ return (UNHANDLED);
+ }
+
+ /* Fetch, decode and emulate the faulting instruction */
+ if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
+ return (UNHANDLED);
+
+ if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
+ return (UNHANDLED);
+
+ /*
+ * Check if this is a local apic access
+ */
+ if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
+ return (UNHANDLED);
+
+ error = vmm_emulate_instruction(vm, cpu, gpa, vie,
+ lapic_mmio_read, lapic_mmio_write, 0);
+
+ return (error ? UNHANDLED : HANDLED);
+}
+
+static int
+vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+ int error, handled;
+ struct vmcs *vmcs;
+ struct vmxctx *vmxctx;
+ uint32_t eax, ecx, edx;
+ uint64_t qual, gla, gpa, cr3, intr_info;
+
+ handled = 0;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ qual = vmexit->u.vmx.exit_qualification;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+
+ switch (vmexit->u.vmx.exit_reason) {
+ case EXIT_REASON_CR_ACCESS:
+ handled = vmx_emulate_cr_access(vmx, vcpu, qual);
+ break;
+ case EXIT_REASON_RDMSR:
+ ecx = vmxctx->guest_rcx;
+ error = emulate_rdmsr(vmx->vm, vcpu, ecx);
+ if (error) {
+ vmexit->exitcode = VM_EXITCODE_RDMSR;
+ vmexit->u.msr.code = ecx;
+ } else
+ handled = 1;
+ break;
+ case EXIT_REASON_WRMSR:
+ eax = vmxctx->guest_rax;
+ ecx = vmxctx->guest_rcx;
+ edx = vmxctx->guest_rdx;
+ error = emulate_wrmsr(vmx->vm, vcpu, ecx,
+ (uint64_t)edx << 32 | eax);
+ if (error) {
+ vmexit->exitcode = VM_EXITCODE_WRMSR;
+ vmexit->u.msr.code = ecx;
+ vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
+ } else
+ handled = 1;
+ break;
+ case EXIT_REASON_HLT:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
+ /*
+ * If there is an event waiting to be injected then there is
+ * no need to 'hlt'.
+ */
+ error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
+ if (error)
+ panic("vmx_exit_process: vmread(intrinfo) %d", error);
+
+ if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
+ handled = 1;
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
+ } else
+ vmexit->exitcode = VM_EXITCODE_HLT;
+ break;
+ case EXIT_REASON_MTF:
+ vmexit->exitcode = VM_EXITCODE_MTRAP;
+ break;
+ case EXIT_REASON_PAUSE:
+ vmexit->exitcode = VM_EXITCODE_PAUSE;
+ break;
+ case EXIT_REASON_INTR_WINDOW:
+ vmx_clear_int_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
+ /* FALLTHRU */
+ case EXIT_REASON_EXT_INTR:
+ /*
+ * External interrupts serve only to cause VM exits and allow
+ * the host interrupt handler to run.
+ *
+ * If this external interrupt triggers a virtual interrupt
+ * to a VM, then that state will be recorded by the
+ * host interrupt handler in the VM's softc. We will inject
+ * this virtual interrupt during the subsequent VM enter.
+ */
+
+ /*
+ * This is special. We want to treat this as a 'handled'
+ * VM-exit but not increment the instruction pointer.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
+ return (1);
+ case EXIT_REASON_NMI_WINDOW:
+ /* Exit to allow the pending virtual NMI to be injected */
+ vmx_clear_nmi_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
+ return (1);
+ case EXIT_REASON_INOUT:
+ vmexit->exitcode = VM_EXITCODE_INOUT;
+ vmexit->u.inout.bytes = (qual & 0x7) + 1;
+ vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
+ vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
+ vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
+ vmexit->u.inout.port = (uint16_t)(qual >> 16);
+ vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
+ break;
+ case EXIT_REASON_CPUID:
+ handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
+ break;
+ case EXIT_REASON_EPT_FAULT:
+ gla = vmcs_gla();
+ gpa = vmcs_gpa();
+ cr3 = vmcs_guest_cr3();
+ handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
+ vmexit->rip, vmexit->inst_length,
+ cr3, qual, &vmexit->u.paging.vie);
+ if (!handled) {
+ vmexit->exitcode = VM_EXITCODE_PAGING;
+ vmexit->u.paging.gpa = gpa;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (handled) {
+ /*
+ * It is possible that control is returned to userland
+ * even though we were able to handle the VM exit in the
+ * kernel.
+ *
+ * In such a case we want to make sure that the userland
+ * restarts guest execution at the instruction *after*
+ * the one we just processed. Therefore we update the
+ * guest rip in the VMCS and in 'vmexit'.
+ */
+ vm_exit_update_rip(vmexit);
+ vmexit->rip += vmexit->inst_length;
+ vmexit->inst_length = 0;
+
+ /*
+ * Special case for spinning up an AP - exit to userspace to
+ * give the controlling process a chance to intercept and
+ * spin up a thread for the AP.
+ */
+ if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
+ handled = 0;
+ } else {
+ if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
+ /*
+ * If this VM exit was not claimed by anybody then
+ * treat it as a generic VMX exit.
+ */
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.error = 0;
+ } else {
+ /*
+ * The exitcode and collateral have been populated.
+ * The VM exit will be processed further in userland.
+ */
+ }
+ }
+ return (handled);
+}
+
+static int
+vmx_run(void *arg, int vcpu, register_t rip)
+{
+ int error, vie, rc, handled, astpending;
+ uint32_t exit_reason;
+ struct vmx *vmx;
+ struct vmxctx *vmxctx;
+ struct vmcs *vmcs;
+ struct vm_exit *vmexit;
+
+ vmx = arg;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ vmxctx->launched = 0;
+
+ astpending = 0;
+ vmexit = vm_exitinfo(vmx->vm, vcpu);
+
+ /*
+ * XXX Can we avoid doing this every time we do a vm run?
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * XXX
+ * We do this every time because we may setup the virtual machine
+ * from a different process than the one that actually runs it.
+ *
+ * If the life of a virtual machine was spent entirely in the context
+ * of a single process we could do this once in vmcs_set_defaults().
+ */
+ if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
+ panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);
+
+ if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+
+ if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
+ panic("vmx_run: error %d setting up pcpu defaults", error);
+
+ do {
+ lapic_timer_tick(vmx->vm, vcpu);
+ vmx_inject_interrupts(vmx, vcpu);
+ vmx_run_trace(vmx, vcpu);
+ rc = vmx_setjmp(vmxctx);
+#ifdef SETJMP_TRACE
+ vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
+#endif
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ if (vmxctx->launched == 0) {
+ vmxctx->launched = 1;
+ vmx_launch(vmxctx);
+ } else
+ vmx_resume(vmxctx);
+ panic("vmx_launch/resume should not return");
+ break;
+ case VMX_RETURN_LONGJMP:
+ break; /* vm exit */
+ case VMX_RETURN_AST:
+ astpending = 1;
+ break;
+ case VMX_RETURN_VMRESUME:
+ vie = vmcs_instruction_error();
+ if (vmxctx->launch_error == VM_FAIL_INVALID ||
+ vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
+ printf("vmresume error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+ goto err_exit;
+ }
+ vmx_launch(vmxctx); /* try to launch the guest */
+ panic("vmx_launch should not return");
+ break;
+ case VMX_RETURN_VMLAUNCH:
+ vie = vmcs_instruction_error();
+#if 1
+ printf("vmlaunch error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+#endif
+ goto err_exit;
+ default:
+ panic("vmx_setjmp returned %d", rc);
+ }
+
+ /* enable interrupts */
+ enable_intr();
+
+ /* collect some basic information for VM exit processing */
+ vmexit->rip = rip = vmcs_guest_rip();
+ vmexit->inst_length = vmexit_instruction_length();
+ vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
+ vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
+
+ if (astpending) {
+ handled = 1;
+ vmexit->inst_length = 0;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+ vmx_astpending_trace(vmx, vcpu, rip);
+ break;
+ }
+
+ handled = vmx_exit_process(vmx, vcpu, vmexit);
+ vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
+
+ } while (handled);
+
+ /*
+ * If a VM exit has been handled then the exitcode must be BOGUS;
+ * if a VM exit is not handled then the exitcode must not be BOGUS.
+ */
+ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
+ (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
+ panic("Mismatch between handled (%d) and exitcode (%d)",
+ handled, vmexit->exitcode);
+ }
+
+ VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode);
+
+ /*
+ * XXX
+ * We need to do this to ensure that any VMCS state cached by the
+ * processor is flushed to memory. We need to do this in case the
+ * VM moves to a different cpu the next time it runs.
+ *
+ * Can we avoid doing this?
+ */
+ VMCLEAR(vmcs);
+ return (0);
+
+err_exit:
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.exit_reason = (uint32_t)-1;
+ vmexit->u.vmx.exit_qualification = (uint32_t)-1;
+ vmexit->u.vmx.error = vie;
+ VMCLEAR(vmcs);
+ return (ENOEXEC);
+}
+
+static void
+vmx_vmcleanup(void *arg)
+{
+ int error;
+ struct vmx *vmx = arg;
+
+ /*
+ * XXXSMP we also need to clear the VMCS active on the other vcpus.
+ */
+ error = vmclear(&vmx->vmcs[0]);
+ if (error != 0)
+ panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
+
+ ept_vmcleanup(vmx);
+ free(vmx, M_VMX);
+
+ return;
+}
+
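+/*
+ * Translate a VM_REG_GUEST_* identifier into a pointer to the register's
+ * save slot in the 'vmxctx', or NULL if the register is not kept there.
+ */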
+static register_t *
+vmxctx_regptr(struct vmxctx *vmxctx, int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_RAX:
+ return (&vmxctx->guest_rax);
+ case VM_REG_GUEST_RBX:
+ return (&vmxctx->guest_rbx);
+ case VM_REG_GUEST_RCX:
+ return (&vmxctx->guest_rcx);
+ case VM_REG_GUEST_RDX:
+ return (&vmxctx->guest_rdx);
+ case VM_REG_GUEST_RSI:
+ return (&vmxctx->guest_rsi);
+ case VM_REG_GUEST_RDI:
+ return (&vmxctx->guest_rdi);
+ case VM_REG_GUEST_RBP:
+ return (&vmxctx->guest_rbp);
+ case VM_REG_GUEST_R8:
+ return (&vmxctx->guest_r8);
+ case VM_REG_GUEST_R9:
+ return (&vmxctx->guest_r9);
+ case VM_REG_GUEST_R10:
+ return (&vmxctx->guest_r10);
+ case VM_REG_GUEST_R11:
+ return (&vmxctx->guest_r11);
+ case VM_REG_GUEST_R12:
+ return (&vmxctx->guest_r12);
+ case VM_REG_GUEST_R13:
+ return (&vmxctx->guest_r13);
+ case VM_REG_GUEST_R14:
+ return (&vmxctx->guest_r14);
+ case VM_REG_GUEST_R15:
+ return (&vmxctx->guest_r15);
+ default:
+ break;
+ }
+ return (NULL);
+}
+
+static int
+vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *retval = *regp;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *regp = val;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
+{
+ struct vmx *vmx = arg;
+
+ if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
+ return (0);
+
+ /*
+ * If the vcpu is running then don't mess with the VMCS.
+ *
+ * vmcs_getreg will VMCLEAR the vmcs when it is done, which will cause
+ * the subsequent vmlaunch/vmresume to fail.
+ */
+ if (vcpu_is_running(vmx->vm, vcpu))
+ panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval));
+}
+
+static int
+vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
+{
+ int error;
+ uint64_t ctls;
+ struct vmx *vmx = arg;
+
+ /*
+ * XXX Allow caller to set contents of the guest registers saved in
+ * the 'vmxctx' even though the vcpu might be running. We need this
+ * specifically to support the rdmsr emulation that will set the
+ * %eax and %edx registers during vm exit processing.
+ */
+ if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
+ return (0);
+
+ /*
+ * If the vcpu is running then don't mess with the VMCS.
+ *
+ * vmcs_setreg will VMCLEAR the vmcs when it is done, which will cause
+ * the subsequent vmlaunch/vmresume to fail.
+ */
+ if (vcpu_is_running(vmx->vm, vcpu))
+ panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val);
+
+ if (error == 0) {
+ /*
+ * If the "load EFER" VM-entry control is 1 then the
+ * value of EFER.LMA must be identical to "IA-32e mode guest"
+ * bit in the VM-entry control.
+ */
+ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
+ (reg == VM_REG_GUEST_EFER)) {
+ vmcs_getreg(&vmx->vmcs[vcpu],
+ VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
+ if (val & EFER_LMA)
+ ctls |= VM_ENTRY_GUEST_LMA;
+ else
+ ctls &= ~VM_ENTRY_GUEST_LMA;
+ vmcs_setreg(&vmx->vmcs[vcpu],
+ VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
+ }
+ }
+
+ return (error);
+}
+
+static int
+vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+static int
+vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+static int
+vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
+ int code_valid)
+{
+ int error;
+ uint64_t info;
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+
+ static uint32_t type_map[VM_EVENT_MAX] = {
+ 0x1, /* VM_EVENT_NONE */
+ 0x0, /* VM_HW_INTR */
+ 0x2, /* VM_NMI */
+ 0x3, /* VM_HW_EXCEPTION */
+ 0x4, /* VM_SW_INTR */
+ 0x5, /* VM_PRIV_SW_EXCEPTION */
+ 0x6, /* VM_SW_EXCEPTION */
+ };
+
+ /*
+ * If there is already an exception pending to be delivered to the
+ * vcpu then just return.
+ */
+ error = vmcs_getreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
+ if (error)
+ return (error);
+
+ if (info & VMCS_INTERRUPTION_INFO_VALID)
+ return (EAGAIN);
+
+ info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
+ info |= VMCS_INTERRUPTION_INFO_VALID;
+ error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
+ if (error != 0)
+ return (error);
+
+ if (code_valid) {
+ error = vmcs_setreg(vmcs,
+ VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
+ code);
+ }
+ return (error);
+}
+
+static int
+vmx_getcap(void *arg, int vcpu, int type, int *retval)
+{
+ struct vmx *vmx = arg;
+ int vcap;
+ int ret;
+
+ ret = ENOENT;
+
+ vcap = vmx->cap[vcpu].set;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit)
+ ret = 0;
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit)
+ ret = 0;
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap)
+ ret = 0;
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest)
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+ if (ret == 0)
+ *retval = (vcap & (1 << type)) ? 1 : 0;
+
+ return (ret);
+}
+
+static int
+vmx_setcap(void *arg, int vcpu, int type, int val)
+{
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+ uint32_t baseval;
+ uint32_t *pptr;
+ int error;
+ int flag;
+ int reg;
+ int retval;
+
+ retval = ENOENT;
+ pptr = NULL;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_HLT_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_MTF;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_PAUSE_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest) {
+ retval = 0;
+ baseval = procbased_ctls2;
+ flag = PROCBASED2_UNRESTRICTED_GUEST;
+ reg = VMCS_SEC_PROC_BASED_CTLS;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (retval == 0) {
+ if (val) {
+ baseval |= flag;
+ } else {
+ baseval &= ~flag;
+ }
+ VMPTRLD(vmcs);
+ error = vmwrite(reg, baseval);
+ VMCLEAR(vmcs);
+
+ if (error) {
+ retval = error;
+ } else {
+ /*
+ * Update optional stored flags, and record
+ * setting
+ */
+ if (pptr != NULL) {
+ *pptr = baseval;
+ }
+
+ if (val) {
+ vmx->cap[vcpu].set |= (1 << type);
+ } else {
+ vmx->cap[vcpu].set &= ~(1 << type);
+ }
+ }
+ }
+
+ return (retval);
+}
+
+struct vmm_ops vmm_ops_intel = {
+ vmx_init,
+ vmx_cleanup,
+ vmx_vminit,
+ vmx_run,
+ vmx_vmcleanup,
+ ept_vmmmap_set,
+ ept_vmmmap_get,
+ vmx_getreg,
+ vmx_setreg,
+ vmx_getdesc,
+ vmx_setdesc,
+ vmx_inject,
+ vmx_getcap,
+ vmx_setcap
+};
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
new file mode 100644
index 0000000..c7cd567
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -0,0 +1,120 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_H_
+#define _VMX_H_
+
+#include "vmcs.h"
+
+#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
+
+struct vmxctx {
+ register_t tmpstk[32]; /* vmx_return() stack */
+ register_t tmpstktop;
+
+ register_t guest_rdi; /* Guest state */
+ register_t guest_rsi;
+ register_t guest_rdx;
+ register_t guest_rcx;
+ register_t guest_r8;
+ register_t guest_r9;
+ register_t guest_rax;
+ register_t guest_rbx;
+ register_t guest_rbp;
+ register_t guest_r10;
+ register_t guest_r11;
+ register_t guest_r12;
+ register_t guest_r13;
+ register_t guest_r14;
+ register_t guest_r15;
+ register_t guest_cr2;
+
+ register_t host_r15; /* Host state */
+ register_t host_r14;
+ register_t host_r13;
+ register_t host_r12;
+ register_t host_rbp;
+ register_t host_rsp;
+ register_t host_rbx;
+ register_t host_rip;
+ /*
+ * XXX todo debug registers and fpu state
+ */
+
+ int launched; /* vmcs launch state */
+ int launch_error;
+};
+
+struct vmxcap {
+ int set;
+ uint32_t proc_ctls;
+};
+
+struct vmxstate {
+ int lastcpu; /* host cpu that this 'vcpu' last ran on */
+ uint16_t vpid;
+};
+
+/* virtual machine softc */
+struct vmx {
+ pml4_entry_t pml4ept[NPML4EPG];
+ struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
+ char msr_bitmap[PAGE_SIZE];
+ struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
+ struct vmxctx ctx[VM_MAXCPU];
+ struct vmxcap cap[VM_MAXCPU];
+ struct vmxstate state[VM_MAXCPU];
+ struct vm *vm;
+};
+CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
+
+#define VMX_RETURN_DIRECT 0
+#define VMX_RETURN_LONGJMP 1
+#define VMX_RETURN_VMRESUME 2
+#define VMX_RETURN_VMLAUNCH 3
+#define VMX_RETURN_AST 4
+/*
+ * vmx_setjmp() returns:
+ * - 0 when it returns directly
+ * - 1 when it returns from vmx_longjmp
+ * - 2 when it returns from vmx_resume (which would only be in the error case)
+ * - 3 when it returns from vmx_launch (which would only be in the error case)
+ * - 4 when it returns from vmx_resume or vmx_launch because of AST pending
+ */
+int vmx_setjmp(struct vmxctx *ctx);
+void vmx_longjmp(void); /* returns via vmx_setjmp */
+void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+
+u_long vmx_fix_cr0(u_long cr0);
+u_long vmx_fix_cr4(u_long cr4);
+
+#endif
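
The VMX_RETURN_* codes above define a setjmp/longjmp-style protocol between
the C code and vmx_support.S. A simplified sketch of the consumer side,
assuming the retry-on-AST convention used by the real loop in vmx_run()
(the error numbers chosen here are illustrative):

/* Sketch: one guest-entry attempt driven by the vmx_setjmp() protocol. */
static int
example_run_once(struct vmxctx *vmxctx)
{
	int rc;

	rc = vmx_setjmp(vmxctx);
	switch (rc) {
	case VMX_RETURN_DIRECT:		/* first return: enter the guest */
		if (vmxctx->launched)
			vmx_resume(vmxctx);  /* comes back via vmx_setjmp */
		else
			vmx_launch(vmxctx);  /* comes back via vmx_setjmp */
		/* NOTREACHED */
	case VMX_RETURN_LONGJMP:
		return (0);			/* a VM exit brought us back */
	case VMX_RETURN_AST:
		return (EAGAIN);		/* service the AST and retry */
	case VMX_RETURN_VMRESUME:
	case VMX_RETURN_VMLAUNCH:
		return (vmxctx->launch_error);	/* vmresume/vmlaunch failed */
	}
	return (EINVAL);
}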
diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h
new file mode 100644
index 0000000..31f29f8
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_controls.h
@@ -0,0 +1,92 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CONTROLS_H_
+#define _VMX_CONTROLS_H_
+
+/* Pin-Based VM-Execution Controls */
+#define PINBASED_EXTINT_EXITING (1 << 0)
+#define PINBASED_NMI_EXITING (1 << 3)
+#define PINBASED_VIRTUAL_NMI (1 << 5)
+#define PINBASED_PREMPTION_TIMER (1 << 6)
+
+/* Primary Processor-Based VM-Execution Controls */
+#define PROCBASED_INT_WINDOW_EXITING (1 << 2)
+#define PROCBASED_TSC_OFFSET (1 << 3)
+#define PROCBASED_HLT_EXITING (1 << 7)
+#define PROCBASED_INVLPG_EXITING (1 << 9)
+#define PROCBASED_MWAIT_EXITING (1 << 10)
+#define PROCBASED_RDPMC_EXITING (1 << 11)
+#define PROCBASED_RDTSC_EXITING (1 << 12)
+#define PROCBASED_CR3_LOAD_EXITING (1 << 15)
+#define PROCBASED_CR3_STORE_EXITING (1 << 16)
+#define PROCBASED_CR8_LOAD_EXITING (1 << 19)
+#define PROCBASED_CR8_STORE_EXITING (1 << 20)
+#define PROCBASED_USE_TPR_SHADOW (1 << 21)
+#define PROCBASED_NMI_WINDOW_EXITING (1 << 22)
+#define PROCBASED_MOV_DR_EXITING (1 << 23)
+#define PROCBASED_IO_EXITING (1 << 24)
+#define PROCBASED_IO_BITMAPS (1 << 25)
+#define PROCBASED_MTF (1 << 27)
+#define PROCBASED_MSR_BITMAPS (1 << 28)
+#define PROCBASED_MONITOR_EXITING (1 << 29)
+#define PROCBASED_PAUSE_EXITING (1 << 30)
+#define PROCBASED_SECONDARY_CONTROLS (1 << 31)
+
+/* Secondary Processor-Based VM-Execution Controls */
+#define PROCBASED2_VIRTUALIZE_APIC (1 << 0)
+#define PROCBASED2_ENABLE_EPT (1 << 1)
+#define PROCBASED2_DESC_TABLE_EXITING (1 << 2)
+#define PROCBASED2_ENABLE_RDTSCP (1 << 3)
+#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4)
+#define PROCBASED2_ENABLE_VPID (1 << 5)
+#define PROCBASED2_WBINVD_EXITING (1 << 6)
+#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7)
+#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10)
+
+/* VM Exit Controls */
+#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2)
+#define VM_EXIT_HOST_LMA (1 << 9)
+#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12)
+#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15)
+#define VM_EXIT_SAVE_PAT (1 << 18)
+#define VM_EXIT_LOAD_PAT (1 << 19)
+#define VM_EXIT_SAVE_EFER (1 << 20)
+#define VM_EXIT_LOAD_EFER (1 << 21)
+#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22)
+
+/* VM Entry Controls */
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2)
+#define VM_ENTRY_GUEST_LMA (1 << 9)
+#define VM_ENTRY_INTO_SMM (1 << 10)
+#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11)
+#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13)
+#define VM_ENTRY_LOAD_PAT (1 << 14)
+#define VM_ENTRY_LOAD_EFER (1 << 15)
+
+#endif
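
These bit definitions pair with the VMCS field encodings in vmcs.h. A small
hedged sketch of how one might be consulted at runtime, assuming the caller
has the vcpu's VMCS loaded (VMPTRLD) and using vmread() from vmx_cpufunc.h:

/* Sketch: test whether HLT exiting is currently enabled in the VMCS. */
static int
example_hlt_exiting_enabled(void)
{
	uint64_t ctls;

	if (vmread(VMCS_PRI_PROC_BASED_CTLS, &ctls) != 0)
		return (0);		/* vmread failed; assume disabled */
	return ((ctls & PROCBASED_HLT_EXITING) != 0);
}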
diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h
new file mode 100644
index 0000000..2e66443
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_cpufunc.h
@@ -0,0 +1,218 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CPUFUNC_H_
+#define _VMX_CPUFUNC_H_
+
+struct vmcs;
+
+/*
+ * Section 5.2 "Conventions" from Intel Architecture Manual 2B.
+ *
+ * error
+ * VMsucceed 0
+ * VMfailInvalid	1
+ * VMfailValid		2	see also VMCS VM-Instruction Error Field
+ */
+#define VM_SUCCESS 0
+#define VM_FAIL_INVALID 1
+#define VM_FAIL_VALID 2
+#define VMX_SET_ERROR_CODE \
+ " jnc 1f;" \
+ " mov $1, %[error];" /* CF: error = 1 */ \
+ " jmp 3f;" \
+ "1: jnz 2f;" \
+ " mov $2, %[error];" /* ZF: error = 2 */ \
+ " jmp 3f;" \
+ "2: mov $0, %[error];" \
+ "3:"
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmxon(char *region)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(region);
+ __asm __volatile("vmxon %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+
+ return (error);
+}
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmclear(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmclear %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+static __inline void
+vmxoff(void)
+{
+
+ __asm __volatile("vmxoff");
+}
+
+static __inline void
+vmptrst(uint64_t *addr)
+{
+
+ __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory");
+}
+
+static __inline int
+vmptrld(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmptrld %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+static __inline int
+vmwrite(uint64_t reg, uint64_t val)
+{
+ int error;
+
+ __asm __volatile("vmwrite %[val], %[reg];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [val] "r" (val), [reg] "r" (reg)
+ : "memory");
+
+ return (error);
+}
+
+static __inline int
+vmread(uint64_t r, uint64_t *addr)
+{
+ int error;
+
+ __asm __volatile("vmread %[r], %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [r] "r" (r), [addr] "m" (*addr)
+ : "memory");
+
+ return (error);
+}
+
+static void __inline
+VMCLEAR(struct vmcs *vmcs)
+{
+ int err;
+
+ err = vmclear(vmcs);
+ if (err != 0)
+ panic("%s: vmclear(%p) error %d", __func__, vmcs, err);
+
+ critical_exit();
+}
+
+static void __inline
+VMPTRLD(struct vmcs *vmcs)
+{
+ int err;
+
+ critical_enter();
+
+ err = vmptrld(vmcs);
+ if (err != 0)
+ panic("%s: vmptrld(%p) error %d", __func__, vmcs, err);
+}
+
+#define INVVPID_TYPE_ADDRESS 0UL
+#define INVVPID_TYPE_SINGLE_CONTEXT 1UL
+#define INVVPID_TYPE_ALL_CONTEXTS 2UL
+
+struct invvpid_desc {
+ uint16_t vpid;
+ uint16_t _res1;
+ uint32_t _res2;
+ uint64_t linear_addr;
+};
+CTASSERT(sizeof(struct invvpid_desc) == 16);
+
+static void __inline
+invvpid(uint64_t type, struct invvpid_desc desc)
+{
+ int error;
+
+ __asm __volatile("invvpid %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invvpid error %d", error);
+}
+
+#define INVEPT_TYPE_SINGLE_CONTEXT 1UL
+#define INVEPT_TYPE_ALL_CONTEXTS 2UL
+struct invept_desc {
+ uint64_t eptp;
+ uint64_t _res;
+};
+CTASSERT(sizeof(struct invept_desc) == 16);
+
+static void __inline
+invept(uint64_t type, struct invept_desc desc)
+{
+ int error;
+
+ __asm __volatile("invept %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invept error %d", error);
+}
+#endif
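
The invvpid() wrapper above is typically used when stale guest-linear
translations tagged with a vcpu's VPID must be discarded, for example when a
vcpu migrates between host cpus. A minimal sketch, assuming the vpid comes
from 'struct vmxstate' in vmx.h:

/* Sketch: flush all cached mappings tagged with one VPID. */
static void
example_flush_vpid(uint16_t vpid)
{
	struct invvpid_desc desc;

	desc.vpid = vpid;
	desc._res1 = 0;
	desc._res2 = 0;
	desc.linear_addr = 0;	/* ignored for single-context invalidation */
	invvpid(INVVPID_TYPE_SINGLE_CONTEXT, desc);
}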
diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c
new file mode 100644
index 0000000..823a05d
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_genassym.c
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/assym.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmx.h"
+#include "vmx_cpufunc.h"
+
+ASSYM(VMXCTX_TMPSTKTOP, offsetof(struct vmxctx, tmpstktop));
+ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi));
+ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi));
+ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx));
+ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx));
+ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8));
+ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9));
+ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax));
+ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx));
+ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp));
+ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10));
+ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11));
+ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12));
+ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13));
+ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14));
+ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15));
+ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2));
+
+ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15));
+ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14));
+ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13));
+ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12));
+ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp));
+ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp));
+ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
+ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
+
+ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));
+
+ASSYM(VM_SUCCESS, VM_SUCCESS);
+ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
+ASSYM(VM_FAIL_VALID, VM_FAIL_VALID);
+
+ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT);
+ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
+ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
+ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);
+ASSYM(VMX_RETURN_AST, VMX_RETURN_AST);
+
+ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
+ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
+ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
+ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
new file mode 100644
index 0000000..2aba63c
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -0,0 +1,172 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/cpufunc.h>
+
+#include "vmx_msr.h"
+
+static boolean_t
+vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
+{
+
+ if (msr_val & (1UL << (bitpos + 32)))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static boolean_t
+vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
+{
+
+ if ((msr_val & (1UL << bitpos)) == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+uint32_t
+vmx_revision(void)
+{
+
+ return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
+}
+
+/*
+ * Generate a bitmask to be used for the VMCS execution control fields.
+ *
+ * The caller specifies what bits should be set to one in 'ones_mask'
+ * and what bits should be set to zero in 'zeros_mask'. The don't-care
+ * bits are set to the default value. The default values are obtained
+ * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
+ * VMX Capabilities".
+ *
+ * Returns zero on success and non-zero on error.
+ */
+int
+vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval)
+{
+ int i;
+ uint64_t val, trueval;
+ boolean_t true_ctls_avail, one_allowed, zero_allowed;
+
+ /* We cannot ask the same bit to be set to both '1' and '0' */
+ if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
+ return (EINVAL);
+
+ if (rdmsr(MSR_VMX_BASIC) & (1UL << 55))
+ true_ctls_avail = TRUE;
+ else
+ true_ctls_avail = FALSE;
+
+ val = rdmsr(ctl_reg);
+ if (true_ctls_avail)
+ trueval = rdmsr(true_ctl_reg); /* step c */
+ else
+ trueval = val; /* step a */
+
+ for (i = 0; i < 32; i++) {
+ one_allowed = vmx_ctl_allows_one_setting(trueval, i);
+ zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
+
+ KASSERT(one_allowed || zero_allowed,
+ ("invalid zero/one setting for bit %d of ctl 0x%0x, "
+ "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
+
+ if (zero_allowed && !one_allowed) { /* b(i),c(i) */
+ if (ones_mask & (1 << i))
+ return (EINVAL);
+ *retval &= ~(1 << i);
+ } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */
+ if (zeros_mask & (1 << i))
+ return (EINVAL);
+ *retval |= 1 << i;
+ } else {
+ if (zeros_mask & (1 << i)) /* b(ii),c(ii) */
+ *retval &= ~(1 << i);
+ else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
+ *retval |= 1 << i;
+ else if (!true_ctls_avail)
+ *retval &= ~(1 << i); /* b(iii) */
+ else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
+ *retval &= ~(1 << i);
+ else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
+ *retval |= 1 << i;
+ else {
+ panic("vmx_set_ctlreg: unable to determine "
+ "correct value of ctl bit %d for msr "
+ "0x%0x and true msr 0x%0x", i, ctl_reg,
+ true_ctl_reg);
+ }
+ }
+ }
+
+ return (0);
+}
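
A sketch of a typical request, as made from vmx_init(): bits in 'ones_mask'
must be settable to 1 and bits in 'zeros_mask' settable to 0 or the call
fails with EINVAL, while every other bit keeps its capability-MSR default.
The particular control bits chosen here are illustrative:

/* Sketch: build the primary processor-based controls. */
static int
example_procbased_ctls(uint32_t *ctls)
{

	return (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS,
	    PROCBASED_HLT_EXITING | PROCBASED_MSR_BITMAPS,	/* must be 1 */
	    PROCBASED_CR8_LOAD_EXITING,				/* must be 0 */
	    ctls));
}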
+
+void
+msr_bitmap_initialize(char *bitmap)
+{
+
+ memset(bitmap, 0xff, PAGE_SIZE);
+}
+
+int
+msr_bitmap_change_access(char *bitmap, u_int msr, int access)
+{
+ int byte, bit;
+
+ if (msr <= 0x00001FFF)
+ byte = msr / 8;
+ else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
+ byte = 1024 + (msr - 0xC0000000) / 8;
+ else
+ return (EINVAL);
+
+ bit = msr & 0x7;
+
+ if (access & MSR_BITMAP_ACCESS_READ)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ byte += 2048;
+ if (access & MSR_BITMAP_ACCESS_WRITE)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ return (0);
+}
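
The bitmap layout implied above: bytes 0-1023 are the read bitmap for MSRs
0x0-0x1FFF, bytes 1024-2047 the read bitmap for 0xC0000000-0xC0001FFF, and
the corresponding write bitmaps sit 2048 bytes later; a set bit forces a VM
exit, a clear bit grants the guest direct access. A worked sketch (the MSR
chosen is illustrative):

/* Sketch: let the guest access IA32_GS_BASE (0xC0000101) directly. */
static void
example_allow_gsbase(char *bitmap)
{

	msr_bitmap_initialize(bitmap);	/* start with every access exiting */
	/* Clears bit 1 of byte 1056 (reads) and of byte 3104 (writes). */
	(void)msr_bitmap_change_access(bitmap, 0xC0000101,
	    MSR_BITMAP_ACCESS_RW);
}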
diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h
new file mode 100644
index 0000000..e6379a9
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.h
@@ -0,0 +1,78 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_MSR_H_
+#define _VMX_MSR_H_
+
+#define MSR_VMX_BASIC 0x480
+#define MSR_VMX_EPT_VPID_CAP 0x48C
+
+#define MSR_VMX_PROCBASED_CTLS 0x482
+#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E
+
+#define MSR_VMX_PINBASED_CTLS 0x481
+#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D
+
+#define MSR_VMX_PROCBASED_CTLS2 0x48B
+
+#define MSR_VMX_EXIT_CTLS 0x483
+#define	MSR_VMX_TRUE_EXIT_CTLS	0x48F
+
+#define MSR_VMX_ENTRY_CTLS 0x484
+#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
+
+#define MSR_VMX_CR0_FIXED0 0x486
+#define MSR_VMX_CR0_FIXED1 0x487
+
+#define MSR_VMX_CR4_FIXED0 0x488
+#define MSR_VMX_CR4_FIXED1 0x489
+
+uint32_t vmx_revision(void);
+
+int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval);
+
+/*
+ * According to Section 21.10.4 "Software Access to Related Structures",
+ * changes to data structures pointed to by the VMCS must be made only when
+ * there is no logical processor with a current VMCS that points to the
+ * data structure.
+ *
+ * This pretty much limits us to configuring the MSR bitmap before VMCS
+ * initialization for SMP VMs, unless we do it the hard way - which would
+ * involve some form of synchronization between the vcpus to vmclear
+ * all VMCSes that point to the bitmap.
+ */
+#define MSR_BITMAP_ACCESS_NONE 0x0
+#define MSR_BITMAP_ACCESS_READ 0x1
+#define MSR_BITMAP_ACCESS_WRITE 0x2
+#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE)
+void msr_bitmap_initialize(char *bitmap);
+int msr_bitmap_change_access(char *bitmap, u_int msr, int access);
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
new file mode 100644
index 0000000..4ba582a
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -0,0 +1,246 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+
+#include "vmx_assym.s"
+
+/*
+ * Disable interrupts before updating %rsp in VMX_CHECK_AST or
+ * VMX_GUEST_RESTORE.
+ *
+ * The location that %rsp points to is a 'vmxctx' and not a
+ * real stack so we don't want an interrupt handler to trash it
+ */
+#define VMX_DISABLE_INTERRUPTS cli
+
+/*
+ * If the thread hosting the vcpu has an ast pending then take care of it
+ * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST.
+ *
+ * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts
+ * are disabled.
+ */
+#define VMX_CHECK_AST \
+ movq PCPU(CURTHREAD),%rax; \
+ testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \
+ je 9f; \
+ movq $VMX_RETURN_AST,%rsi; \
+ movq %rdi,%rsp; \
+ addq $VMXCTX_TMPSTKTOP,%rsp; \
+ callq vmx_return; \
+9:
+
+/*
+ * Assumes that %rdi holds a pointer to the 'vmxctx'.
+ *
+ * On "return" all registers are updated to reflect guest state. The two
+ * exceptions are %rip and %rsp. These registers are atomically switched
+ * by hardware from the guest area of the vmcs.
+ *
+ * We modify %rsp to point to the 'vmxctx' so we can use it to restore
+ * host context in case of an error with 'vmlaunch' or 'vmresume'.
+ */
+#define VMX_GUEST_RESTORE \
+ movq %rdi,%rsp; \
+ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \
+ movq %rsi,%cr2; \
+ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \
+ movq VMXCTX_GUEST_RDX(%rdi),%rdx; \
+ movq VMXCTX_GUEST_RCX(%rdi),%rcx; \
+ movq VMXCTX_GUEST_R8(%rdi),%r8; \
+ movq VMXCTX_GUEST_R9(%rdi),%r9; \
+ movq VMXCTX_GUEST_RAX(%rdi),%rax; \
+ movq VMXCTX_GUEST_RBX(%rdi),%rbx; \
+ movq VMXCTX_GUEST_RBP(%rdi),%rbp; \
+ movq VMXCTX_GUEST_R10(%rdi),%r10; \
+ movq VMXCTX_GUEST_R11(%rdi),%r11; \
+ movq VMXCTX_GUEST_R12(%rdi),%r12; \
+ movq VMXCTX_GUEST_R13(%rdi),%r13; \
+ movq VMXCTX_GUEST_R14(%rdi),%r14; \
+ movq VMXCTX_GUEST_R15(%rdi),%r15; \
+	movq	VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi last */
+
+#define VM_INSTRUCTION_ERROR(reg) \
+ jnc 1f; \
+ movl $VM_FAIL_INVALID,reg; /* CF is set */ \
+ jmp 3f; \
+1: jnz 2f; \
+ movl $VM_FAIL_VALID,reg; /* ZF is set */ \
+ jmp 3f; \
+2: movl $VM_SUCCESS,reg; \
+3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
+
+ .text
+/*
+ * int vmx_setjmp(ctxp)
+ * %rdi = ctxp
+ *
+ * Return value is '0' when it returns directly from here.
+ * Return value is '1' when it returns after a vm exit through vmx_longjmp.
+ */
+ENTRY(vmx_setjmp)
+ movq (%rsp),%rax /* return address */
+ movq %r15,VMXCTX_HOST_R15(%rdi)
+ movq %r14,VMXCTX_HOST_R14(%rdi)
+ movq %r13,VMXCTX_HOST_R13(%rdi)
+ movq %r12,VMXCTX_HOST_R12(%rdi)
+ movq %rbp,VMXCTX_HOST_RBP(%rdi)
+ movq %rsp,VMXCTX_HOST_RSP(%rdi)
+ movq %rbx,VMXCTX_HOST_RBX(%rdi)
+ movq %rax,VMXCTX_HOST_RIP(%rdi)
+
+ /*
+ * XXX save host debug registers
+ */
+ movl $VMX_RETURN_DIRECT,%eax
+ ret
+END(vmx_setjmp)
+
+/*
+ * void vmx_return(struct vmxctx *ctxp, int retval)
+ * %rdi = ctxp
+ * %rsi = retval
+ * Return to vmm context through vmx_setjmp() with a value of 'retval'.
+ */
+ENTRY(vmx_return)
+ /* Restore host context. */
+ movq VMXCTX_HOST_R15(%rdi),%r15
+ movq VMXCTX_HOST_R14(%rdi),%r14
+ movq VMXCTX_HOST_R13(%rdi),%r13
+ movq VMXCTX_HOST_R12(%rdi),%r12
+ movq VMXCTX_HOST_RBP(%rdi),%rbp
+ movq VMXCTX_HOST_RSP(%rdi),%rsp
+ movq VMXCTX_HOST_RBX(%rdi),%rbx
+ movq VMXCTX_HOST_RIP(%rdi),%rax
+ movq %rax,(%rsp) /* return address */
+
+ /*
+ * XXX restore host debug registers
+ */
+ movl %esi,%eax
+ ret
+END(vmx_return)
+
+/*
+ * void vmx_longjmp(void)
+ * %rsp points to the struct vmxctx
+ */
+ENTRY(vmx_longjmp)
+ /*
+ * Save guest state that is not automatically saved in the vmcs.
+ */
+ movq %rdi,VMXCTX_GUEST_RDI(%rsp)
+ movq %rsi,VMXCTX_GUEST_RSI(%rsp)
+ movq %rdx,VMXCTX_GUEST_RDX(%rsp)
+ movq %rcx,VMXCTX_GUEST_RCX(%rsp)
+ movq %r8,VMXCTX_GUEST_R8(%rsp)
+ movq %r9,VMXCTX_GUEST_R9(%rsp)
+ movq %rax,VMXCTX_GUEST_RAX(%rsp)
+ movq %rbx,VMXCTX_GUEST_RBX(%rsp)
+ movq %rbp,VMXCTX_GUEST_RBP(%rsp)
+ movq %r10,VMXCTX_GUEST_R10(%rsp)
+ movq %r11,VMXCTX_GUEST_R11(%rsp)
+ movq %r12,VMXCTX_GUEST_R12(%rsp)
+ movq %r13,VMXCTX_GUEST_R13(%rsp)
+ movq %r14,VMXCTX_GUEST_R14(%rsp)
+ movq %r15,VMXCTX_GUEST_R15(%rsp)
+
+ movq %cr2,%rdi
+ movq %rdi,VMXCTX_GUEST_CR2(%rsp)
+
+ movq %rsp,%rdi
+ movq $VMX_RETURN_LONGJMP,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_longjmp)
+
+/*
+ * void vmx_resume(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is a 'void' this function may return indirectly
+ * through vmx_setjmp() with a return value of 2.
+ */
+ENTRY(vmx_resume)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmresume
+
+ /*
+ * Capture the reason why vmresume failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMRESUME,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_resume)
+
+/*
+ * void vmx_launch(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is a 'void' this function may return indirectly
+ * through vmx_setjmp() with a return value of 3.
+ */
+ENTRY(vmx_launch)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmlaunch
+
+ /*
+ * Capture the reason why vmlaunch failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMLAUNCH,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_launch)
diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c
new file mode 100644
index 0000000..ef0e9bc
--- /dev/null
+++ b/sys/amd64/vmm/intel/vtd.c
@@ -0,0 +1,677 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+#include <machine/pci_cfgreg.h>
+
+#include "io/iommu.h"
+
+/*
+ * Documented in the "Intel Virtualization Technology for Directed I/O",
+ * Architecture Spec, September 2008.
+ */
+
+/* Section 10.4 "Register Descriptions" */
+struct vtdmap {
+ volatile uint32_t version;
+ volatile uint32_t res0;
+ volatile uint64_t cap;
+ volatile uint64_t ext_cap;
+ volatile uint32_t gcr;
+ volatile uint32_t gsr;
+ volatile uint64_t rta;
+ volatile uint64_t ccr;
+};
+
+#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F)
+#define VTD_CAP_ND(cap) ((cap) & 0x7)
+#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1)
+#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF)
+#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1)
+
+#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1)
+#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
+#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF)
+
+#define VTD_GCR_WBF (1 << 27)
+#define VTD_GCR_SRTP (1 << 30)
+#define VTD_GCR_TE (1 << 31)
+
+#define VTD_GSR_WBFS (1 << 27)
+#define VTD_GSR_RTPS (1 << 30)
+#define VTD_GSR_TES (1 << 31)
+
+#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */
+#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */
+
+#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */
+#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */
+#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */
+#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */
+#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */
+#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */
+#define VTD_IIR_DOMAIN_P 32
+
+#define VTD_ROOT_PRESENT 0x1
+#define VTD_CTX_PRESENT 0x1
+#define VTD_CTX_TT_ALL (1UL << 2)
+
+#define VTD_PTE_RD (1UL << 0)
+#define VTD_PTE_WR (1UL << 1)
+#define VTD_PTE_SUPERPAGE (1UL << 7)
+#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL)
+
+struct domain {
+ uint64_t *ptp; /* first level page table page */
+ int pt_levels; /* number of page table levels */
+ int addrwidth; /* 'AW' field in context entry */
+ int spsmask; /* supported super page sizes */
+ u_int id; /* domain id */
+ vm_paddr_t maxaddr; /* highest address to be mapped */
+ SLIST_ENTRY(domain) next;
+};
+
+static SLIST_HEAD(, domain) domhead;
+
+#define DRHD_MAX_UNITS 8
+static int drhd_num;
+static struct vtdmap *vtdmaps[DRHD_MAX_UNITS];
+static int max_domains;
+typedef int (*drhd_ident_func_t)(void);
+
+static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+
+static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
+
+/*
+ * Config space register definitions from the "Intel 5520 and 5500" datasheet.
+ */
+static int
+tylersburg_vtd_ident(void)
+{
+ int units, nlbus;
+ uint16_t did, vid;
+ uint32_t miscsts, vtbar;
+
+ const int bus = 0;
+ const int slot = 20;
+ const int func = 0;
+
+ units = 0;
+
+ vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2);
+ did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2);
+ if (vid != 0x8086 || did != 0x342E)
+ goto done;
+
+ /*
+ * Check if this is a dual IOH configuration.
+ */
+ miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4);
+ if (miscsts & (1 << 25))
+ nlbus = pci_cfgregread(bus, slot, func, 0x160, 1);
+ else
+ nlbus = -1;
+
+ vtbar = pci_cfgregread(bus, slot, func, 0x180, 4);
+ if (vtbar & 0x1) {
+ vtdmaps[units++] = (struct vtdmap *)
+ PHYS_TO_DMAP(vtbar & 0xffffe000);
+ } else if (bootverbose)
+ printf("VT-d unit in legacy IOH is disabled!\n");
+
+ if (nlbus != -1) {
+ vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4);
+ if (vtbar & 0x1) {
+ vtdmaps[units++] = (struct vtdmap *)
+ PHYS_TO_DMAP(vtbar & 0xffffe000);
+ } else if (bootverbose)
+ printf("VT-d unit in non-legacy IOH is disabled!\n");
+ }
+done:
+ return (units);
+}
+
+static drhd_ident_func_t drhd_ident_funcs[] = {
+ tylersburg_vtd_ident,
+ NULL
+};
+
+static int
+vtd_max_domains(struct vtdmap *vtdmap)
+{
+ int nd;
+
+ nd = VTD_CAP_ND(vtdmap->cap);
+
+ switch (nd) {
+ case 0:
+ return (16);
+ case 1:
+ return (64);
+ case 2:
+ return (256);
+ case 3:
+ return (1024);
+ case 4:
+ return (4 * 1024);
+ case 5:
+ return (16 * 1024);
+ case 6:
+ return (64 * 1024);
+ default:
+ panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
+ }
+}
+
+static u_int
+domain_id(void)
+{
+ u_int id;
+ struct domain *dom;
+
+ /* Skip domain id 0 - it is reserved when Caching Mode field is set */
+ for (id = 1; id < max_domains; id++) {
+ SLIST_FOREACH(dom, &domhead, next) {
+ if (dom->id == id)
+ break;
+ }
+ if (dom == NULL)
+ break; /* found it */
+ }
+
+ if (id >= max_domains)
+ panic("domain ids exhausted");
+
+ return (id);
+}
+
+static void
+vtd_wbflush(struct vtdmap *vtdmap)
+{
+
+ if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
+ pmap_invalidate_cache();
+
+ if (VTD_CAP_RWBF(vtdmap->cap)) {
+ vtdmap->gcr = VTD_GCR_WBF;
+ while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
+ ;
+ }
+}
+
+static void
+vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
+{
+
+ vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
+ while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
+ ;
+}
+
+static void
+vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
+{
+ int offset;
+ volatile uint64_t *iotlb_reg, val;
+
+ vtd_wbflush(vtdmap);
+
+ offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
+ iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
+
+ *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
+ VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
+
+ while (1) {
+ val = *iotlb_reg;
+ if ((val & VTD_IIR_IVT) == 0)
+ break;
+ }
+}
+
+static void
+vtd_translation_enable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = VTD_GCR_TE;
+ while ((vtdmap->gsr & VTD_GSR_TES) == 0)
+ ;
+}
+
+static void
+vtd_translation_disable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = 0;
+ while ((vtdmap->gsr & VTD_GSR_TES) != 0)
+ ;
+}
+
+static int
+vtd_init(void)
+{
+ int i, units;
+ struct vtdmap *vtdmap;
+ vm_paddr_t ctx_paddr;
+
+ for (i = 0; drhd_ident_funcs[i] != NULL; i++) {
+ units = (*drhd_ident_funcs[i])();
+ if (units > 0)
+ break;
+ }
+
+ if (units <= 0)
+ return (ENXIO);
+
+ drhd_num = units;
+ vtdmap = vtdmaps[0];
+
+ if (VTD_CAP_CM(vtdmap->cap) != 0)
+ panic("vtd_init: invalid caching mode");
+
+ max_domains = vtd_max_domains(vtdmap);
+
+ /*
+ * Set up the root-table to point to the context-entry tables
+ */
+ for (i = 0; i < 256; i++) {
+ ctx_paddr = vtophys(ctx_tables[i]);
+ if (ctx_paddr & PAGE_MASK)
+ panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
+
+ root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
+ }
+
+ return (0);
+}
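
The 'i * 2' indexing above reflects the fact that each root-entry is 128
bits wide even though the table is declared as an array of uint64_t. An
illustration of the shape being built (the driver deliberately manipulates
raw quadwords; this struct and its field names are not from the source):

/* Illustration only: one 128-bit VT-d root-entry. */
struct vtd_root_entry {
	uint64_t	low;	/* context-table paddr | VTD_ROOT_PRESENT */
	uint64_t	high;	/* reserved */
};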
+
+static void
+vtd_cleanup(void)
+{
+}
+
+static void
+vtd_enable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_wbflush(vtdmap);
+
+ /* Update the root table address */
+ vtdmap->rta = vtophys(root_table);
+ vtdmap->gcr = VTD_GCR_SRTP;
+ while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
+ ;
+
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+
+ vtd_translation_enable(vtdmap);
+ }
+}
+
+static void
+vtd_disable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_translation_disable(vtdmap);
+ }
+}
+
+static void
+vtd_add_device(void *arg, int bus, int slot, int func)
+{
+ int idx;
+ uint64_t *ctxp;
+ struct domain *dom = arg;
+ vm_paddr_t pt_paddr;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+ panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ vtdmap = vtdmaps[0];
+ ctxp = ctx_tables[bus];
+ pt_paddr = vtophys(dom->ptp);
+ idx = (slot << 3 | func) * 2;
+
+ if (ctxp[idx] & VTD_CTX_PRESENT) {
+ panic("vtd_add_device: device %d/%d/%d is already owned by "
+ "domain %d", bus, slot, func,
+ (uint16_t)(ctxp[idx + 1] >> 8));
+ }
+
+ /*
+ * Order is important. The 'present' bit is set only after all fields
+	 * of the context entry are initialized.
+ */
+ ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
+
+ if (VTD_ECAP_DI(vtdmap->ext_cap))
+ ctxp[idx] = VTD_CTX_TT_ALL;
+ else
+ ctxp[idx] = 0;
+
+ ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
+
+ /*
+ * 'Not Present' entries are not cached in either the Context Cache
+ * or in the IOTLB, so there is no need to invalidate either of them.
+ */
+}
+
+static void
+vtd_remove_device(void *arg, int bus, int slot, int func)
+{
+ int i, idx;
+ uint64_t *ctxp;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+		panic("vtd_remove_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ ctxp = ctx_tables[bus];
+ idx = (slot << 3 | func) * 2;
+
+ /*
+	 * Order is important. The 'present' bit must be cleared first.
+ */
+ ctxp[idx] = 0;
+ ctxp[idx + 1] = 0;
+
+ /*
+ * Invalidate the Context Cache and the IOTLB.
+ *
+ * XXX use device-selective invalidation for Context Cache
+ * XXX use domain-selective invalidation for IOTLB
+ */
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+ }
+}
+
+#define CREATE_MAPPING 0
+#define REMOVE_MAPPING 1
+
+static uint64_t
+vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
+ int remove)
+{
+ struct domain *dom;
+ int i, spshift, ptpshift, ptpindex, nlevels;
+ uint64_t spsize, *ptp;
+
+ dom = arg;
+ ptpindex = 0;
+ ptpshift = 0;
+
+	if (gpa & PAGE_MASK)
+		panic("vtd_update_mapping: unaligned gpa 0x%0lx", gpa);
+
+	if (hpa & PAGE_MASK)
+		panic("vtd_update_mapping: unaligned hpa 0x%0lx", hpa);
+
+	if (len & PAGE_MASK)
+		panic("vtd_update_mapping: unaligned len 0x%0lx", len);
+
+ /*
+	 * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - supported super page size
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'len'
+ */
+ spshift = 48;
+ for (i = 3; i >= 0; i--) {
+ spsize = 1UL << spshift;
+ if ((dom->spsmask & (1 << i)) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ (len >= spsize)) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ ptp = dom->ptp;
+ nlevels = dom->pt_levels;
+ while (--nlevels >= 0) {
+ ptpshift = 12 + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift) {
+ break;
+ }
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create a downstream page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
+			ptp[ptpindex] = vtophys(nlp) | VTD_PTE_RD | VTD_PTE_WR;
+ }
+
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0)
+ panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);
+
+ /*
+ * Update the 'gpa' -> 'hpa' mapping
+ */
+ if (remove) {
+ ptp[ptpindex] = 0;
+ } else {
+ ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;
+
+ if (nlevels > 0)
+ ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
+ }
+
+ return (1UL << ptpshift);
+}
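
As a worked example of the sizing logic: with 1GB superpages available (bit
1 of 'spsmask'), gpa = hpa = 0x100000000 and len = 1GB, the loop settles on
spshift = 30. On a 3-level page table the very first step of the walk has
ptpshift = 30, so the walk stops immediately and a single leaf entry with
VTD_PTE_SUPERPAGE maps the entire gigabyte; the function returns 1UL << 30
and the caller (see iommu_create_mapping() below) advances gpa, hpa and the
remaining length by that amount.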
+
+static uint64_t
+vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
+{
+
+ return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
+}
+
+static uint64_t
+vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
+{
+
+ return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
+}
+
+static void
+vtd_invalidate_tlb(void *dom)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ /*
+ * Invalidate the IOTLB.
+ * XXX use domain-selective invalidation for IOTLB
+ */
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_iotlb_global_invalidate(vtdmap);
+ }
+}
+
+static void *
+vtd_create_domain(vm_paddr_t maxaddr)
+{
+ struct domain *dom;
+ vm_paddr_t addr;
+ int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
+ struct vtdmap *vtdmap;
+
+ if (drhd_num <= 0)
+ panic("vtd_create_domain: no dma remapping hardware available");
+
+ vtdmap = vtdmaps[0];
+
+ /*
+ * Calculate AGAW.
+ * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
+ */
+ addr = 0;
+ for (gaw = 0; addr < maxaddr; gaw++)
+ addr = 1ULL << gaw;
+
+ res = (gaw - 12) % 9;
+ if (res == 0)
+ agaw = gaw;
+ else
+ agaw = gaw + 9 - res;
+
+ if (agaw > 64)
+ agaw = 64;
+
+ /*
+ * Select the smallest Supported AGAW and the corresponding number
+ * of page table levels.
+ */
+ pt_levels = 2;
+ sagaw = 30;
+ addrwidth = 0;
+ tmp = VTD_CAP_SAGAW(vtdmap->cap);
+ for (i = 0; i < 5; i++) {
+ if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
+ break;
+ pt_levels++;
+ addrwidth++;
+ sagaw += 9;
+ if (sagaw > 64)
+ sagaw = 64;
+ }
+
+ if (i >= 5) {
+ panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d",
+ VTD_CAP_SAGAW(vtdmap->cap), agaw);
+ }
+
+ dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
+ dom->pt_levels = pt_levels;
+ dom->addrwidth = addrwidth;
+ dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
+ dom->id = domain_id();
+ dom->maxaddr = maxaddr;
+ dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
+ if ((uintptr_t)dom->ptp & PAGE_MASK)
+ panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
+
+ SLIST_INSERT_HEAD(&domhead, dom, next);
+
+ return (dom);
+}
+
+static void
+vtd_free_ptp(uint64_t *ptp, int level)
+{
+ int i;
+ uint64_t *nlp;
+
+ if (level > 1) {
+ for (i = 0; i < 512; i++) {
+ if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
+ continue;
+ if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
+ continue;
+ nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
+ vtd_free_ptp(nlp, level - 1);
+ }
+ }
+
+ bzero(ptp, PAGE_SIZE);
+ free(ptp, M_VTD);
+}
+
+static void
+vtd_destroy_domain(void *arg)
+{
+ struct domain *dom;
+
+ dom = arg;
+
+ SLIST_REMOVE(&domhead, dom, domain, next);
+ vtd_free_ptp(dom->ptp, dom->pt_levels);
+ free(dom, M_VTD);
+}
+
+struct iommu_ops iommu_ops_intel = {
+ vtd_init,
+ vtd_cleanup,
+ vtd_enable,
+ vtd_disable,
+ vtd_create_domain,
+ vtd_destroy_domain,
+ vtd_create_mapping,
+ vtd_remove_mapping,
+ vtd_add_device,
+ vtd_remove_device,
+ vtd_invalidate_tlb,
+};
diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c
new file mode 100644
index 0000000..c8447cc
--- /dev/null
+++ b/sys/amd64/vmm/io/iommu.c
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/md_var.h>
+
+#include "vmm_util.h"
+#include "vmm_mem.h"
+#include "iommu.h"
+
+static boolean_t iommu_avail;
+static struct iommu_ops *ops;
+static void *host_domain;
+
+static __inline int
+IOMMU_INIT(void)
+{
+ if (ops != NULL)
+ return ((*ops->init)());
+ else
+ return (ENXIO);
+}
+
+static __inline void
+IOMMU_CLEANUP(void)
+{
+ if (ops != NULL && iommu_avail)
+ (*ops->cleanup)();
+}
+
+static __inline void *
+IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->create_domain)(maxaddr));
+ else
+ return (NULL);
+}
+
+static __inline void
+IOMMU_DESTROY_DOMAIN(void *dom)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->destroy_domain)(dom);
+}
+
+static __inline uint64_t
+IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->create_mapping)(domain, gpa, hpa, len));
+ else
+ return (len); /* XXX */
+}
+
+static __inline uint64_t
+IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->remove_mapping)(domain, gpa, len));
+ else
+ return (len); /* XXX */
+}
+
+static __inline void
+IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->add_device)(domain, bus, slot, func);
+}
+
+static __inline void
+IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->remove_device)(domain, bus, slot, func);
+}
+
+static __inline void
+IOMMU_INVALIDATE_TLB(void *domain)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->invalidate_tlb)(domain);
+}
+
+static __inline void
+IOMMU_ENABLE(void)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->enable)();
+}
+
+static __inline void
+IOMMU_DISABLE(void)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->disable)();
+}
+
+void
+iommu_init(void)
+{
+ int error, bus, slot, func;
+ vm_paddr_t maxaddr;
+ const char *name;
+ device_t dev;
+
+ if (vmm_is_intel())
+ ops = &iommu_ops_intel;
+ else if (vmm_is_amd())
+ ops = &iommu_ops_amd;
+ else
+ ops = NULL;
+
+ error = IOMMU_INIT();
+ if (error)
+ return;
+
+ iommu_avail = TRUE;
+
+ /*
+ * Create a domain for the devices owned by the host
+ */
+ maxaddr = vmm_mem_maxaddr();
+ host_domain = IOMMU_CREATE_DOMAIN(maxaddr);
+ if (host_domain == NULL)
+ panic("iommu_init: unable to create a host domain");
+
+ /*
+ * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to
+ * the host
+ */
+ iommu_create_mapping(host_domain, 0, 0, maxaddr);
+
+ for (bus = 0; bus <= PCI_BUSMAX; bus++) {
+ for (slot = 0; slot <= PCI_SLOTMAX; slot++) {
+ for (func = 0; func <= PCI_FUNCMAX; func++) {
+ dev = pci_find_dbsf(0, bus, slot, func);
+ if (dev == NULL)
+ continue;
+
+ /* skip passthrough devices */
+ name = device_get_name(dev);
+ if (name != NULL && strcmp(name, "ppt") == 0)
+ continue;
+
+ /* everything else belongs to the host domain */
+ iommu_add_device(host_domain, bus, slot, func);
+ }
+ }
+ }
+	IOMMU_ENABLE();
+}
+
+void
+iommu_cleanup(void)
+{
+ IOMMU_DISABLE();
+ IOMMU_DESTROY_DOMAIN(host_domain);
+ IOMMU_CLEANUP();
+}
+
+void *
+iommu_create_domain(vm_paddr_t maxaddr)
+{
+
+ return (IOMMU_CREATE_DOMAIN(maxaddr));
+}
+
+void
+iommu_destroy_domain(void *dom)
+{
+
+ IOMMU_DESTROY_DOMAIN(dom);
+}
+
+void
+iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len)
+{
+ uint64_t mapped, remaining;
+
+ remaining = len;
+
+ while (remaining > 0) {
+ mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining);
+ gpa += mapped;
+ hpa += mapped;
+ remaining -= mapped;
+ }
+}
+
+void
+iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len)
+{
+ uint64_t unmapped, remaining;
+
+ remaining = len;
+
+ while (remaining > 0) {
+ unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining);
+ gpa += unmapped;
+ remaining -= unmapped;
+ }
+}
+
+void *
+iommu_host_domain(void)
+{
+
+ return (host_domain);
+}
+
+void
+iommu_add_device(void *dom, int bus, int slot, int func)
+{
+
+ IOMMU_ADD_DEVICE(dom, bus, slot, func);
+}
+
+void
+iommu_remove_device(void *dom, int bus, int slot, int func)
+{
+
+ IOMMU_REMOVE_DEVICE(dom, bus, slot, func);
+}
+
+void
+iommu_invalidate_tlb(void *domain)
+{
+
+ IOMMU_INVALIDATE_TLB(domain);
+}
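
Taken together, the wrappers above give consumers a backend-neutral
sequence for wiring a passthrough device to a guest. A hedged sketch with
illustrative addresses and an identity gpa->hpa mapping (the real callers,
with proper error handling, are in vmm.c and ppt.c):

/* Sketch: create a guest IOMMU domain and attach device 5/0/0 to it. */
static void *
example_setup_guest_domain(void)
{
	void *dom;

	dom = iommu_create_domain(1UL << 30);		/* maxaddr = 1GB */
	if (dom == NULL)
		return (NULL);
	iommu_create_mapping(dom, 0, 0, 1UL << 30);	/* gpa == hpa here */
	iommu_add_device(dom, 5, 0, 0);			/* bus/slot/func */
	return (dom);
}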
diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h
new file mode 100644
index 0000000..d5c1d6e
--- /dev/null
+++ b/sys/amd64/vmm/io/iommu.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IO_IOMMU_H_
+#define _IO_IOMMU_H_
+
+typedef int (*iommu_init_func_t)(void);
+typedef void (*iommu_cleanup_func_t)(void);
+typedef void (*iommu_enable_func_t)(void);
+typedef void (*iommu_disable_func_t)(void);
+typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr);
+typedef void (*iommu_destroy_domain_t)(void *domain);
+typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa,
+ vm_paddr_t hpa, uint64_t len);
+typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa,
+ uint64_t len);
+typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func);
+typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func);
+typedef void (*iommu_invalidate_tlb_t)(void *dom);
+
+struct iommu_ops {
+ iommu_init_func_t init; /* module wide */
+ iommu_cleanup_func_t cleanup;
+ iommu_enable_func_t enable;
+ iommu_disable_func_t disable;
+
+ iommu_create_domain_t create_domain; /* domain-specific */
+ iommu_destroy_domain_t destroy_domain;
+ iommu_create_mapping_t create_mapping;
+ iommu_remove_mapping_t remove_mapping;
+ iommu_add_device_t add_device;
+ iommu_remove_device_t remove_device;
+ iommu_invalidate_tlb_t invalidate_tlb;
+};
+
+extern struct iommu_ops iommu_ops_intel;
+extern struct iommu_ops iommu_ops_amd;
+
+void iommu_init(void);
+void iommu_cleanup(void);
+void *iommu_host_domain(void);
+void *iommu_create_domain(vm_paddr_t maxaddr);
+void iommu_destroy_domain(void *dom);
+void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa,
+ size_t len);
+void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len);
+void iommu_add_device(void *dom, int bus, int slot, int func);
+void iommu_remove_device(void *dom, int bus, int slot, int func);
+void iommu_invalidate_tlb(void *domain);
+#endif
diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c
new file mode 100644
index 0000000..fdf136b
--- /dev/null
+++ b/sys/amd64/vmm/io/ppt.c
@@ -0,0 +1,610 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/bus.h>
+#include <sys/pciio.h>
+#include <sys/rman.h>
+#include <sys/smp.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/resource.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+
+#include "iommu.h"
+#include "ppt.h"
+
+/* XXX locking */
+
+#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0]))
+#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1)
+#define MAX_MSIMSGS 32
+
+MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
+
+struct pptintr_arg { /* pptintr(pptintr_arg) */
+ struct pptdev *pptdev;
+ int vec;
+ int vcpu;
+};
+
+static struct pptdev {
+ device_t dev;
+ struct vm *vm; /* owner of this device */
+ struct vm_memory_segment mmio[MAX_MMIOSEGS];
+ struct {
+ int num_msgs; /* guest state */
+
+ int startrid; /* host state */
+ struct resource *res[MAX_MSIMSGS];
+ void *cookie[MAX_MSIMSGS];
+ struct pptintr_arg arg[MAX_MSIMSGS];
+ } msi;
+
+ struct {
+ int num_msgs;
+ int startrid;
+ int msix_table_rid;
+ struct resource *msix_table_res;
+ struct resource **res;
+ void **cookie;
+ struct pptintr_arg *arg;
+ } msix;
+} pptdevs[32];
+
+static int num_pptdevs;
+
+static int
+ppt_probe(device_t dev)
+{
+ int bus, slot, func;
+ struct pci_devinfo *dinfo;
+
+ dinfo = (struct pci_devinfo *)device_get_ivars(dev);
+
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+
+ /*
+ * To qualify as a pci passthrough device a device must:
+ * - be allowed by administrator to be used in this role
+ * - be an endpoint device
+ */
+ if (vmm_is_pptdev(bus, slot, func) &&
+ (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL)
+ return (0);
+ else
+ return (ENXIO);
+}
+
+static int
+ppt_attach(device_t dev)
+{
+ int n;
+
+ if (num_pptdevs >= MAX_PPTDEVS) {
+ printf("ppt_attach: maximum number of pci passthrough devices "
+ "exceeded\n");
+ return (ENXIO);
+ }
+
+ n = num_pptdevs++;
+ pptdevs[n].dev = dev;
+
+ if (bootverbose)
+ device_printf(dev, "attached\n");
+
+ return (0);
+}
+
+static int
+ppt_detach(device_t dev)
+{
+ /*
+ * XXX check whether there are any pci passthrough devices assigned
+ * to guests before we allow this driver to detach.
+ */
+
+ return (0);
+}
+
+static device_method_t ppt_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, ppt_probe),
+ DEVMETHOD(device_attach, ppt_attach),
+ DEVMETHOD(device_detach, ppt_detach),
+ {0, 0}
+};
+
+static devclass_t ppt_devclass;
+DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0);
+DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
+
+static struct pptdev *
+ppt_find(int bus, int slot, int func)
+{
+ device_t dev;
+ int i, b, s, f;
+
+ for (i = 0; i < num_pptdevs; i++) {
+ dev = pptdevs[i].dev;
+ b = pci_get_bus(dev);
+ s = pci_get_slot(dev);
+ f = pci_get_function(dev);
+ if (bus == b && slot == s && func == f)
+ return (&pptdevs[i]);
+ }
+ return (NULL);
+}
+
+static void
+ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
+{
+ int i;
+ struct vm_memory_segment *seg;
+
+ for (i = 0; i < MAX_MMIOSEGS; i++) {
+ seg = &ppt->mmio[i];
+ if (seg->len == 0)
+ continue;
+ (void)vm_unmap_mmio(vm, seg->gpa, seg->len);
+ bzero(seg, sizeof(struct vm_memory_segment));
+ }
+}
+
+static void
+ppt_teardown_msi(struct pptdev *ppt)
+{
+ int i, rid;
+ void *cookie;
+ struct resource *res;
+
+ if (ppt->msi.num_msgs == 0)
+ return;
+
+ for (i = 0; i < ppt->msi.num_msgs; i++) {
+ rid = ppt->msi.startrid + i;
+ res = ppt->msi.res[i];
+ cookie = ppt->msi.cookie[i];
+
+ if (cookie != NULL)
+ bus_teardown_intr(ppt->dev, res, cookie);
+
+ if (res != NULL)
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
+
+ ppt->msi.res[i] = NULL;
+ ppt->msi.cookie[i] = NULL;
+ }
+
+ if (ppt->msi.startrid == 1)
+ pci_release_msi(ppt->dev);
+
+ ppt->msi.num_msgs = 0;
+}
+
+static void
+ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
+{
+ int rid;
+ struct resource *res;
+ void *cookie;
+
+ rid = ppt->msix.startrid + idx;
+ res = ppt->msix.res[idx];
+ cookie = ppt->msix.cookie[idx];
+
+ if (cookie != NULL)
+ bus_teardown_intr(ppt->dev, res, cookie);
+
+ if (res != NULL)
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
+
+ ppt->msix.res[idx] = NULL;
+ ppt->msix.cookie[idx] = NULL;
+}
+
+static void
+ppt_teardown_msix(struct pptdev *ppt)
+{
+ int i;
+
+ if (ppt->msix.num_msgs == 0)
+ return;
+
+ for (i = 0; i < ppt->msix.num_msgs; i++)
+ ppt_teardown_msix_intr(ppt, i);
+
+ if (ppt->msix.msix_table_res) {
+ bus_release_resource(ppt->dev, SYS_RES_MEMORY,
+ ppt->msix.msix_table_rid,
+ ppt->msix.msix_table_res);
+ ppt->msix.msix_table_res = NULL;
+ ppt->msix.msix_table_rid = 0;
+ }
+
+ free(ppt->msix.res, M_PPTMSIX);
+ free(ppt->msix.cookie, M_PPTMSIX);
+ free(ppt->msix.arg, M_PPTMSIX);
+
+ pci_release_msi(ppt->dev);
+
+ ppt->msix.num_msgs = 0;
+}
+
+int
+ppt_assign_device(struct vm *vm, int bus, int slot, int func)
+{
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ /*
+ * If this device is owned by a different VM then we
+ * cannot change its owner.
+ */
+ if (ppt->vm != NULL && ppt->vm != vm)
+ return (EBUSY);
+
+ ppt->vm = vm;
+ iommu_add_device(vm_iommu_domain(vm), bus, slot, func);
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
+{
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ /*
+ * If this device is not owned by this 'vm' then bail out.
+ */
+ if (ppt->vm != vm)
+ return (EBUSY);
+ ppt_unmap_mmio(vm, ppt);
+ ppt_teardown_msi(ppt);
+ ppt_teardown_msix(ppt);
+ iommu_remove_device(vm_iommu_domain(vm), bus, slot, func);
+ ppt->vm = NULL;
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+ppt_unassign_all(struct vm *vm)
+{
+ int i, bus, slot, func;
+ device_t dev;
+
+ for (i = 0; i < num_pptdevs; i++) {
+ if (pptdevs[i].vm == vm) {
+ dev = pptdevs[i].dev;
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+ ppt_unassign_device(vm, bus, slot, func);
+ }
+ }
+
+ return (0);
+}
+
+int
+ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ int i, error;
+ struct vm_memory_segment *seg;
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ if (ppt->vm != vm)
+ return (EBUSY);
+
+ for (i = 0; i < MAX_MMIOSEGS; i++) {
+ seg = &ppt->mmio[i];
+ if (seg->len == 0) {
+ error = vm_map_mmio(vm, gpa, len, hpa);
+ if (error == 0) {
+ seg->gpa = gpa;
+ seg->len = len;
+ }
+ return (error);
+ }
+ }
+ return (ENOSPC);
+ }
+ return (ENOENT);
+}
+
+static int
+pptintr(void *arg)
+{
+ int vec;
+ struct pptdev *ppt;
+ struct pptintr_arg *pptarg;
+
+ pptarg = arg;
+ ppt = pptarg->pptdev;
+ vec = pptarg->vec;
+
+ if (ppt->vm != NULL)
+ (void) lapic_set_intr(ppt->vm, pptarg->vcpu, vec);
+ else {
+ /*
+ * XXX
+ * This is not expected to happen - panic?
+ */
+ }
+
+ /*
+ * For legacy interrupts give other filters a chance in case
+ * the interrupt was not generated by the passthrough device.
+ */
+ if (ppt->msi.startrid == 0)
+ return (FILTER_STRAY);
+ else
+ return (FILTER_HANDLED);
+}
+
+/*
+ * XXX
+ * When we try to free the MSI resource the kernel will bind the thread to
+ * the host cpu that was originally handling the MSI. The function freeing the
+ * MSI vector (apic_free_vector()) will panic the kernel if the thread
+ * is already bound to a cpu.
+ *
+ * So, we temporarily unbind the vcpu thread before freeing the MSI resource.
+ */
+static void
+PPT_TEARDOWN_MSI(struct vm *vm, int vcpu, struct pptdev *ppt)
+{
+ int pincpu = -1;
+
+ vm_get_pinning(vm, vcpu, &pincpu);
+
+ if (pincpu >= 0)
+ vm_set_pinning(vm, vcpu, -1);
+
+ ppt_teardown_msi(ppt);
+
+ if (pincpu >= 0)
+ vm_set_pinning(vm, vcpu, pincpu);
+}
+
+int
+ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec)
+{
+ int i, rid, flags;
+ int msi_count, startrid, error, tmp;
+ struct pptdev *ppt;
+
+ if ((destcpu >= VM_MAXCPU || destcpu < 0) ||
+ (vector < 0 || vector > 255) ||
+ (numvec < 0 || numvec > MAX_MSIMSGS))
+ return (EINVAL);
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt == NULL)
+ return (ENOENT);
+ if (ppt->vm != vm) /* Make sure we own this device */
+ return (EBUSY);
+
+ /* Free any allocated resources */
+ PPT_TEARDOWN_MSI(vm, vcpu, ppt);
+
+ if (numvec == 0) /* nothing more to do */
+ return (0);
+
+ flags = RF_ACTIVE;
+ msi_count = pci_msi_count(ppt->dev);
+ if (msi_count == 0) {
+ startrid = 0; /* legacy interrupt */
+ msi_count = 1;
+ flags |= RF_SHAREABLE;
+ } else
+ startrid = 1; /* MSI */
+
+ /*
+ * The device must be capable of supporting the number of vectors
+ * the guest wants to allocate.
+ */
+ if (numvec > msi_count)
+ return (EINVAL);
+
+ /*
+ * Make sure that we can allocate all the MSI vectors that are needed
+ * by the guest.
+ */
+ if (startrid == 1) {
+ tmp = numvec;
+ error = pci_alloc_msi(ppt->dev, &tmp);
+ if (error)
+ return (error);
+ else if (tmp != numvec) {
+ pci_release_msi(ppt->dev);
+ return (ENOSPC);
+ } else {
+ /* success */
+ }
+ }
+
+ ppt->msi.startrid = startrid;
+
+ /*
+ * Allocate the irq resource and attach it to the interrupt handler.
+ */
+ for (i = 0; i < numvec; i++) {
+ ppt->msi.num_msgs = i + 1;
+ ppt->msi.cookie[i] = NULL;
+
+ rid = startrid + i;
+ ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
+ &rid, flags);
+ if (ppt->msi.res[i] == NULL)
+ break;
+
+ ppt->msi.arg[i].pptdev = ppt;
+ ppt->msi.arg[i].vec = vector + i;
+ ppt->msi.arg[i].vcpu = destcpu;
+
+ error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
+ INTR_TYPE_NET | INTR_MPSAFE,
+ pptintr, NULL, &ppt->msi.arg[i],
+ &ppt->msi.cookie[i]);
+ if (error != 0)
+ break;
+ }
+
+ if (i < numvec) {
+ PPT_TEARDOWN_MSI(vm, vcpu, ppt);
+ return (ENXIO);
+ }
+
+ return (0);
+}
+
+int
+ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
+{
+ struct pptdev *ppt;
+ struct pci_devinfo *dinfo;
+ int numvec, alloced, rid, error;
+ size_t res_size, cookie_size, arg_size;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt == NULL)
+ return (ENOENT);
+ if (ppt->vm != vm) /* Make sure we own this device */
+ return (EBUSY);
+
+ dinfo = device_get_ivars(ppt->dev);
+ if (!dinfo)
+ return (ENXIO);
+
+ /*
+ * First-time configuration:
+ * Allocate the MSI-X table
+ * Allocate the IRQ resources
+ * Set up some variables in ppt->msix
+ */
+ if (ppt->msix.num_msgs == 0) {
+ numvec = pci_msix_count(ppt->dev);
+ if (numvec <= 0)
+ return (EINVAL);
+
+ ppt->msix.startrid = 1;
+ ppt->msix.num_msgs = numvec;
+
+ res_size = numvec * sizeof(ppt->msix.res[0]);
+ cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
+ arg_size = numvec * sizeof(ppt->msix.arg[0]);
+
+ ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
+ ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
+ M_WAITOK | M_ZERO);
+ ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
+
+ rid = dinfo->cfg.msix.msix_table_bar;
+ ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
+ SYS_RES_MEMORY, &rid, RF_ACTIVE);
+
+ if (ppt->msix.msix_table_res == NULL) {
+ ppt_teardown_msix(ppt);
+ return (ENOSPC);
+ }
+ ppt->msix.msix_table_rid = rid;
+
+ alloced = numvec;
+ error = pci_alloc_msix(ppt->dev, &alloced);
+ if (error || alloced != numvec) {
+ ppt_teardown_msix(ppt);
+			return (error == 0 ? ENOSPC : error);
+ }
+ }
+
+ if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+ /* Tear down the IRQ if it's already set up */
+ ppt_teardown_msix_intr(ppt, idx);
+
+ /* Allocate the IRQ resource */
+ ppt->msix.cookie[idx] = NULL;
+ rid = ppt->msix.startrid + idx;
+ ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
+ &rid, RF_ACTIVE);
+ if (ppt->msix.res[idx] == NULL)
+ return (ENXIO);
+
+ ppt->msix.arg[idx].pptdev = ppt;
+ ppt->msix.arg[idx].vec = msg;
+ ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF;
+
+ /* Setup the MSI-X interrupt */
+ error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
+ INTR_TYPE_NET | INTR_MPSAFE,
+ pptintr, NULL, &ppt->msix.arg[idx],
+ &ppt->msix.cookie[idx]);
+
+ if (error != 0) {
+ bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]);
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
+ ppt->msix.cookie[idx] = NULL;
+ ppt->msix.res[idx] = NULL;
+ return (ENXIO);
+ }
+ } else {
+ /* Masked, tear it down if it's already been set up */
+ ppt_teardown_msix_intr(ppt, idx);
+ }
+
+ return (0);
+}
+
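ppt_setup_msix() above treats bits 19:12 of the MSI address as the target vcpu and the low byte of the message data as the vector, following the x86 MSI address/data format. A small standalone sketch of that decode; the example address and data values are made up for illustration.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t addr = 0xfee01000;		/* MSI address, dest id 1 */
	uint32_t data = 0x0041;			/* MSI data, vector 0x41 */
	unsigned dest = (addr >> 12) & 0xff;	/* bits 19:12, as in ppt.c */
	unsigned vec = data & 0xff;

	printf("deliver vector 0x%x to vcpu %u\n", vec, dest);
	return (0);
}
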
diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h
new file mode 100644
index 0000000..63c8228
--- /dev/null
+++ b/sys/amd64/vmm/io/ppt.h
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IO_PPT_H_
+#define _IO_PPT_H_
+
+int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
+int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
+int ppt_unassign_all(struct vm *vm);
+int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec);
+int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
+#endif
diff --git a/sys/amd64/vmm/io/vdev.c b/sys/amd64/vmm/io/vdev.c
new file mode 100644
index 0000000..cd6c5d1
--- /dev/null
+++ b/sys/amd64/vmm/io/vdev.c
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include "vdev.h"
+
+struct vdev {
+ SLIST_ENTRY(vdev) entry;
+ struct vdev_ops *ops;
+ void *dev;
+};
+static SLIST_HEAD(, vdev) vdev_head;
+static int vdev_count;
+
+struct vdev_region {
+ SLIST_ENTRY(vdev_region) entry;
+ struct vdev_ops *ops;
+ void *dev;
+ struct io_region *io;
+};
+static SLIST_HEAD(, vdev_region) region_head;
+static int region_count;
+
+static MALLOC_DEFINE(M_VDEV, "vdev", "vdev");
+
+#define VDEV_INIT (0)
+#define VDEV_RESET (1)
+#define VDEV_HALT (2)
+
+// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"};
+
+static int
+vdev_system_event(int event)
+{
+ struct vdev *vd;
+	int rc = 0;
+
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name);
+ switch (event) {
+ case VDEV_INIT:
+ rc = vd->ops->init(vd->dev);
+ break;
+ case VDEV_RESET:
+ rc = vd->ops->reset(vd->dev);
+ break;
+ case VDEV_HALT:
+ rc = vd->ops->halt(vd->dev);
+ break;
+ default:
+ break;
+ }
+ if (rc) {
+			printf("vdev %s event %d failed rc=%d\n",
+				vd->ops->name, event, rc);
+ return rc;
+ }
+ }
+ return 0;
+}
+
+int
+vdev_init(void)
+{
+ return vdev_system_event(VDEV_INIT);
+}
+
+int
+vdev_reset(void)
+{
+ return vdev_system_event(VDEV_RESET);
+}
+
+int
+vdev_halt(void)
+{
+ return vdev_system_event(VDEV_HALT);
+}
+
+void
+vdev_vm_init(void)
+{
+ SLIST_INIT(&vdev_head);
+ vdev_count = 0;
+
+ SLIST_INIT(&region_head);
+ region_count = 0;
+}
+
+void
+vdev_vm_cleanup(void)
+{
+ struct vdev *vd;
+
+ // TODO: locking
+ while (!SLIST_EMPTY(&vdev_head)) {
+ vd = SLIST_FIRST(&vdev_head);
+ SLIST_REMOVE_HEAD(&vdev_head, entry);
+ free(vd, M_VDEV);
+ vdev_count--;
+ }
+}
+
+int
+vdev_register(struct vdev_ops *ops, void *dev)
+{
+ struct vdev *vd;
+ vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO);
+ vd->ops = ops;
+ vd->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&vdev_head, vd, entry);
+ vdev_count++;
+ return 0;
+}
+
+void
+vdev_unregister(void *dev)
+{
+ struct vdev *vd, *found;
+
+ found = NULL;
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ if (vd->dev == dev) {
+ found = vd;
+ }
+ }
+
+ if (found) {
+ SLIST_REMOVE(&vdev_head, found, vdev, entry);
+ free(found, M_VDEV);
+ }
+}
+
+#define IN_RANGE(val, start, end) \
+ (((val) >= (start)) && ((val) < (end)))
+
+static struct vdev_region*
+vdev_find_region(struct io_region *io, void *dev)
+{
+ struct vdev_region *region, *found;
+ uint64_t region_base;
+ uint64_t region_end;
+
+ found = NULL;
+
+ // TODO: locking
+	// FIXME: we should verify we are in the context of the current
+	// vcpu here as well.
+ SLIST_FOREACH(region, &region_head, entry) {
+ region_base = region->io->base;
+ region_end = region_base + region->io->len;
+ if (IN_RANGE(io->base, region_base, region_end) &&
+ IN_RANGE(io->base+io->len, region_base, region_end+1) &&
+ (dev && dev == region->dev)) {
+ found = region;
+ break;
+ }
+ }
+ return found;
+}
+
+int
+vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+ if (region) {
+ return -EEXIST;
+ }
+
+ region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO);
+ region->io = io;
+ region->ops = ops;
+ region->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&region_head, region, entry);
+ region_count++;
+
+ return 0;
+}
+
+void
+vdev_unregister_region(void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+
+ if (region) {
+ SLIST_REMOVE(&region_head, region, vdev_region, entry);
+ free(region, M_VDEV);
+ region_count--;
+ }
+}
+
+static int
+vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read)
+{
+ struct vdev_region *region;
+ struct io_region io;
+ region_attr_t attr;
+ int rc;
+
+ io.base = gpa;
+ io.len = size;
+
+ region = vdev_find_region(&io, NULL);
+ if (!region)
+ return -EINVAL;
+
+ attr = (read) ? MMIO_READ : MMIO_WRITE;
+ if (!(region->io->attr & attr))
+ return -EPERM;
+
+ if (read)
+ rc = region->ops->memread(region->dev, gpa, size, data);
+ else
+ rc = region->ops->memwrite(region->dev, gpa, size, *data);
+
+ return rc;
+}
+
+int
+vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data)
+{
+ return vdev_memrw(gpa, size, data, 1);
+}
+
+int
+vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data)
+{
+ return vdev_memrw(gpa, size, &data, 0);
+}
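
vdev_find_region() only matches when the access lies entirely inside a registered region: the start must fall within [base, base+len) and the end at or before base+len. A standalone sketch of that containment test, reusing the IN_RANGE macro; the example values are made up.

#include <stdint.h>
#include <stdio.h>

#define IN_RANGE(val, start, end) \
	(((val) >= (start)) && ((val) < (end)))

static int
region_contains(uint64_t rbase, uint64_t rlen, uint64_t base, uint64_t len)
{
	uint64_t rend = rbase + rlen;

	return (IN_RANGE(base, rbase, rend) &&
	    IN_RANGE(base + len, rbase, rend + 1));
}

int
main(void)
{

	/* A 4-byte access at 0xfee00020 against a 4KB window at 0xfee00000. */
	printf("%d\n", region_contains(0xfee00000, 4096, 0xfee00020, 4));
	return (0);
}
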
diff --git a/sys/amd64/vmm/io/vdev.h b/sys/amd64/vmm/io/vdev.h
new file mode 100644
index 0000000..6feeba8
--- /dev/null
+++ b/sys/amd64/vmm/io/vdev.h
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VDEV_H_
+#define _VDEV_H_
+
+typedef enum {
+ BYTE = 1,
+ WORD = 2,
+ DWORD = 4,
+ QWORD = 8,
+} opsize_t;
+
+typedef enum {
+ MMIO_READ = 1,
+ MMIO_WRITE = 2,
+} region_attr_t;
+
+struct io_region {
+ uint64_t base;
+ uint64_t len;
+ region_attr_t attr;
+ int vcpu;
+};
+
+typedef int (*vdev_init_t)(void* dev);
+typedef int (*vdev_reset_t)(void* dev);
+typedef int (*vdev_halt_t)(void* dev);
+typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data);
+typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data);
+
+
+struct vdev_ops {
+ const char *name;
+ vdev_init_t init;
+ vdev_reset_t reset;
+ vdev_halt_t halt;
+ vdev_memread_t memread;
+ vdev_memwrite_t memwrite;
+};
+
+
+void vdev_vm_init(void);
+void vdev_vm_cleanup(void);
+
+int vdev_register(struct vdev_ops *ops, void *dev);
+void vdev_unregister(void *dev);
+
+int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io);
+void vdev_unregister_region(void *dev, struct io_region *io);
+
+int vdev_init(void);
+int vdev_reset(void);
+int vdev_halt(void);
+int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data);
+int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data);
+
+#endif /* _VDEV_H_ */
+
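The vdev_ops structure gives every emulated device the same init/reset/halt and memread/memwrite shape. Below is a standalone sketch of a trivial read-as-zero device using local stand-in types; demo_ops and zero_read are illustrative names, not the kernel interface.

#include <stdint.h>
#include <stdio.h>

typedef int (*memread_t)(void *dev, uint64_t gpa, int size, uint64_t *data);

struct demo_ops {
	const char	*name;
	memread_t	memread;
};

static int
zero_read(void *dev, uint64_t gpa, int size, uint64_t *data)
{

	printf("%d-byte read at 0x%llx\n", size, (unsigned long long)gpa);
	*data = 0;			/* device reads back as zero */
	return (0);
}

static struct demo_ops zerodev = { "zero", zero_read };

int
main(void)
{
	uint64_t val;

	return (zerodev.memread(NULL, 0xfee00030, 4, &val));
}
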
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
new file mode 100644
index 0000000..15fc6c2
--- /dev/null
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -0,0 +1,901 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <machine/clock.h>
+#include <x86/specialreg.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+#include "vdev.h"
+#include "vlapic.h"
+
+#define VLAPIC_CTR0(vlapic, format) \
+ VMM_CTR0((vlapic)->vm, (vlapic)->vcpuid, format)
+
+#define VLAPIC_CTR1(vlapic, format, p1) \
+ VMM_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1)
+
+#define VLAPIC_CTR_IRR(vlapic, msg) \
+do { \
+ uint32_t *irrptr = &(vlapic)->apic.irr0; \
+ irrptr[0] = irrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \
+} while (0)
+
+#define VLAPIC_CTR_ISR(vlapic, msg) \
+do { \
+ uint32_t *isrptr = &(vlapic)->apic.isr0; \
+ isrptr[0] = isrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \
+} while (0)
+
+static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
+
+#define PRIO(x) ((x) >> 4)
+
+#define VLAPIC_VERSION (16)
+#define VLAPIC_MAXLVT_ENTRIES (5)
+
+#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)
+
+enum boot_state {
+ BS_INIT,
+ BS_SIPI,
+ BS_RUNNING
+};
+
+struct vlapic {
+ struct vm *vm;
+ int vcpuid;
+
+ struct io_region *mmio;
+ struct vdev_ops *ops;
+ struct LAPIC apic;
+
+ int esr_update;
+
+ int divisor;
+ int ccr_ticks;
+
+ /*
+ * The 'isrvec_stk' is a stack of vectors injected by the local apic.
+ * A vector is popped from the stack when the processor does an EOI.
+ * The vector on the top of the stack is used to compute the
+ * Processor Priority in conjunction with the TPR.
+ */
+ uint8_t isrvec_stk[ISRVEC_STK_SIZE];
+ int isrvec_stk_top;
+
+ uint64_t msr_apicbase;
+ enum boot_state boot_state;
+};
+
+#define VLAPIC_BUS_FREQ tsc_freq
+
+static int
+vlapic_timer_divisor(uint32_t dcr)
+{
+ switch (dcr & 0xB) {
+ case APIC_TDCR_2:
+ return (2);
+ case APIC_TDCR_4:
+ return (4);
+ case APIC_TDCR_8:
+ return (8);
+ case APIC_TDCR_16:
+ return (16);
+ case APIC_TDCR_32:
+ return (32);
+ case APIC_TDCR_64:
+ return (64);
+ case APIC_TDCR_128:
+ return (128);
+	case APIC_TDCR_1:
+		return (1);
+	default:
+ panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
+ }
+}
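
The divide configuration register encodes the divisor in bits 0, 1 and 3, and the switch above maps the eight encodings from divide-by-2 up to divide-by-1. An equivalent closed form, shown as a standalone sketch; the helper name tdcr_to_divisor is illustrative.

#include <stdint.h>
#include <stdio.h>

static int
tdcr_to_divisor(uint32_t dcr)
{
	/* Bits 3,1,0 give n; divisor is 2^(n+1), with n == 7 meaning 1. */
	unsigned n = ((dcr & 0x8) >> 1) | (dcr & 0x3);

	return (1 << ((n + 1) & 0x7));
}

int
main(void)
{

	/* 0x0 -> 2, 0xa -> 128, 0xb -> 1 */
	printf("%d %d %d\n", tdcr_to_divisor(0x0), tdcr_to_divisor(0xa),
	    tdcr_to_divisor(0xb));
	return (0);
}
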
+
+static void
+vlapic_mask_lvts(uint32_t *lvts, int num_lvt)
+{
+ int i;
+ for (i = 0; i < num_lvt; i++) {
+ *lvts |= APIC_LVT_M;
+ lvts += 4;
+ }
+}
+
+#if 0
+static inline void
+vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
+{
+ printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
+ *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
+ *lvt & APIC_LVTT_M);
+}
+#endif
+
+static uint64_t
+vlapic_get_ccr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ return lapic->ccr_timer;
+}
+
+static void
+vlapic_update_errors(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ lapic->esr = 0; // XXX
+}
+
+static void
+vlapic_init_ipi(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ lapic->version = VLAPIC_VERSION;
+	lapic->version |= (VLAPIC_MAXLVT_ENTRIES << MAXLVTSHIFT);
+ lapic->dfr = 0xffffffff;
+ lapic->svr = APIC_SVR_VECTOR;
+ vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1);
+}
+
+static int
+vlapic_op_reset(void* dev)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+
+ memset(lapic, 0, sizeof(*lapic));
+ lapic->apr = vlapic->vcpuid;
+ vlapic_init_ipi(vlapic);
+ vlapic->divisor = vlapic_timer_divisor(lapic->dcr_timer);
+
+ if (vlapic->vcpuid == 0)
+ vlapic->boot_state = BS_RUNNING; /* BSP */
+ else
+ vlapic->boot_state = BS_INIT; /* AP */
+
+	return 0;
+}
+
+static int
+vlapic_op_init(void* dev)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ vdev_register_region(vlapic->ops, vlapic, vlapic->mmio);
+ return vlapic_op_reset(dev);
+}
+
+static int
+vlapic_op_halt(void* dev)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ vdev_unregister_region(vlapic, vlapic->mmio);
+	return 0;
+}
+
+void
+vlapic_set_intr_ready(struct vlapic *vlapic, int vector)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *irrptr;
+ int idx;
+
+ if (vector < 0 || vector >= 256)
+ panic("vlapic_set_intr_ready: invalid vector %d\n", vector);
+
+ idx = (vector / 32) * 4;
+ irrptr = &lapic->irr0;
+ atomic_set_int(&irrptr[idx], 1 << (vector % 32));
+ VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
+}
+
+static void
+vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed)
+{
+ uint32_t icr_timer;
+
+ icr_timer = vlapic->apic.icr_timer;
+
+ vlapic->ccr_ticks = ticks;
+ if (elapsed < icr_timer)
+ vlapic->apic.ccr_timer = icr_timer - elapsed;
+ else {
+ /*
+		 * This can happen when the guest tries to run its local
+		 * apic timer at a rate higher than the host's 'hz' setting.
+ *
+ * We deal with this by running the guest local apic timer
+ * at the rate of the host's 'hz' setting.
+ */
+ vlapic->apic.ccr_timer = 0;
+ }
+}
+
+static __inline uint32_t *
+vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ int i;
+
+ if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) {
+ panic("vlapic_get_lvt: invalid LVT\n");
+ }
+ i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
+	return ((&lapic->lvt_timer) + i);
+}
+
+#if 1
+static void
+dump_isrvec_stk(struct vlapic *vlapic)
+{
+ int i;
+ uint32_t *isrptr;
+
+ isrptr = &vlapic->apic.isr0;
+ for (i = 0; i < 8; i++)
+ printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);
+
+ for (i = 0; i <= vlapic->isrvec_stk_top; i++)
+ printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
+}
+#endif
+
+/*
+ * Algorithm adopted from section "Interrupt, Task and Processor Priority"
+ * in Intel Architecture Manual Vol 3a.
+ */
+static void
+vlapic_update_ppr(struct vlapic *vlapic)
+{
+ int isrvec, tpr, ppr;
+
+ /*
+ * Note that the value on the stack at index 0 is always 0.
+ *
+ * This is a placeholder for the value of ISRV when none of the
+ * bits is set in the ISRx registers.
+ */
+ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
+ tpr = vlapic->apic.tpr;
+
+#if 1
+ {
+ int i, lastprio, curprio, vector, idx;
+ uint32_t *isrptr;
+
+ if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
+ panic("isrvec_stk is corrupted: %d", isrvec);
+
+ /*
+ * Make sure that the priority of the nested interrupts is
+ * always increasing.
+ */
+ lastprio = -1;
+ for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
+ curprio = PRIO(vlapic->isrvec_stk[i]);
+ if (curprio <= lastprio) {
+ dump_isrvec_stk(vlapic);
+ panic("isrvec_stk does not satisfy invariant");
+ }
+ lastprio = curprio;
+ }
+
+ /*
+ * Make sure that each bit set in the ISRx registers has a
+ * corresponding entry on the isrvec stack.
+ */
+ i = 1;
+ isrptr = &vlapic->apic.isr0;
+ for (vector = 0; vector < 256; vector++) {
+ idx = (vector / 32) * 4;
+ if (isrptr[idx] & (1 << (vector % 32))) {
+ if (i > vlapic->isrvec_stk_top ||
+ vlapic->isrvec_stk[i] != vector) {
+ dump_isrvec_stk(vlapic);
+ panic("ISR and isrvec_stk out of sync");
+ }
+ i++;
+ }
+ }
+ }
+#endif
+
+ if (PRIO(tpr) >= PRIO(isrvec))
+ ppr = tpr;
+ else
+ ppr = isrvec & 0xf0;
+
+ vlapic->apic.ppr = ppr;
+ VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
+}
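
The PPR rule above reduces to: take the TPR when its priority class (upper nibble) is at least that of the highest in-service vector, otherwise take that vector's class with the low nibble cleared. A standalone sketch of just that computation; compute_ppr is an illustrative name.

#include <stdio.h>

#define PRIO(x) ((x) >> 4)

static int
compute_ppr(int tpr, int isrvec)
{

	return (PRIO(tpr) >= PRIO(isrvec) ? tpr : (isrvec & 0xf0));
}

int
main(void)
{

	/* TPR class 2 vs in-service vector 0x35 (class 3) -> PPR 0x30. */
	printf("0x%02x\n", compute_ppr(0x20, 0x35));
	return (0);
}
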
+
+static void
+vlapic_process_eoi(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *isrptr;
+ int i, idx, bitpos;
+
+ isrptr = &lapic->isr0;
+
+ /*
+	 * The x86 architecture reserves the first 32 vectors for use
+ * by the processor.
+ */
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ bitpos = fls(isrptr[idx]);
+ if (bitpos != 0) {
+ if (vlapic->isrvec_stk_top <= 0) {
+ panic("invalid vlapic isrvec_stk_top %d",
+ vlapic->isrvec_stk_top);
+ }
+ isrptr[idx] &= ~(1 << (bitpos - 1));
+ VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
+ vlapic->isrvec_stk_top--;
+ vlapic_update_ppr(vlapic);
+ return;
+ }
+ }
+}
+
+static __inline int
+vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask)
+{
+ return (*lvt & mask);
+}
+
+static __inline int
+vlapic_periodic_timer(struct vlapic *vlapic)
+{
+ uint32_t *lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
+}
+
+static void
+vlapic_fire_timer(struct vlapic *vlapic)
+{
+ int vector;
+ uint32_t *lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) {
+ vector = vlapic_get_lvt_field(lvt,APIC_LVTT_VECTOR);
+ vlapic_set_intr_ready(vlapic, vector);
+ }
+}
+
+static int
+lapic_process_icr(struct vlapic *vlapic, uint64_t icrval)
+{
+ int i;
+ cpuset_t dmask;
+ uint32_t dest, vec, mode;
+ struct vlapic *vlapic2;
+ struct vm_exit *vmexit;
+
+ if (x2apic(vlapic))
+ dest = icrval >> 32;
+ else
+ dest = icrval >> (32 + 24);
+ vec = icrval & APIC_VECTOR_MASK;
+ mode = icrval & APIC_DELMODE_MASK;
+
+ if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
+ switch (icrval & APIC_DEST_MASK) {
+ case APIC_DEST_DESTFLD:
+ CPU_SETOF(dest, &dmask);
+ break;
+ case APIC_DEST_SELF:
+ CPU_SETOF(vlapic->vcpuid, &dmask);
+ break;
+ case APIC_DEST_ALLISELF:
+ dmask = vm_active_cpus(vlapic->vm);
+ break;
+ case APIC_DEST_ALLESELF:
+ dmask = vm_active_cpus(vlapic->vm);
+ CPU_CLR(vlapic->vcpuid, &dmask);
+ break;
+ }
+
+ while ((i = cpusetobj_ffs(&dmask)) != 0) {
+ i--;
+ CPU_CLR(i, &dmask);
+ if (mode == APIC_DELMODE_FIXED)
+ lapic_set_intr(vlapic->vm, i, vec);
+ else
+ vm_inject_nmi(vlapic->vm, i);
+ }
+
+ return (0); /* handled completely in the kernel */
+ }
+
+ if (mode == APIC_DELMODE_INIT) {
+ if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT)
+ return (0);
+
+ if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+ vlapic2 = vm_lapic(vlapic->vm, dest);
+
+ /* move from INIT to waiting-for-SIPI state */
+ if (vlapic2->boot_state == BS_INIT) {
+ vlapic2->boot_state = BS_SIPI;
+ }
+
+ return (0);
+ }
+ }
+
+ if (mode == APIC_DELMODE_STARTUP) {
+ if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+ vlapic2 = vm_lapic(vlapic->vm, dest);
+
+ /*
+ * Ignore SIPIs in any state other than wait-for-SIPI
+ */
+ if (vlapic2->boot_state != BS_SIPI)
+ return (0);
+
+ vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
+ vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
+ vmexit->u.spinup_ap.vcpu = dest;
+ vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
+
+ /*
+ * XXX this assumes that the startup IPI always succeeds
+ */
+ vlapic2->boot_state = BS_RUNNING;
+ vm_activate_cpu(vlapic2->vm, dest);
+
+ return (0);
+ }
+ }
+
+ /*
+ * This will cause a return to userland.
+ */
+ return (1);
+}
+
+int
+vlapic_pending_intr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ int idx, i, bitpos, vector;
+ uint32_t *irrptr, val;
+
+ irrptr = &lapic->irr0;
+
+ /*
+	 * The x86 architecture reserves the first 32 vectors for use
+ * by the processor.
+ */
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ val = atomic_load_acq_int(&irrptr[idx]);
+ bitpos = fls(val);
+ if (bitpos != 0) {
+ vector = i * 32 + (bitpos - 1);
+ if (PRIO(vector) > PRIO(lapic->ppr)) {
+ VLAPIC_CTR1(vlapic, "pending intr %d", vector);
+ return (vector);
+ } else
+ break;
+ }
+ }
+ VLAPIC_CTR0(vlapic, "no pending intr");
+ return (-1);
+}
+
+void
+vlapic_intr_accepted(struct vlapic *vlapic, int vector)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *irrptr, *isrptr;
+ int idx, stk_top;
+
+ /*
+ * clear the ready bit for vector being accepted in irr
+ * and set the vector as in service in isr.
+ */
+ idx = (vector / 32) * 4;
+
+ irrptr = &lapic->irr0;
+ atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
+ VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");
+
+ isrptr = &lapic->isr0;
+ isrptr[idx] |= 1 << (vector % 32);
+ VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");
+
+ /*
+ * Update the PPR
+ */
+ vlapic->isrvec_stk_top++;
+
+ stk_top = vlapic->isrvec_stk_top;
+ if (stk_top >= ISRVEC_STK_SIZE)
+ panic("isrvec_stk_top overflow %d", stk_top);
+
+ vlapic->isrvec_stk[stk_top] = vector;
+ vlapic_update_ppr(vlapic);
+}
+
+int
+vlapic_op_mem_read(void* dev, uint64_t gpa, opsize_t size, uint64_t *data)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+	uint64_t offset = gpa & (PAGE_SIZE - 1);
+ uint32_t *reg;
+ int i;
+
+ if (offset > sizeof(*lapic)) {
+ *data = 0;
+ return 0;
+ }
+
+ offset &= ~3;
+	switch (offset) {
+ case APIC_OFFSET_ID:
+ if (x2apic(vlapic))
+ *data = vlapic->vcpuid;
+ else
+ *data = vlapic->vcpuid << 24;
+ break;
+ case APIC_OFFSET_VER:
+ *data = lapic->version;
+ break;
+ case APIC_OFFSET_TPR:
+ *data = lapic->tpr;
+ break;
+ case APIC_OFFSET_APR:
+ *data = lapic->apr;
+ break;
+ case APIC_OFFSET_PPR:
+ *data = lapic->ppr;
+ break;
+ case APIC_OFFSET_EOI:
+ *data = lapic->eoi;
+ break;
+ case APIC_OFFSET_LDR:
+ *data = lapic->ldr;
+ break;
+ case APIC_OFFSET_DFR:
+ *data = lapic->dfr;
+ break;
+ case APIC_OFFSET_SVR:
+ *data = lapic->svr;
+ break;
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ i = (offset - APIC_OFFSET_ISR0) >> 2;
+ reg = &lapic->isr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ i = (offset - APIC_OFFSET_TMR0) >> 2;
+ reg = &lapic->tmr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ i = (offset - APIC_OFFSET_IRR0) >> 2;
+ reg = &lapic->irr0;
+ *data = atomic_load_acq_int(reg + i);
+ break;
+ case APIC_OFFSET_ESR:
+ *data = lapic->esr;
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ *data = lapic->icr_lo;
+ break;
+ case APIC_OFFSET_ICR_HI:
+ *data = lapic->icr_hi;
+ break;
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ reg = vlapic_get_lvt(vlapic, offset);
+ *data = *(reg);
+ break;
+ case APIC_OFFSET_ICR:
+ *data = lapic->icr_timer;
+ break;
+ case APIC_OFFSET_CCR:
+ *data = vlapic_get_ccr(vlapic);
+ break;
+ case APIC_OFFSET_DCR:
+ *data = lapic->dcr_timer;
+ break;
+ case APIC_OFFSET_RRR:
+ default:
+ *data = 0;
+ break;
+ }
+ return 0;
+}
+
+int
+vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+	uint64_t offset = gpa & (PAGE_SIZE - 1);
+ uint32_t *reg;
+ int retval;
+
+ if (offset > sizeof(*lapic)) {
+ return 0;
+ }
+
+ retval = 0;
+ offset &= ~3;
+	switch (offset) {
+ case APIC_OFFSET_ID:
+ break;
+ case APIC_OFFSET_TPR:
+ lapic->tpr = data & 0xff;
+ vlapic_update_ppr(vlapic);
+ break;
+ case APIC_OFFSET_EOI:
+ vlapic_process_eoi(vlapic);
+ break;
+ case APIC_OFFSET_LDR:
+ break;
+ case APIC_OFFSET_DFR:
+ break;
+ case APIC_OFFSET_SVR:
+ lapic->svr = data;
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ if (!x2apic(vlapic)) {
+ data &= 0xffffffff;
+ data |= (uint64_t)lapic->icr_hi << 32;
+ }
+ retval = lapic_process_icr(vlapic, data);
+ break;
+ case APIC_OFFSET_ICR_HI:
+ if (!x2apic(vlapic)) {
+ retval = 0;
+ lapic->icr_hi = data;
+ }
+ break;
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ reg = vlapic_get_lvt(vlapic, offset);
+ if (!(lapic->svr & APIC_SVR_ENABLE)) {
+ data |= APIC_LVT_M;
+ }
+ *reg = data;
+ // vlapic_dump_lvt(offset, reg);
+ break;
+ case APIC_OFFSET_ICR:
+ lapic->icr_timer = data;
+ vlapic_start_timer(vlapic, 0);
+ break;
+
+ case APIC_OFFSET_DCR:
+ lapic->dcr_timer = data;
+ vlapic->divisor = vlapic_timer_divisor(data);
+ break;
+
+ case APIC_OFFSET_ESR:
+ vlapic_update_errors(vlapic);
+ break;
+ case APIC_OFFSET_VER:
+ case APIC_OFFSET_APR:
+ case APIC_OFFSET_PPR:
+ case APIC_OFFSET_RRR:
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ case APIC_OFFSET_CCR:
+ default:
+ // Read only.
+ break;
+ }
+
+ return (retval);
+}
+
+int
+vlapic_timer_tick(struct vlapic *vlapic)
+{
+ int curticks, delta, periodic, fired;
+ uint32_t ccr;
+ uint32_t decrement, leftover;
+
+restart:
+ curticks = ticks;
+ delta = curticks - vlapic->ccr_ticks;
+
+ /* Local APIC timer is disabled */
+ if (vlapic->apic.icr_timer == 0)
+ return (-1);
+
+ /* One-shot mode and timer has already counted down to zero */
+ periodic = vlapic_periodic_timer(vlapic);
+ if (!periodic && vlapic->apic.ccr_timer == 0)
+ return (-1);
+ /*
+ * The 'curticks' and 'ccr_ticks' are out of sync by more than
+ * 2^31 ticks. We deal with this by restarting the timer.
+ */
+ if (delta < 0) {
+ vlapic_start_timer(vlapic, 0);
+ goto restart;
+ }
+
+ fired = 0;
+ decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz;
+
+ vlapic->ccr_ticks = curticks;
+ ccr = vlapic->apic.ccr_timer;
+
+ while (delta-- > 0) {
+ if (ccr > decrement) {
+ ccr -= decrement;
+ continue;
+ }
+
+ /* Trigger the local apic timer interrupt */
+ vlapic_fire_timer(vlapic);
+ if (periodic) {
+ leftover = decrement - ccr;
+ vlapic_start_timer(vlapic, leftover);
+ ccr = vlapic->apic.ccr_timer;
+ } else {
+ /*
+ * One-shot timer has counted down to zero.
+ */
+ ccr = 0;
+ }
+ fired = 1;
+ break;
+ }
+
+ vlapic->apic.ccr_timer = ccr;
+
+ if (!fired)
+ return ((ccr / decrement) + 1);
+ else
+ return (0);
+}
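
vlapic_timer_tick() converts host clock ticks into timer counts: each host tick is worth (VLAPIC_BUS_FREQ / divisor) / hz counts, and the return value estimates how many host ticks remain before the timer fires. A standalone sketch of that arithmetic with made-up frequency values:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t bus_freq = 2000000000;	/* stand-in for tsc_freq */
	int divisor = 2, hz = 1000;
	uint32_t decrement, ccr;

	decrement = (bus_freq / divisor) / hz;	/* counts per host tick */
	ccr = 3 * decrement + 100;		/* current count register */

	printf("timer fires in about %u host ticks\n", ccr / decrement + 1);
	return (0);
}
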
+
+struct vdev_ops vlapic_dev_ops = {
+ .name = "vlapic",
+ .init = vlapic_op_init,
+ .reset = vlapic_op_reset,
+ .halt = vlapic_op_halt,
+ .memread = vlapic_op_mem_read,
+ .memwrite = vlapic_op_mem_write,
+};
+static struct io_region vlapic_mmio[VM_MAXCPU];
+
+struct vlapic *
+vlapic_init(struct vm *vm, int vcpuid)
+{
+ struct vlapic *vlapic;
+
+ vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO);
+ vlapic->vm = vm;
+ vlapic->vcpuid = vcpuid;
+
+ vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
+
+ if (vcpuid == 0)
+ vlapic->msr_apicbase |= APICBASE_BSP;
+
+ vlapic->ops = &vlapic_dev_ops;
+
+ vlapic->mmio = vlapic_mmio + vcpuid;
+ vlapic->mmio->base = DEFAULT_APIC_BASE;
+ vlapic->mmio->len = PAGE_SIZE;
+ vlapic->mmio->attr = MMIO_READ|MMIO_WRITE;
+ vlapic->mmio->vcpu = vcpuid;
+
+ vdev_register(&vlapic_dev_ops, vlapic);
+
+ vlapic_op_init(vlapic);
+
+ return (vlapic);
+}
+
+void
+vlapic_cleanup(struct vlapic *vlapic)
+{
+ vlapic_op_halt(vlapic);
+ vdev_unregister(vlapic);
+ free(vlapic, M_VLAPIC);
+}
+
+uint64_t
+vlapic_get_apicbase(struct vlapic *vlapic)
+{
+
+ return (vlapic->msr_apicbase);
+}
+
+void
+vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val)
+{
+ int err;
+ enum x2apic_state state;
+
+ err = vm_get_x2apic_state(vlapic->vm, vlapic->vcpuid, &state);
+ if (err)
+ panic("vlapic_set_apicbase: err %d fetching x2apic state", err);
+
+ if (state == X2APIC_DISABLED)
+ val &= ~APICBASE_X2APIC;
+
+ vlapic->msr_apicbase = val;
+}
+
+void
+vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, vcpuid);
+
+ if (state == X2APIC_DISABLED)
+ vlapic->msr_apicbase &= ~APICBASE_X2APIC;
+}
diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h
new file mode 100644
index 0000000..00de019
--- /dev/null
+++ b/sys/amd64/vmm/io/vlapic.h
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VLAPIC_H_
+#define _VLAPIC_H_
+
+#include "vdev.h"
+
+struct vm;
+
+/*
+ * Map of APIC Registers: Offset Description Access
+ */
+#define APIC_OFFSET_ID 0x20 // Local APIC ID R/W
+#define APIC_OFFSET_VER 0x30 // Local APIC Version R
+#define APIC_OFFSET_TPR 0x80 // Task Priority Register R/W
+#define APIC_OFFSET_APR 0x90 // Arbitration Priority Register R
+#define APIC_OFFSET_PPR 0xA0 // Processor Priority Register R
+#define APIC_OFFSET_EOI 0xB0 // EOI Register W
+#define APIC_OFFSET_RRR 0xC0 // Remote read R
+#define APIC_OFFSET_LDR 0xD0 // Logical Destination R/W
+#define APIC_OFFSET_DFR 0xE0 // Destination Format Register 0..27 R; 28..31 R/W
+#define APIC_OFFSET_SVR 0xF0 // Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W
+#define APIC_OFFSET_ISR0 0x100 // ISR 000-031 R
+#define APIC_OFFSET_ISR1 0x110 // ISR 032-063 R
+#define APIC_OFFSET_ISR2 0x120 // ISR 064-095 R
+#define APIC_OFFSET_ISR3 0x130 // ISR 096-127 R
+#define APIC_OFFSET_ISR4 0x140 // ISR 128-159 R
+#define APIC_OFFSET_ISR5 0x150 // ISR 160-191 R
+#define APIC_OFFSET_ISR6 0x160 // ISR 192-223 R
+#define APIC_OFFSET_ISR7 0x170 // ISR 224-255 R
+#define APIC_OFFSET_TMR0 0x180 // TMR 000-031 R
+#define APIC_OFFSET_TMR1 0x190 // TMR 032-063 R
+#define APIC_OFFSET_TMR2 0x1A0 // TMR 064-095 R
+#define APIC_OFFSET_TMR3 0x1B0 // TMR 096-127 R
+#define APIC_OFFSET_TMR4 0x1C0 // TMR 128-159 R
+#define APIC_OFFSET_TMR5 0x1D0 // TMR 160-191 R
+#define APIC_OFFSET_TMR6 0x1E0 // TMR 192-223 R
+#define APIC_OFFSET_TMR7 0x1F0 // TMR 224-255 R
+#define APIC_OFFSET_IRR0 0x200 // IRR 000-031 R
+#define APIC_OFFSET_IRR1 0x210 // IRR 032-063 R
+#define APIC_OFFSET_IRR2 0x220 // IRR 064-095 R
+#define APIC_OFFSET_IRR3 0x230 // IRR 096-127 R
+#define APIC_OFFSET_IRR4 0x240 // IRR 128-159 R
+#define APIC_OFFSET_IRR5 0x250 // IRR 160-191 R
+#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R
+#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R
+#define APIC_OFFSET_ESR 0x280 // Error Status Register R
+#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W
+#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. (32-63) R/W
+#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W
+#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+)
+#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+)
+#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W
+#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W
+#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W
+#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W
+#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R
+#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W
+
+/*
+ * 16 priority levels with at most one vector injected per level.
+ */
+#define ISRVEC_STK_SIZE (16 + 1)
+
+enum x2apic_state;
+
+struct vlapic *vlapic_init(struct vm *vm, int vcpuid);
+void vlapic_cleanup(struct vlapic *vlapic);
+
+int vlapic_op_mem_write(void* dev, uint64_t gpa,
+ opsize_t size, uint64_t data);
+
+int vlapic_op_mem_read(void* dev, uint64_t gpa,
+ opsize_t size, uint64_t *data);
+
+int vlapic_pending_intr(struct vlapic *vlapic);
+void vlapic_intr_accepted(struct vlapic *vlapic, int vector);
+void vlapic_set_intr_ready(struct vlapic *vlapic, int vector);
+int vlapic_timer_tick(struct vlapic *vlapic);
+
+uint64_t vlapic_get_apicbase(struct vlapic *vlapic);
+void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val);
+void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s);
+
+#endif /* _VLAPIC_H_ */
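
The ISR/TMR/IRR banks declared above are 32-bit registers spaced 16 bytes apart, which is why vlapic.c indexes them as (vector / 32) * 4 against a uint32_t pointer. A standalone sketch of the vector-to-register mapping:

#include <stdio.h>

int
main(void)
{
	int vector = 0x41;
	int idx = (vector / 32) * 4;	/* uint32_t index: 16-byte stride */
	int bit = vector % 32;

	printf("vector 0x%x -> IRR%d (word index %d), bit %d\n",
	    vector, vector / 32, idx, bit);
	return (0);
}
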
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
new file mode 100644
index 0000000..a4dea79
--- /dev/null
+++ b/sys/amd64/vmm/vmm.c
@@ -0,0 +1,1022 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+
+#include <machine/vm.h>
+#include <machine/pcb.h>
+#include <machine/smp.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmm_mem.h"
+#include "vmm_util.h"
+#include <machine/vmm_dev.h>
+#include "vlapic.h"
+#include "vmm_msr.h"
+#include "vmm_ipi.h"
+#include "vmm_stat.h"
+#include "vmm_lapic.h"
+
+#include "io/ppt.h"
+#include "io/iommu.h"
+
+struct vlapic;
+
+struct vcpu {
+ int flags;
+ enum vcpu_state state;
+ struct mtx mtx;
+ int pincpu; /* host cpuid this vcpu is bound to */
+ int hostcpu; /* host cpuid this vcpu last ran on */
+ uint64_t guest_msrs[VMM_MSR_NUM];
+ struct vlapic *vlapic;
+ int vcpuid;
+ struct savefpu *guestfpu; /* guest fpu state */
+ void *stats;
+ struct vm_exit exitinfo;
+ enum x2apic_state x2apic_state;
+ int nmi_pending;
+};
+#define VCPU_F_PINNED 0x0001
+
+#define VCPU_PINCPU(vm, vcpuid) \
+ ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1)
+
+#define VCPU_UNPIN(vm, vcpuid) (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED)
+
+#define VCPU_PIN(vm, vcpuid, host_cpuid) \
+do { \
+ vm->vcpu[vcpuid].flags |= VCPU_F_PINNED; \
+ vm->vcpu[vcpuid].pincpu = host_cpuid; \
+} while(0)
+
+#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
+#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
+#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
+
+#define VM_MAX_MEMORY_SEGMENTS 2
+
+struct vm {
+ void *cookie; /* processor-specific data */
+ void *iommu; /* iommu-specific data */
+ struct vcpu vcpu[VM_MAXCPU];
+ int num_mem_segs;
+ struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
+ char name[VM_MAX_NAMELEN];
+
+ /*
+ * Set of active vcpus.
+ * An active vcpu is one that has been started implicitly (BSP) or
+ * explicitly (AP) by sending it a startup ipi.
+ */
+ cpuset_t active_cpus;
+};
+
+static struct vmm_ops *ops;
+#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
+#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
+
+#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
+#define VMRUN(vmi, vcpu, rip) \
+ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
+#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
+#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
+ (ops != NULL ? \
+ (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
+ ENXIO)
+#define VMMMAP_GET(vmi, gpa) \
+ (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
+#define VMGETREG(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETREG(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
+#define VMGETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMSETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
+ (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
+#define VMGETCAP(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETCAP(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
+
+#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
+#define fpu_stop_emulating() clts()
+
+static MALLOC_DEFINE(M_VM, "vm", "vm");
+CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */
+
+/* statistics */
+static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
+
+static void
+vcpu_cleanup(struct vcpu *vcpu)
+{
+ vlapic_cleanup(vcpu->vlapic);
+ vmm_stat_free(vcpu->stats);
+ fpu_save_area_free(vcpu->guestfpu);
+}
+
+static void
+vcpu_init(struct vm *vm, uint32_t vcpu_id)
+{
+ struct vcpu *vcpu;
+
+ vcpu = &vm->vcpu[vcpu_id];
+
+ vcpu_lock_init(vcpu);
+ vcpu->hostcpu = NOCPU;
+ vcpu->vcpuid = vcpu_id;
+ vcpu->vlapic = vlapic_init(vm, vcpu_id);
+ vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
+ vcpu->guestfpu = fpu_save_area_alloc();
+ fpu_save_area_reset(vcpu->guestfpu);
+ vcpu->stats = vmm_stat_alloc();
+}
+
+struct vm_exit *
+vm_exitinfo(struct vm *vm, int cpuid)
+{
+ struct vcpu *vcpu;
+
+ if (cpuid < 0 || cpuid >= VM_MAXCPU)
+ panic("vm_exitinfo: invalid cpuid %d", cpuid);
+
+ vcpu = &vm->vcpu[cpuid];
+
+ return (&vcpu->exitinfo);
+}
+
+static int
+vmm_init(void)
+{
+ int error;
+
+ vmm_host_state_init();
+ vmm_ipi_init();
+
+ error = vmm_mem_init();
+ if (error)
+ return (error);
+
+ if (vmm_is_intel())
+ ops = &vmm_ops_intel;
+ else if (vmm_is_amd())
+ ops = &vmm_ops_amd;
+ else
+ return (ENXIO);
+
+ vmm_msr_init();
+
+ return (VMM_INIT());
+}
+
+static int
+vmm_handler(module_t mod, int what, void *arg)
+{
+ int error;
+
+ switch (what) {
+ case MOD_LOAD:
+ vmmdev_init();
+ iommu_init();
+ error = vmm_init();
+ break;
+ case MOD_UNLOAD:
+ error = vmmdev_cleanup();
+ if (error == 0) {
+ iommu_cleanup();
+ vmm_ipi_cleanup();
+ error = VMM_CLEANUP();
+ }
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t vmm_kmod = {
+ "vmm",
+ vmm_handler,
+ NULL
+};
+
+/*
+ * Execute the module load handler after the pci passthru driver has had
+ * a chance to claim devices. We need this information at the time we do
+ * iommu initialization.
+ */
+DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY);
+MODULE_VERSION(vmm, 1);
+
+SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
+
+struct vm *
+vm_create(const char *name)
+{
+ int i;
+ struct vm *vm;
+ vm_paddr_t maxaddr;
+
+ const int BSP = 0;
+
+ if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
+ return (NULL);
+
+ vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
+ strcpy(vm->name, name);
+ vm->cookie = VMINIT(vm);
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vcpu_init(vm, i);
+ guest_msrs_init(vm, i);
+ }
+
+ maxaddr = vmm_mem_maxaddr();
+ vm->iommu = iommu_create_domain(maxaddr);
+ vm_activate_cpu(vm, BSP);
+
+ return (vm);
+}
+
+static void
+vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
+{
+ size_t len;
+ vm_paddr_t hpa;
+ void *host_domain;
+
+ host_domain = iommu_host_domain();
+
+ len = 0;
+ while (len < seg->len) {
+ hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
+ if (hpa == (vm_paddr_t)-1) {
+			panic("vm_free_mem_seg: cannot free hpa "
+ "associated with gpa 0x%016lx", seg->gpa + len);
+ }
+
+ /*
+ * Remove the 'gpa' to 'hpa' mapping in VMs domain.
+ * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
+ */
+ iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
+ iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
+
+ vmm_mem_free(hpa, PAGE_SIZE);
+
+ len += PAGE_SIZE;
+ }
+
+ /*
+ * Invalidate cached translations associated with 'vm->iommu' since
+ * we have now moved some pages from it.
+ */
+ iommu_invalidate_tlb(vm->iommu);
+
+ bzero(seg, sizeof(struct vm_memory_segment));
+}
+
+void
+vm_destroy(struct vm *vm)
+{
+ int i;
+
+ ppt_unassign_all(vm);
+
+ for (i = 0; i < vm->num_mem_segs; i++)
+ vm_free_mem_seg(vm, &vm->mem_segs[i]);
+
+ vm->num_mem_segs = 0;
+
+ for (i = 0; i < VM_MAXCPU; i++)
+ vcpu_cleanup(&vm->vcpu[i]);
+
+ iommu_destroy_domain(vm->iommu);
+
+ VMCLEANUP(vm->cookie);
+
+ free(vm, M_VM);
+}
+
+const char *
+vm_name(struct vm *vm)
+{
+ return (vm->name);
+}
+
+int
+vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
+ VM_PROT_RW, spok));
+}
+
+int
+vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
+ VM_PROT_NONE, spok));
+}
+
+/*
+ * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
+ */
+static boolean_t
+vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
+{
+ int i;
+ vm_paddr_t gpabase, gpalimit;
+
+ if (gpa & PAGE_MASK)
+ panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ gpabase = vm->mem_segs[i].gpa;
+ gpalimit = gpabase + vm->mem_segs[i].len;
+ if (gpa >= gpabase && gpa < gpalimit)
+ return (FALSE);
+ }
+
+ return (TRUE);
+}
+
+int
+vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ int error, available, allocated;
+ struct vm_memory_segment *seg;
+ vm_paddr_t g, hpa;
+ void *host_domain;
+
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
+ return (EINVAL);
+
+ available = allocated = 0;
+ g = gpa;
+ while (g < gpa + len) {
+ if (vm_gpa_available(vm, g))
+ available++;
+ else
+ allocated++;
+
+ g += PAGE_SIZE;
+ }
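+
+	/*
+	 * Illustrative case: a two-page request in which only the first
+	 * page already lies inside an existing segment ends the loop above
+	 * with allocated = 1 and available = 1 and is rejected below.
+	 */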
+
+ /*
+ * If there are some allocated and some available pages in the address
+ * range then it is an error.
+ */
+ if (allocated && available)
+ return (EINVAL);
+
+ /*
+ * If the entire address range being requested has already been
+ * allocated then there isn't anything more to do.
+ */
+ if (allocated && available == 0)
+ return (0);
+
+ if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
+ return (E2BIG);
+
+ host_domain = iommu_host_domain();
+
+ seg = &vm->mem_segs[vm->num_mem_segs];
+
+ error = 0;
+ seg->gpa = gpa;
+ seg->len = 0;
+ while (seg->len < len) {
+ hpa = vmm_mem_alloc(PAGE_SIZE);
+ if (hpa == 0) {
+ error = ENOMEM;
+ break;
+ }
+
+ error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
+ VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
+ if (error)
+ break;
+
+ /*
+ * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
+ * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
+ */
+ iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
+ iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
+
+ seg->len += PAGE_SIZE;
+ }
+
+ if (error) {
+ vm_free_mem_seg(vm, seg);
+ return (error);
+ }
+
+ /*
+ * Invalidate cached translations associated with 'host_domain' since
+ * we have now moved some pages from it.
+ */
+ iommu_invalidate_tlb(host_domain);
+
+ vm->num_mem_segs++;
+
+ return (0);
+}
+
+vm_paddr_t
+vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ vm_paddr_t nextpage;
+
+ nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
+ if (len > nextpage - gpa)
+ panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
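+
+	/*
+	 * E.g. with 4KB pages, gpa 0x1ffc and len 8 would span a page
+	 * boundary (nextpage = 0x2000, nextpage - gpa = 4) and panic above.
+	 */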
+
+ return (VMMMAP_GET(vm->cookie, gpa));
+}
+
+int
+vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg)
+{
+ int i;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ if (gpabase == vm->mem_segs[i].gpa) {
+ *seg = vm->mem_segs[i];
+ return (0);
+ }
+ }
+ return (-1);
+}
+
+int
+vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ return (VMGETREG(vm->cookie, vcpu, reg, retval));
+}
+
+int
+vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ return (VMSETREG(vm->cookie, vcpu, reg, val));
+}
+
+static boolean_t
+is_descriptor_table(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_IDTR:
+ case VM_REG_GUEST_GDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+static boolean_t
+is_segment_register(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_ES:
+ case VM_REG_GUEST_CS:
+ case VM_REG_GUEST_SS:
+ case VM_REG_GUEST_DS:
+ case VM_REG_GUEST_FS:
+ case VM_REG_GUEST_GS:
+ case VM_REG_GUEST_TR:
+ case VM_REG_GUEST_LDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+int
+vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMGETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+int
+vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMSETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+int
+vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid)
+{
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ *cpuid = VCPU_PINCPU(vm, vcpuid);
+
+ return (0);
+}
+
+int
+vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid)
+{
+ struct thread *td;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ td = curthread; /* XXXSMP only safe when muxing vcpus */
+
+ /* unpin */
+ if (host_cpuid < 0) {
+ VCPU_UNPIN(vm, vcpuid);
+ thread_lock(td);
+ sched_unbind(td);
+ thread_unlock(td);
+ return (0);
+ }
+
+ if (CPU_ABSENT(host_cpuid))
+ return (EINVAL);
+
+ /*
+ * XXX we should check that 'host_cpuid' has not already been pinned
+ * by another vm.
+ */
+ thread_lock(td);
+ sched_bind(td, host_cpuid);
+ thread_unlock(td);
+ VCPU_PIN(vm, vcpuid, host_cpuid);
+
+ return (0);
+}
+
+static void
+restore_guest_fpustate(struct vcpu *vcpu)
+{
+
+ /* flush host state to the pcb */
+ fpuexit(curthread);
+
+ /* restore guest FPU state */
+ fpu_stop_emulating();
+ fpurestore(vcpu->guestfpu);
+
+ /*
+ * The FPU is now "dirty" with the guest's state so turn on emulation
+ * to trap any access to the FPU by the host.
+ */
+ fpu_start_emulating();
+}
+
+static void
+save_guest_fpustate(struct vcpu *vcpu)
+{
+
+ if ((rcr0() & CR0_TS) == 0)
+ panic("fpu emulation not enabled in host!");
+
+ /* save guest FPU state */
+ fpu_stop_emulating();
+ fpusave(vcpu->guestfpu);
+ fpu_start_emulating();
+}
+
+static VMM_STAT_DEFINE(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
+
+int
+vm_run(struct vm *vm, struct vm_run *vmrun)
+{
+ int error, vcpuid, sleepticks, t;
+ struct vcpu *vcpu;
+ struct pcb *pcb;
+ uint64_t tscval, rip;
+ struct vm_exit *vme;
+
+ vcpuid = vmrun->cpuid;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ vme = &vmrun->vm_exit;
+ rip = vmrun->rip;
+restart:
+ critical_enter();
+
+ tscval = rdtsc();
+
+ pcb = PCPU_GET(curpcb);
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+
+ restore_guest_msrs(vm, vcpuid);
+ restore_guest_fpustate(vcpu);
+
+ vcpu->hostcpu = curcpu;
+ error = VMRUN(vm->cookie, vcpuid, rip);
+ vcpu->hostcpu = NOCPU;
+
+ save_guest_fpustate(vcpu);
+ restore_host_msrs(vm, vcpuid);
+
+ vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
+
+ /* copy the exit information */
+ bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
+
+ critical_exit();
+
+ /*
+ * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
+ * is ready to run.
+ */
+ if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
+ vcpu_lock(vcpu);
+
+ /*
+ * Figure out the number of host ticks until the next apic
+ * timer interrupt in the guest.
+ */
+ sleepticks = lapic_timer_tick(vm, vcpuid);
+
+ /*
+ * If the guest local apic timer is disabled then sleep for
+ * a long time but not forever.
+ */
+ if (sleepticks < 0)
+ sleepticks = hz;
+
+ /*
+ * Do a final check for pending NMI or interrupts before
+ * really putting this thread to sleep.
+ *
+ * These interrupts could have happened any time after we
+ * returned from VMRUN() and before we grabbed the vcpu lock.
+ */
+ if (!vm_nmi_pending(vm, vcpuid) &&
+ lapic_pending_intr(vm, vcpuid) < 0) {
+ if (sleepticks <= 0)
+ panic("invalid sleepticks %d", sleepticks);
+ t = ticks;
+ msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
+ vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+ }
+
+ vcpu_unlock(vcpu);
+
+ rip = vme->rip + vme->inst_length;
+ goto restart;
+ }
+
+ return (error);
+}
+
+int
+vm_inject_event(struct vm *vm, int vcpuid, int type,
+ int vector, uint32_t code, int code_valid)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+	if (type <= VM_EVENT_NONE || type >= VM_EVENT_MAX)
+ return (EINVAL);
+
+ if (vector < 0 || vector > 255)
+ return (EINVAL);
+
+ return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
+}
+
+static VMM_STAT_DEFINE(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
+
+int
+vm_inject_nmi(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu->nmi_pending = 1;
+ vm_interrupt_hostcpu(vm, vcpuid);
+ return (0);
+}
+
+int
+vm_nmi_pending(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ return (vcpu->nmi_pending);
+}
+
+void
+vm_nmi_clear(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (vcpu->nmi_pending == 0)
+ panic("vm_nmi_clear: inconsistent nmi_pending state");
+
+ vcpu->nmi_pending = 0;
+ vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
+}
+
+int
+vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMGETCAP(vm->cookie, vcpu, type, retval));
+}
+
+int
+vm_set_capability(struct vm *vm, int vcpu, int type, int val)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMSETCAP(vm->cookie, vcpu, type, val));
+}
+
+uint64_t *
+vm_guest_msrs(struct vm *vm, int cpu)
+{
+ return (vm->vcpu[cpu].guest_msrs);
+}
+
+struct vlapic *
+vm_lapic(struct vm *vm, int cpu)
+{
+ return (vm->vcpu[cpu].vlapic);
+}
+
+boolean_t
+vmm_is_pptdev(int bus, int slot, int func)
+{
+ int found, b, s, f, n;
+ char *val, *cp, *cp2;
+
+ /*
+ * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12"
+ */
+ found = 0;
+ cp = val = getenv("pptdevs");
+ while (cp != NULL && *cp != '\0') {
+ if ((cp2 = strchr(cp, ' ')) != NULL)
+ *cp2 = '\0';
+
+ n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
+ if (n == 3 && bus == b && slot == s && func == f) {
+ found = 1;
+ break;
+ }
+
+ if (cp2 != NULL)
+ *cp2++ = ' ';
+
+ cp = cp2;
+ }
+ freeenv(val);
+ return (found);
+}
+
+void *
+vm_iommu_domain(struct vm *vm)
+{
+
+ return (vm->iommu);
+}
+
+int
+vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
+{
+ int error;
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+
+ /*
+ * The following state transitions are allowed:
+ * IDLE -> RUNNING -> IDLE
+ * IDLE -> CANNOT_RUN -> IDLE
+ */
+ if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
+ (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
+ error = 0;
+ vcpu->state = state;
+ } else {
+ error = EBUSY;
+ }
+
+ vcpu_unlock(vcpu);
+
+ return (error);
+}
+
+enum vcpu_state
+vcpu_get_state(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+ enum vcpu_state state;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ state = vcpu->state;
+ vcpu_unlock(vcpu);
+
+ return (state);
+}
+
+void
+vm_activate_cpu(struct vm *vm, int vcpuid)
+{
+
+ if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
+ CPU_SET(vcpuid, &vm->active_cpus);
+}
+
+cpuset_t
+vm_active_cpus(struct vm *vm)
+{
+
+ return (vm->active_cpus);
+}
+
+void *
+vcpu_stats(struct vm *vm, int vcpuid)
+{
+
+ return (vm->vcpu[vcpuid].stats);
+}
+
+int
+vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ *state = vm->vcpu[vcpuid].x2apic_state;
+
+ return (0);
+}
+
+int
+vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (state < 0 || state >= X2APIC_STATE_LAST)
+ return (EINVAL);
+
+ vm->vcpu[vcpuid].x2apic_state = state;
+
+ vlapic_set_x2apic_state(vm, vcpuid, state);
+
+ return (0);
+}
+
+void
+vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
+{
+ int hostcpu;
+ struct vcpu *vcpu;
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ hostcpu = vcpu->hostcpu;
+ if (hostcpu == NOCPU) {
+ /*
+ * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
+ * the host thread must be sleeping waiting for an event to
+ * kick the vcpu out of 'hlt'.
+ *
+ * XXX this is racy because the condition exists right before
+ * and after calling VMRUN() in vm_run(). The wakeup() is
+ * benign in this case.
+ */
+ if (vcpu->state == VCPU_RUNNING)
+ wakeup_one(vcpu);
+ } else {
+ if (vcpu->state != VCPU_RUNNING)
+ panic("invalid vcpu state %d", vcpu->state);
+ if (hostcpu != curcpu)
+ ipi_cpu(hostcpu, vmm_ipinum);
+ }
+ vcpu_unlock(vcpu);
+}
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
new file mode 100644
index 0000000..0150ebd
--- /dev/null
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -0,0 +1,538 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/sysctl.h>
+#include <sys/libkern.h>
+#include <sys/ioccom.h>
+#include <sys/mman.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmm_lapic.h"
+#include "vmm_stat.h"
+#include "vmm_mem.h"
+#include "io/ppt.h"
+#include <machine/vmm_dev.h>
+
+struct vmmdev_softc {
+ struct vm *vm; /* vm instance cookie */
+ struct cdev *cdev;
+ SLIST_ENTRY(vmmdev_softc) link;
+};
+static SLIST_HEAD(, vmmdev_softc) head;
+
+static struct mtx vmmdev_mtx;
+
+static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
+
+SYSCTL_DECL(_hw_vmm);
+
+static struct vmmdev_softc *
+vmmdev_lookup(const char *name)
+{
+ struct vmmdev_softc *sc;
+
+#ifdef notyet /* XXX kernel is not compiled with invariants */
+ mtx_assert(&vmmdev_mtx, MA_OWNED);
+#endif
+
+ SLIST_FOREACH(sc, &head, link) {
+ if (strcmp(name, vm_name(sc->vm)) == 0)
+ break;
+ }
+
+ return (sc);
+}
+
+static struct vmmdev_softc *
+vmmdev_lookup2(struct cdev *cdev)
+{
+
+ return (cdev->si_drv1);
+}
+
+static int
+vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
+{
+ int error, off, c;
+ vm_paddr_t hpa, gpa;
+ struct vmmdev_softc *sc;
+
+ static char zerobuf[PAGE_SIZE];
+
+ error = 0;
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup2(cdev);
+ if (sc == NULL)
+ error = ENXIO;
+
+ while (uio->uio_resid > 0 && error == 0) {
+ gpa = uio->uio_offset;
+ off = gpa & PAGE_MASK;
+ c = min(uio->uio_resid, PAGE_SIZE - off);
+
+ /*
+ * The VM has a hole in its physical memory map. If we want to
+ * use 'dd' to inspect memory beyond the hole we need to
+ * provide bogus data for memory that lies in the hole.
+ *
+ * Since this device does not support lseek(2), dd(1) will
+ * read(2) blocks of data to simulate the lseek(2).
+ */
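+		/*
+		 * Illustrative invocation (hypothetical VM name):
+		 *	dd if=/dev/vmm/testvm bs=4k skip=768 count=1
+		 * reads the page at gpa 0x300000 and sees zeroes if that
+		 * page falls inside the hole.
+		 */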
+ hpa = vm_gpa2hpa(sc->vm, gpa, c);
+ if (hpa == (vm_paddr_t)-1) {
+ if (uio->uio_rw == UIO_READ)
+ error = uiomove(zerobuf, c, uio);
+ else
+ error = EFAULT;
+ } else
+ error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio);
+ }
+
+ mtx_unlock(&vmmdev_mtx);
+ return (error);
+}
+
+static int
+vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
+ struct thread *td)
+{
+ int error, vcpu, state_changed;
+ enum vcpu_state new_state;
+ struct vmmdev_softc *sc;
+ struct vm_memory_segment *seg;
+ struct vm_register *vmreg;
+	struct vm_seg_desc *vmsegdesc;
+ struct vm_pin *vmpin;
+ struct vm_run *vmrun;
+ struct vm_event *vmevent;
+ struct vm_lapic_irq *vmirq;
+ struct vm_capability *vmcap;
+ struct vm_pptdev *pptdev;
+ struct vm_pptdev_mmio *pptmmio;
+ struct vm_pptdev_msi *pptmsi;
+ struct vm_pptdev_msix *pptmsix;
+ struct vm_nmi *vmnmi;
+ struct vm_stats *vmstats;
+ struct vm_stat_desc *statdesc;
+ struct vm_x2apic *x2apic;
+
+ sc = vmmdev_lookup2(cdev);
+ if (sc == NULL)
+ return (ENXIO);
+
+ vcpu = -1;
+ state_changed = 0;
+
+ /*
+ * Some VMM ioctls can operate only on vcpus that are not running.
+ */
+ switch (cmd) {
+ case VM_RUN:
+ case VM_SET_PINNING:
+ case VM_GET_REGISTER:
+ case VM_SET_REGISTER:
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ case VM_INJECT_EVENT:
+ case VM_GET_CAPABILITY:
+ case VM_SET_CAPABILITY:
+ case VM_PPTDEV_MSI:
+ case VM_PPTDEV_MSIX:
+ case VM_SET_X2APIC_STATE:
+ /*
+ * XXX fragile, handle with care
+ * Assumes that the first field of the ioctl data is the vcpu.
+ */
+ vcpu = *(int *)data;
+ if (vcpu < 0 || vcpu >= VM_MAXCPU) {
+ error = EINVAL;
+ goto done;
+ }
+
+ if (cmd == VM_RUN)
+ new_state = VCPU_RUNNING;
+ else
+ new_state = VCPU_CANNOT_RUN;
+
+ error = vcpu_set_state(sc->vm, vcpu, new_state);
+ if (error)
+ goto done;
+
+ state_changed = 1;
+ break;
+
+ case VM_MAP_PPTDEV_MMIO:
+ case VM_BIND_PPTDEV:
+ case VM_UNBIND_PPTDEV:
+ case VM_MAP_MEMORY:
+ /*
+ * ioctls that operate on the entire virtual machine must
+ * prevent all vcpus from running.
+ */
+ error = 0;
+ for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
+ error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN);
+ if (error)
+ break;
+ }
+
+ if (error) {
+ while (--vcpu >= 0)
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ goto done;
+ }
+
+ state_changed = 2;
+ break;
+
+ default:
+ break;
+ }
+
+ switch(cmd) {
+ case VM_RUN:
+ vmrun = (struct vm_run *)data;
+ error = vm_run(sc->vm, vmrun);
+ break;
+ case VM_STAT_DESC: {
+ const char *desc;
+ statdesc = (struct vm_stat_desc *)data;
+ desc = vmm_stat_desc(statdesc->index);
+ if (desc != NULL) {
+ error = 0;
+ strlcpy(statdesc->desc, desc, sizeof(statdesc->desc));
+ } else
+ error = EINVAL;
+ break;
+ }
+ case VM_STATS: {
+ CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES);
+ vmstats = (struct vm_stats *)data;
+ getmicrotime(&vmstats->tv);
+ error = vmm_stat_copy(sc->vm, vmstats->cpuid,
+ &vmstats->num_entries, vmstats->statbuf);
+ break;
+ }
+ case VM_PPTDEV_MSI:
+ pptmsi = (struct vm_pptdev_msi *)data;
+ error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
+ pptmsi->bus, pptmsi->slot, pptmsi->func,
+ pptmsi->destcpu, pptmsi->vector,
+ pptmsi->numvec);
+ break;
+ case VM_PPTDEV_MSIX:
+ pptmsix = (struct vm_pptdev_msix *)data;
+ error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
+ pptmsix->bus, pptmsix->slot,
+ pptmsix->func, pptmsix->idx,
+ pptmsix->msg, pptmsix->vector_control,
+ pptmsix->addr);
+ break;
+ case VM_MAP_PPTDEV_MMIO:
+ pptmmio = (struct vm_pptdev_mmio *)data;
+ error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
+ pptmmio->func, pptmmio->gpa, pptmmio->len,
+ pptmmio->hpa);
+ break;
+ case VM_BIND_PPTDEV:
+ pptdev = (struct vm_pptdev *)data;
+ error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
+ break;
+ case VM_UNBIND_PPTDEV:
+ pptdev = (struct vm_pptdev *)data;
+ error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
+ break;
+ case VM_INJECT_EVENT:
+ vmevent = (struct vm_event *)data;
+ error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
+ vmevent->vector,
+ vmevent->error_code,
+ vmevent->error_code_valid);
+ break;
+ case VM_INJECT_NMI:
+ vmnmi = (struct vm_nmi *)data;
+ error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
+ break;
+ case VM_LAPIC_IRQ:
+ vmirq = (struct vm_lapic_irq *)data;
+ error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector);
+ break;
+ case VM_SET_PINNING:
+ vmpin = (struct vm_pin *)data;
+ error = vm_set_pinning(sc->vm, vmpin->vm_cpuid,
+ vmpin->host_cpuid);
+ break;
+ case VM_GET_PINNING:
+ vmpin = (struct vm_pin *)data;
+ error = vm_get_pinning(sc->vm, vmpin->vm_cpuid,
+ &vmpin->host_cpuid);
+ break;
+ case VM_MAP_MEMORY:
+ seg = (struct vm_memory_segment *)data;
+ error = vm_malloc(sc->vm, seg->gpa, seg->len);
+ break;
+ case VM_GET_MEMORY_SEG:
+ seg = (struct vm_memory_segment *)data;
+ seg->len = 0;
+ (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
+ error = 0;
+ break;
+ case VM_GET_REGISTER:
+ vmreg = (struct vm_register *)data;
+ error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+ &vmreg->regval);
+ break;
+ case VM_SET_REGISTER:
+ vmreg = (struct vm_register *)data;
+ error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+ vmreg->regval);
+ break;
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ vmsegdesc = (struct vm_seg_desc *)data;
+ error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
+ vmsegdesc->regnum,
+ &vmsegdesc->desc);
+ break;
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ vmsegdesc = (struct vm_seg_desc *)data;
+ error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
+ vmsegdesc->regnum,
+ &vmsegdesc->desc);
+ break;
+ case VM_GET_CAPABILITY:
+ vmcap = (struct vm_capability *)data;
+ error = vm_get_capability(sc->vm, vmcap->cpuid,
+ vmcap->captype,
+ &vmcap->capval);
+ break;
+ case VM_SET_CAPABILITY:
+ vmcap = (struct vm_capability *)data;
+ error = vm_set_capability(sc->vm, vmcap->cpuid,
+ vmcap->captype,
+ vmcap->capval);
+ break;
+ case VM_SET_X2APIC_STATE:
+ x2apic = (struct vm_x2apic *)data;
+ error = vm_set_x2apic_state(sc->vm,
+ x2apic->cpuid, x2apic->state);
+ break;
+ case VM_GET_X2APIC_STATE:
+ x2apic = (struct vm_x2apic *)data;
+ error = vm_get_x2apic_state(sc->vm,
+ x2apic->cpuid, &x2apic->state);
+ break;
+ default:
+ error = ENOTTY;
+ break;
+ }
+
+ if (state_changed == 1) {
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ } else if (state_changed == 2) {
+ for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ }
+
+done:
+ return (error);
+}
+
+static int
+vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, vm_memattr_t *memattr)
+{
+ int error;
+ struct vmmdev_softc *sc;
+
+ error = -1;
+ mtx_lock(&vmmdev_mtx);
+
+ sc = vmmdev_lookup2(cdev);
+ if (sc != NULL && (nprot & PROT_EXEC) == 0) {
+ *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE);
+ if (*paddr != (vm_paddr_t)-1)
+ error = 0;
+ }
+
+ mtx_unlock(&vmmdev_mtx);
+
+ return (error);
+}
+
+static void
+vmmdev_destroy(struct vmmdev_softc *sc, boolean_t unlink)
+{
+
+ /*
+ * XXX must stop virtual machine instances that may be still
+ * running and cleanup their state.
+ */
+ if (sc->cdev)
+ destroy_dev(sc->cdev);
+
+ if (sc->vm)
+ vm_destroy(sc->vm);
+
+ if (unlink) {
+ mtx_lock(&vmmdev_mtx);
+ SLIST_REMOVE(&head, sc, vmmdev_softc, link);
+ mtx_unlock(&vmmdev_mtx);
+ }
+
+ free(sc, M_VMMDEV);
+}
+
+static int
+sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ char buf[VM_MAX_NAMELEN];
+ struct vmmdev_softc *sc;
+
+ strlcpy(buf, "beavis", sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ /*
+ * XXX TODO if any process has this device open then fail
+ */
+
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup(buf);
+ if (sc == NULL) {
+ mtx_unlock(&vmmdev_mtx);
+ return (EINVAL);
+ }
+
+ sc->cdev->si_drv1 = NULL;
+ mtx_unlock(&vmmdev_mtx);
+
+ vmmdev_destroy(sc, TRUE);
+
+ return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, sysctl_vmm_destroy, "A", NULL);
+
+static struct cdevsw vmmdevsw = {
+ .d_name = "vmmdev",
+ .d_version = D_VERSION,
+ .d_ioctl = vmmdev_ioctl,
+ .d_mmap = vmmdev_mmap,
+ .d_read = vmmdev_rw,
+ .d_write = vmmdev_rw,
+};
+
+static int
+sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct vm *vm;
+ struct vmmdev_softc *sc, *sc2;
+ char buf[VM_MAX_NAMELEN];
+
+ strlcpy(buf, "beavis", sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup(buf);
+ mtx_unlock(&vmmdev_mtx);
+ if (sc != NULL)
+ return (EEXIST);
+
+ vm = vm_create(buf);
+ if (vm == NULL)
+ return (EINVAL);
+
+ sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
+ sc->vm = vm;
+
+ /*
+ * Lookup the name again just in case somebody sneaked in when we
+ * dropped the lock.
+ */
+ mtx_lock(&vmmdev_mtx);
+ sc2 = vmmdev_lookup(buf);
+ if (sc2 == NULL)
+ SLIST_INSERT_HEAD(&head, sc, link);
+ mtx_unlock(&vmmdev_mtx);
+
+ if (sc2 != NULL) {
+ vmmdev_destroy(sc, FALSE);
+ return (EEXIST);
+ }
+
+ sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+ "vmm/%s", buf);
+ sc->cdev->si_drv1 = sc;
+
+ return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, sysctl_vmm_create, "A", NULL);
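+
+/*
+ * Illustrative usage (hypothetical VM name):
+ *	sysctl hw.vmm.create=testvm	# creates /dev/vmm/testvm
+ *	sysctl hw.vmm.destroy=testvm	# tears it down
+ */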
+
+void
+vmmdev_init(void)
+{
+ mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
+}
+
+int
+vmmdev_cleanup(void)
+{
+ int error;
+
+ if (SLIST_EMPTY(&head))
+ error = 0;
+ else
+ error = EBUSY;
+
+ return (error);
+}
diff --git a/sys/amd64/vmm/vmm_host.c b/sys/amd64/vmm/vmm_host.c
new file mode 100644
index 0000000..8dfef73
--- /dev/null
+++ b/sys/amd64/vmm/vmm_host.c
@@ -0,0 +1,124 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/pcpu.h>
+
+#include <machine/cpufunc.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+
+#include "vmm_host.h"
+
+static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4;
+
+void
+vmm_host_state_init(void)
+{
+
+ vmm_host_efer = rdmsr(MSR_EFER);
+ vmm_host_pat = rdmsr(MSR_PAT);
+
+ /*
+ * We always want CR0.TS to be set when the processor does a VM exit.
+ *
+ * With emulation turned on unconditionally after a VM exit, we are
+ * able to trap inadvertent use of the FPU until the guest FPU state
+ * has been safely squirreled away.
+ */
+ vmm_host_cr0 = rcr0() | CR0_TS;
+
+ vmm_host_cr4 = rcr4();
+}
+
+uint64_t
+vmm_get_host_pat(void)
+{
+
+ return (vmm_host_pat);
+}
+
+uint64_t
+vmm_get_host_efer(void)
+{
+
+ return (vmm_host_efer);
+}
+
+uint64_t
+vmm_get_host_cr0(void)
+{
+
+ return (vmm_host_cr0);
+}
+
+uint64_t
+vmm_get_host_cr4(void)
+{
+
+ return (vmm_host_cr4);
+}
+
+uint64_t
+vmm_get_host_datasel(void)
+{
+
+ return (GSEL(GDATA_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_codesel(void)
+{
+
+ return (GSEL(GCODE_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_tsssel(void)
+{
+
+ return (GSEL(GPROC0_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_fsbase(void)
+{
+
+ return (0);
+}
+
+uint64_t
+vmm_get_host_idtrbase(void)
+{
+
+ return (r_idt.rd_base);
+}
diff --git a/sys/amd64/vmm/vmm_host.h b/sys/amd64/vmm/vmm_host.h
new file mode 100644
index 0000000..839f54a
--- /dev/null
+++ b/sys/amd64/vmm/vmm_host.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_HOST_H_
+#define _VMM_HOST_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+void vmm_host_state_init(void);
+
+uint64_t vmm_get_host_pat(void);
+uint64_t vmm_get_host_efer(void);
+uint64_t vmm_get_host_cr0(void);
+uint64_t vmm_get_host_cr4(void);
+uint64_t vmm_get_host_datasel(void);
+uint64_t vmm_get_host_codesel(void);
+uint64_t vmm_get_host_tsssel(void);
+uint64_t vmm_get_host_fsbase(void);
+uint64_t vmm_get_host_idtrbase(void);
+
+/*
+ * Inline access to host state that is used on every VM entry
+ */
+static __inline uint64_t
+vmm_get_host_trbase(void)
+{
+
+ return ((uint64_t)PCPU_GET(tssp));
+}
+
+static __inline uint64_t
+vmm_get_host_gdtrbase(void)
+{
+
+ return ((uint64_t)&gdt[NGDT * curcpu]);
+}
+
+struct pcpu;
+extern struct pcpu __pcpu[];
+
+static __inline uint64_t
+vmm_get_host_gsbase(void)
+{
+
+ return ((uint64_t)&__pcpu[curcpu]);
+}
+
+#endif
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
new file mode 100644
index 0000000..e73f6bb
--- /dev/null
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -0,0 +1,810 @@
+/*-
+ * Copyright (c) 2012 Sandvine, Inc.
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/pcpu.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+#include <machine/vmm.h>
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <sys/errno.h>
+
+#include <machine/vmm.h>
+
+#include <vmmapi.h>
+#endif /* _KERNEL */
+
+
+/* struct vie_op.op_type */
+enum {
+ VIE_OP_TYPE_NONE = 0,
+ VIE_OP_TYPE_MOV,
+ VIE_OP_TYPE_AND,
+ VIE_OP_TYPE_LAST
+};
+
+/* struct vie_op.op_flags */
+#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */
+#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+
+static const struct vie_op one_byte_opcodes[256] = {
+ [0x89] = {
+ .op_byte = 0x89,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0x8B] = {
+ .op_byte = 0x8B,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0xC7] = {
+ .op_byte = 0xC7,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM,
+ },
+ [0x23] = {
+ .op_byte = 0x23,
+ .op_type = VIE_OP_TYPE_AND,
+ },
+ [0x81] = {
+ /* XXX Group 1 extended opcode - not just AND */
+ .op_byte = 0x81,
+ .op_type = VIE_OP_TYPE_AND,
+ .op_flags = VIE_OP_F_IMM,
+ }
+};
+
+/* struct vie.mod */
+#define VIE_MOD_INDIRECT 0
+#define VIE_MOD_INDIRECT_DISP8 1
+#define VIE_MOD_INDIRECT_DISP32 2
+#define VIE_MOD_DIRECT 3
+
+/* struct vie.rm */
+#define VIE_RM_SIB 4
+#define VIE_RM_DISP32 5
+
+#define GB (1024 * 1024 * 1024)
+
+static enum vm_reg_name gpr_map[16] = {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15
+};
+
+static uint64_t size2mask[] = {
+ [1] = 0xff,
+ [2] = 0xffff,
+ [4] = 0xffffffff,
+ [8] = 0xffffffffffffffff,
+};
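+
+/*
+ * Note: per x86-64 convention a 4-byte register write zero-extends into
+ * the upper 32 bits, while 1- and 2-byte writes preserve the remaining
+ * bytes; vie_update_register() below applies size2mask accordingly.
+ */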
+
+static int
+vie_valid_register(enum vm_reg_name reg)
+{
+#ifdef _KERNEL
+ /*
+ * XXX
+ * The operand register in which we store the result of the
+ * read must be a GPR that we can modify even if the vcpu
+ * is "running". All the GPRs qualify except for %rsp.
+ *
+ * This is a limitation of the vm_set_register() API
+ * and can be fixed if necessary.
+ */
+ if (reg == VM_REG_GUEST_RSP)
+ return (0);
+#endif
+ return (1);
+}
+
+static int
+vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
+{
+ int error;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ error = vm_get_register(vm, vcpuid, reg, rval);
+
+ return (error);
+}
+
+static int
+vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+ uint64_t val, int size)
+{
+ int error;
+ uint64_t origval;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ switch (size) {
+ case 1:
+ case 2:
+ error = vie_read_register(vm, vcpuid, reg, &origval);
+ if (error)
+ return (error);
+ val &= size2mask[size];
+ val |= origval & ~size2mask[size];
+ break;
+ case 4:
+ val &= 0xffffffffUL;
+ break;
+ case 8:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ error = vm_set_register(vm, vcpuid, reg, val);
+ return (error);
+}
+
+/*
+ * The following simplifying assumptions are made during emulation:
+ *
+ * - guest is in 64-bit mode
+ * - default address size is 64-bits
+ * - default operand size is 32-bits
+ *
+ * - operand size override is not supported
+ *
+ * - address size override is not supported
+ */
+static int
+emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x89:
+ /*
+ * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 89/r: mov r/m32, r32
+ * REX.W + 89/r mov r/m64, r64
+ */
+ if (vie->rex_w)
+ size = 8;
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0x8B:
+ /*
+ * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8B/r: mov r32, r/m32
+ * REX.W 8B/r: mov r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = gpr_map[vie->reg];
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xC7:
+ /*
+ * MOV from imm32 to mem (ModRM:r/m)
+ * C7/0 mov r/m32, imm32
+ * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
+ */
+ val = vie->immediate; /* already sign-extended */
+
+ if (vie->rex_w)
+ size = 8;
+
+ if (size != 8)
+ val &= size2mask[size];
+
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ break;
+ default:
+ break;
+ }
+
+ return (error);
+}
+
+static int
+emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val1, val2;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x23:
+ /*
+ * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
+ * result in reg.
+ *
+ * 23/r and r32, r/m32
+ * REX.W + 23/r and r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+
+ /* get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val1);
+ if (error)
+ break;
+
+ /* get the second operand */
+ error = memread(vm, vcpuid, gpa, &val2, size, arg);
+ if (error)
+ break;
+
+ /* perform the operation and write the result */
+ val1 &= val2;
+ error = vie_update_register(vm, vcpuid, reg, val1, size);
+ break;
+ case 0x81:
+ /*
+		 * AND mem (ModRM:r/m) with immediate and store the
+		 * result in mem.
+		 *
+		 * 81/4		and r/m32, imm32
+		 * REX.W + 81/4	and r/m64, imm32 sign-extended to 64
+ *
+ * Currently, only the AND operation of the 0x81 opcode
+ * is implemented (ModRM:reg = b100).
+ */
+ if ((vie->reg & 7) != 4)
+ break;
+
+ if (vie->rex_w)
+ size = 8;
+
+ /* get the first operand */
+ error = memread(vm, vcpuid, gpa, &val1, size, arg);
+ if (error)
+ break;
+
+ /*
+ * perform the operation with the pre-fetched immediate
+ * operand and write the result
+ */
+ val1 &= vie->immediate;
+ error = memwrite(vm, vcpuid, gpa, val1, size, arg);
+ break;
+ default:
+ break;
+ }
+ return (error);
+}
+
+int
+vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite,
+ void *memarg)
+{
+ int error;
+
+ if (!vie->decoded)
+ return (EINVAL);
+
+ switch (vie->op.op_type) {
+ case VIE_OP_TYPE_MOV:
+ error = emulate_mov(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_AND:
+ error = emulate_and(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+#ifdef _KERNEL
+static void
+vie_init(struct vie *vie)
+{
+
+ bzero(vie, sizeof(struct vie));
+
+ vie->base_register = VM_REG_LAST;
+ vie->index_register = VM_REG_LAST;
+}
+
+static int
+gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
+ uint64_t *gpa, uint64_t *gpaend)
+{
+ vm_paddr_t hpa;
+ int nlevels, ptpshift, ptpindex;
+ uint64_t *ptpbase, pte, pgsize;
+
+ /*
+ * XXX assumes 64-bit guest with 4 page walk levels
+ */
+ nlevels = 4;
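+	/*
+	 * Illustrative walk: on the first (PML4) iteration nlevels drops
+	 * to 3, so ptpshift = 12 + 3 * 9 = 39 and ptpindex selects bits
+	 * 47:39 of 'gla'; each lower level peels off another 9 bits.
+	 */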
+ while (--nlevels >= 0) {
+ /* Zero out the lower 12 bits and the upper 12 bits */
+ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
+
+ hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE);
+ if (hpa == -1)
+ goto error;
+
+ ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa);
+
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gla >> ptpshift) & 0x1FF;
+ pgsize = 1UL << ptpshift;
+
+ pte = ptpbase[ptpindex];
+
+ if ((pte & PG_V) == 0)
+ goto error;
+
+ if (pte & PG_PS) {
+ if (pgsize > 1 * GB)
+ goto error;
+ else
+ break;
+ }
+
+ ptpphys = pte;
+ }
+
+ /* Zero out the lower 'ptpshift' bits and the upper 12 bits */
+ pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
+ *gpa = pte | (gla & (pgsize - 1));
+ *gpaend = pte + pgsize;
+ return (0);
+
+error:
+ return (-1);
+}
+
+int
+vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
+ uint64_t cr3, struct vie *vie)
+{
+ int n, err;
+ uint64_t hpa, gpa, gpaend, off;
+
+ /*
+ * XXX cache previously fetched instructions using 'rip' as the tag
+ */
+
+ if (inst_length > VIE_INST_SIZE)
+ panic("vmm_fetch_instruction: invalid length %d", inst_length);
+
+ vie_init(vie);
+
+ /* Copy the instruction into 'vie' */
+ while (vie->num_valid < inst_length) {
+ err = gla2gpa(vm, rip, cr3, &gpa, &gpaend);
+ if (err)
+ break;
+
+ off = gpa & PAGE_MASK;
+ n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
+
+ hpa = vm_gpa2hpa(vm, gpa, n);
+ if (hpa == -1)
+ break;
+
+ bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n);
+
+ rip += n;
+ vie->num_valid += n;
+ }
+
+ if (vie->num_valid == inst_length)
+ return (0);
+ else
+ return (-1);
+}
+
+static int
+vie_peek(struct vie *vie, uint8_t *x)
+{
+
+ if (vie->num_processed < vie->num_valid) {
+ *x = vie->inst[vie->num_processed];
+ return (0);
+ } else
+ return (-1);
+}
+
+static void
+vie_advance(struct vie *vie)
+{
+
+ vie->num_processed++;
+}
+
+static int
+decode_rex(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
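+	/*
+	 * Example: a REX prefix of 0x48 (0100.1000b) sets only REX.W,
+	 * selecting a 64-bit operand size.
+	 */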
+ if (x >= 0x40 && x <= 0x4F) {
+ vie->rex_w = x & 0x8 ? 1 : 0;
+ vie->rex_r = x & 0x4 ? 1 : 0;
+ vie->rex_x = x & 0x2 ? 1 : 0;
+ vie->rex_b = x & 0x1 ? 1 : 0;
+
+ vie_advance(vie);
+ }
+
+ return (0);
+}
+
+static int
+decode_opcode(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ vie->op = one_byte_opcodes[x];
+
+ if (vie->op.op_type == VIE_OP_TYPE_NONE)
+ return (-1);
+
+ vie_advance(vie);
+ return (0);
+}
+
+/*
+ * XXX assuming 32-bit or 64-bit guest
+ */
+static int
+decode_modrm(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ vie->mod = (x >> 6) & 0x3;
+ vie->rm = (x >> 0) & 0x7;
+ vie->reg = (x >> 3) & 0x7;
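+
+	/*
+	 * Example: a ModRM byte of 0x4b decodes as mod=01 (disp8),
+	 * reg=001 (%rcx) and r/m=011 (%rbx), before the REX extension
+	 * bits are folded in below.
+	 */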
+
+ /*
+ * A direct addressing mode makes no sense in the context of an EPT
+ * fault. There has to be a memory access involved to cause the
+ * EPT fault.
+ */
+ if (vie->mod == VIE_MOD_DIRECT)
+ return (-1);
+
+ if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
+ (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
+ /*
+ * Table 2-5: Special Cases of REX Encodings
+ *
+ * mod=0, r/m=5 is used in the compatibility mode to
+ * indicate a disp32 without a base register.
+ *
+ * mod!=3, r/m=4 is used in the compatibility mode to
+ * indicate that the SIB byte is present.
+ *
+ * The 'b' bit in the REX prefix is don't care in
+ * this case.
+ */
+ } else {
+ vie->rm |= (vie->rex_b << 3);
+ }
+
+ vie->reg |= (vie->rex_r << 3);
+
+ /* SIB */
+ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
+ goto done;
+
+ vie->base_register = gpr_map[vie->rm];
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ case VIE_MOD_INDIRECT:
+ if (vie->rm == VIE_RM_DISP32) {
+ vie->disp_bytes = 4;
+ vie->base_register = VM_REG_LAST; /* no base */
+ }
+ break;
+ }
+
+ /* Figure out immediate operand size (if any) */
+ if (vie->op.op_flags & VIE_OP_F_IMM)
+ vie->imm_bytes = 4;
+ else if (vie->op.op_flags & VIE_OP_F_IMM8)
+ vie->imm_bytes = 1;
+
+done:
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_sib(struct vie *vie)
+{
+ uint8_t x;
+
+ /* Proceed only if SIB byte is present */
+ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
+ return (0);
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ /* De-construct the SIB byte */
+ vie->ss = (x >> 6) & 0x3;
+ vie->index = (x >> 3) & 0x7;
+ vie->base = (x >> 0) & 0x7;
+
+ /* Apply the REX prefix modifiers */
+ vie->index |= vie->rex_x << 3;
+ vie->base |= vie->rex_b << 3;
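+
+	/*
+	 * Example: a SIB byte of 0xd1 decodes as ss=11, index=010 and
+	 * base=001, i.e. %rcx as base plus %rdx scaled by 8, absent any
+	 * REX.X/REX.B extension.
+	 */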
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ }
+
+ if (vie->mod == VIE_MOD_INDIRECT &&
+ (vie->base == 5 || vie->base == 13)) {
+ /*
+ * Special case when base register is unused if mod = 0
+ * and base = %rbp or %r13.
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ vie->disp_bytes = 4;
+ } else {
+ vie->base_register = gpr_map[vie->base];
+ }
+
+ /*
+ * All encodings of 'index' are valid except for %rsp (4).
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ if (vie->index != 4)
+ vie->index_register = gpr_map[vie->index];
+
+ /* 'scale' makes sense only in the context of an index register */
+ if (vie->index_register < VM_REG_LAST)
+ vie->scale = 1 << vie->ss;
+
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_displacement(struct vie *vie)
+{
+ int n, i;
+ uint8_t x;
+
+ union {
+ char buf[4];
+ int8_t signed8;
+ int32_t signed32;
+ } u;
+
+ if ((n = vie->disp_bytes) == 0)
+ return (0);
+
+ if (n != 1 && n != 4)
+ panic("decode_displacement: invalid disp_bytes %d", n);
+
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+
+ if (n == 1)
+ vie->displacement = u.signed8; /* sign-extended */
+ else
+ vie->displacement = u.signed32; /* sign-extended */
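+	/*
+	 * E.g. a single displacement byte of 0xf8 is read through the
+	 * 'signed8' member above and sign-extends to -8.
+	 */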
+
+ return (0);
+}
+
+static int
+decode_immediate(struct vie *vie)
+{
+ int i, n;
+ uint8_t x;
+ union {
+ char buf[4];
+ int8_t signed8;
+ int32_t signed32;
+ } u;
+
+ if ((n = vie->imm_bytes) == 0)
+ return (0);
+
+ if (n != 1 && n != 4)
+ panic("decode_immediate: invalid imm_bytes %d", n);
+
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+
+ if (n == 1)
+ vie->immediate = u.signed8; /* sign-extended */
+ else
+ vie->immediate = u.signed32; /* sign-extended */
+
+ return (0);
+}
+
+#define VERIFY_GLA
+/*
+ * Verify that the 'guest linear address' provided as collateral of the nested
+ * page table fault matches with our instruction decoding.
+ */
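+/*
+ * The recomputed effective address is base + scale * index + displacement;
+ * e.g. for a decoded "mov (%rbx,%rcx,8), %eax" with zero displacement the
+ * check below requires gla == %rbx + 8 * %rcx. (Illustrative instruction
+ * only.)
+ */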
+#ifdef VERIFY_GLA
+static int
+verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+ int error;
+ uint64_t base, idx;
+
+ base = 0;
+ if (vie->base_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->base_register, &base);
+ if (error) {
+ printf("verify_gla: error %d getting base reg %d\n",
+ error, vie->base_register);
+ return (-1);
+ }
+ }
+
+ idx = 0;
+ if (vie->index_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->index_register, &idx);
+ if (error) {
+ printf("verify_gla: error %d getting index reg %d\n",
+ error, vie->index_register);
+ return (-1);
+ }
+ }
+
+ if (base + vie->scale * idx + vie->displacement != gla) {
+ printf("verify_gla mismatch: "
+ "base(0x%0lx), scale(%d), index(0x%0lx), "
+ "disp(0x%0lx), gla(0x%0lx)\n",
+ base, vie->scale, idx, vie->displacement, gla);
+ return (-1);
+ }
+
+ return (0);
+}
+#endif /* VERIFY_GLA */
+
+int
+vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+
+ if (decode_rex(vie))
+ return (-1);
+
+ if (decode_opcode(vie))
+ return (-1);
+
+ if (decode_modrm(vie))
+ return (-1);
+
+ if (decode_sib(vie))
+ return (-1);
+
+ if (decode_displacement(vie))
+ return (-1);
+
+ if (decode_immediate(vie))
+ return (-1);
+
+#ifdef VERIFY_GLA
+ if (verify_gla(vm, cpuid, gla, vie))
+ return (-1);
+#endif
+
+ vie->decoded = 1; /* success */
+
+ return (0);
+}
+#endif /* _KERNEL */
diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c
new file mode 100644
index 0000000..643d326
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ipi.c
@@ -0,0 +1,93 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/bus.h>
+
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
+#include <machine/segments.h>
+#include <machine/md_var.h>
+
+#include <machine/vmm.h>
+#include "vmm_ipi.h"
+
+extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn);
+
+/*
+ * The default is to use the IPI_AST to interrupt a vcpu.
+ */
+int vmm_ipinum = IPI_AST;
+
+CTASSERT(APIC_SPURIOUS_INT == 255);
+
+void
+vmm_ipi_init(void)
+{
+ int idx;
+ uintptr_t func;
+ struct gate_descriptor *ip;
+
+ /*
+ * Search backwards from the highest IDT vector available for use
+ * as our IPI vector. We install the 'justreturn' handler at that
+ * vector and use it to interrupt the vcpus.
+ *
+ * We do this because the IPI_AST is heavyweight and saves all
+ * registers in the trapframe. This is overkill for our use case
+ * which is simply to EOI the interrupt and return.
+ */
+ idx = APIC_SPURIOUS_INT;
+ while (--idx >= APIC_IPI_INTS) {
+ ip = &idt[idx];
+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
+ if (func == (uintptr_t)&IDTVEC(rsvd)) {
+ vmm_ipinum = idx;
+ setidt(vmm_ipinum, IDTVEC(justreturn), SDT_SYSIGT,
+ SEL_KPL, 0);
+ break;
+ }
+ }
+
+ if (vmm_ipinum != IPI_AST && bootverbose) {
+ printf("vmm_ipi_init: installing ipi handler to interrupt "
+ "vcpus at vector %d\n", vmm_ipinum);
+ }
+}
+
+void
+vmm_ipi_cleanup(void)
+{
+ if (vmm_ipinum != IPI_AST)
+ setidt(vmm_ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+}
diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h
new file mode 100644
index 0000000..91552e3
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ipi.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_IPI_H_
+#define _VMM_IPI_H_
+
+struct vm;
+
+extern int vmm_ipinum;
+
+void vmm_ipi_init(void);
+void vmm_ipi_cleanup(void);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_ktr.h b/sys/amd64/vmm/vmm_ktr.h
new file mode 100644
index 0000000..e691c61
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ktr.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_KTR_H_
+#define _VMM_KTR_H_
+
+#include <sys/ktr.h>
+#include <sys/pcpu.h>
+
+#define KTR_VMM KTR_GEN
+
+#define VMM_CTR0(vm, vcpuid, format) \
+CTR3(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu)
+
+#define VMM_CTR1(vm, vcpuid, format, p1) \
+CTR4(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1))
+
+#define VMM_CTR2(vm, vcpuid, format, p1, p2) \
+CTR5(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1), (p2))
+
+#define VMM_CTR3(vm, vcpuid, format, p1, p2, p3) \
+CTR6(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1), (p2), (p3))
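+
+/*
+ * Illustrative usage, assuming a vm/vcpuid pair is in scope:
+ *
+ *	VMM_CTR1(vm, vcpuid, "vm exit at rip 0x%lx", rip);
+ *
+ * This expands to a CTR4() record tagged with the vm name, the vcpuid
+ * and the current host cpu.
+ */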
+#endif
diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c
new file mode 100644
index 0000000..d024b71
--- /dev/null
+++ b/sys/amd64/vmm/vmm_lapic.c
@@ -0,0 +1,201 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <x86/specialreg.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_ipi.h"
+#include "vmm_lapic.h"
+#include "vlapic.h"
+
+int
+lapic_pending_intr(struct vm *vm, int cpu)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ return (vlapic_pending_intr(vlapic));
+}
+
+void
+lapic_intr_accepted(struct vm *vm, int cpu, int vector)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ vlapic_intr_accepted(vlapic, vector);
+}
+
+int
+lapic_set_intr(struct vm *vm, int cpu, int vector)
+{
+ struct vlapic *vlapic;
+
+ if (cpu < 0 || cpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (vector < 32 || vector > 255)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ vlapic_set_intr_ready(vlapic, vector);
+
+ vm_interrupt_hostcpu(vm, cpu);
+
+ return (0);
+}
+
+int
+lapic_timer_tick(struct vm *vm, int cpu)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ return (vlapic_timer_tick(vlapic));
+}
+
+static boolean_t
+x2apic_msr(u_int msr)
+{
+ if (msr >= 0x800 && msr <= 0xBFF)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static u_int
+x2apic_msr_to_regoff(u_int msr)
+{
+
+ return ((msr - 0x800) << 4);
+}
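+
+/*
+ * For example, the x2APIC TPR msr 0x808 maps to regoff
+ * (0x808 - 0x800) << 4 = 0x80, the offset of the TPR in the
+ * memory-mapped APIC page.
+ */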
+
+boolean_t
+lapic_msr(u_int msr)
+{
+
+ if (x2apic_msr(msr) || (msr == MSR_APICBASE))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+int
+lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval)
+{
+ int error;
+ u_int offset;
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (msr == MSR_APICBASE) {
+ *rval = vlapic_get_apicbase(vlapic);
+ error = 0;
+ } else {
+ offset = x2apic_msr_to_regoff(msr);
+ error = vlapic_op_mem_read(vlapic, offset, DWORD, rval);
+ }
+
+ return (error);
+}
+
+int
+lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val)
+{
+ int error;
+ u_int offset;
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (msr == MSR_APICBASE) {
+ vlapic_set_apicbase(vlapic, val);
+ error = 0;
+ } else {
+ offset = x2apic_msr_to_regoff(msr);
+ error = vlapic_op_mem_write(vlapic, offset, DWORD, val);
+ }
+
+ return (error);
+}
+
+int
+lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
+
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
+ */
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_write(vlapic, off, DWORD, wval);
+ return (error);
+}
+
+int
+lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
+
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
+ */
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_read(vlapic, off, DWORD, rval);
+ return (error);
+}
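+
+/*
+ * Worked example: a 4-byte guest access at gpa 0xFEE00080 yields
+ * off = 0x80 (the TPR) and passes the check above; a 2-byte access,
+ * or one at gpa 0xFEE00084, fails it and returns EINVAL.
+ */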
diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h
new file mode 100644
index 0000000..a79912e
--- /dev/null
+++ b/sys/amd64/vmm/vmm_lapic.h
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_LAPIC_H_
+#define _VMM_LAPIC_H_
+
+struct vm;
+
+boolean_t lapic_msr(u_int num);
+int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval);
+int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval);
+
+int lapic_mmio_read(void *vm, int cpu, uint64_t gpa,
+ uint64_t *rval, int size, void *arg);
+int lapic_mmio_write(void *vm, int cpu, uint64_t gpa,
+ uint64_t wval, int size, void *arg);
+
+int lapic_timer_tick(struct vm *vm, int cpu);
+
+/*
+ * Returns a vector between 32 and 255 if an interrupt is pending in the
+ * IRR that can be delivered based on the current state of ISR and TPR.
+ *
+ * Note that the vector does not automatically transition to the ISR as a
+ * result of calling this function.
+ *
+ * Returns -1 if there is no eligible vector that can be delivered to the
+ * guest at this time.
+ */
+int lapic_pending_intr(struct vm *vm, int cpu);
+
+/*
+ * Transition 'vector' from IRR to ISR. This function is called with the
+ * vector returned by 'lapic_pending_intr()' when the guest is able to
+ * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that
+ * block interrupt delivery).
+ */
+void lapic_intr_accepted(struct vm *vm, int cpu, int vector);
+
+/*
+ * Signals to the LAPIC that an interrupt at 'vector' needs to be generated
+ * for 'cpu'; the pending vector is recorded in the IRR.
+ */
+int lapic_set_intr(struct vm *vm, int cpu, int vector);
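+
+/*
+ * Taken together, the expected interrupt lifecycle is (sketch):
+ *
+ *	lapic_set_intr(vm, cpu, vec);		vec recorded in the IRR
+ *	vec = lapic_pending_intr(vm, cpu);	highest deliverable vector
+ *	(inject 'vec' into the guest once RFLAGS.IF permits)
+ *	lapic_intr_accepted(vm, cpu, vec);	vec moves from IRR to ISR
+ */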
+
+#endif
diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c
new file mode 100644
index 0000000..04f99b1
--- /dev/null
+++ b/sys/amd64/vmm/vmm_mem.c
@@ -0,0 +1,135 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/linker.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+
+#include <machine/md_var.h>
+#include <machine/metadata.h>
+#include <machine/pc/bios.h>
+#include <machine/vmparam.h>
+#include <machine/pmap.h>
+
+#include "vmm_util.h"
+#include "vmm_mem.h"
+
+SYSCTL_DECL(_hw_vmm);
+
+static u_long pages_allocated;
+SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD,
+ &pages_allocated, 0, "4KB pages allocated");
+
+static void
+update_pages_allocated(int howmany)
+{
+ pages_allocated += howmany; /* XXX locking? */
+}
+
+int
+vmm_mem_init(void)
+{
+
+ return (0);
+}
+
+vm_paddr_t
+vmm_mem_alloc(size_t size)
+{
+ int flags;
+ vm_page_t m;
+ vm_paddr_t pa;
+
+ if (size != PAGE_SIZE)
+ panic("vmm_mem_alloc: invalid allocation size %lu", size);
+
+ flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
+ VM_ALLOC_ZERO;
+
+ while (1) {
+ /*
+ * XXX need policy to determine when to back off the allocation
+ */
+ m = vm_page_alloc(NULL, 0, flags);
+ if (m == NULL)
+ VM_WAIT;
+ else
+ break;
+ }
+
+ pa = VM_PAGE_TO_PHYS(m);
+
+ if ((m->flags & PG_ZERO) == 0)
+ pagezero((void *)PHYS_TO_DMAP(pa));
+ m->valid = VM_PAGE_BITS_ALL;
+
+ update_pages_allocated(1);
+
+ return (pa);
+}
+
+void
+vmm_mem_free(vm_paddr_t base, size_t length)
+{
+ vm_page_t m;
+
+ if (base & PAGE_MASK) {
+ panic("vmm_mem_free: base 0x%0lx must be aligned on a "
+ "0x%0x boundary\n", base, PAGE_SIZE);
+ }
+
+ if (length != PAGE_SIZE)
+ panic("vmm_mem_free: invalid length %lu", length);
+
+ m = PHYS_TO_VM_PAGE(base);
+ m->wire_count--;
+ vm_page_free(m);
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+
+ update_pages_allocated(-1);
+}
+
+vm_paddr_t
+vmm_mem_maxaddr(void)
+{
+
+ return (ptoa(Maxmem));
+}
diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h
new file mode 100644
index 0000000..7d45c74
--- /dev/null
+++ b/sys/amd64/vmm/vmm_mem.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_MEM_H_
+#define _VMM_MEM_H_
+
+int vmm_mem_init(void);
+vm_paddr_t vmm_mem_alloc(size_t size);
+void vmm_mem_free(vm_paddr_t start, size_t size);
+vm_paddr_t vmm_mem_maxaddr(void);
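+
+/*
+ * Illustrative pairing of the interfaces above: allocations are single
+ * wired, zeroed 4KB pages and must be freed with the same size, e.g.
+ *
+ *	vm_paddr_t pa = vmm_mem_alloc(PAGE_SIZE);
+ *	...
+ *	vmm_mem_free(pa, PAGE_SIZE);
+ */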
+
+#endif
diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c
new file mode 100644
index 0000000..d97c819
--- /dev/null
+++ b/sys/amd64/vmm/vmm_msr.c
@@ -0,0 +1,254 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <machine/specialreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_lapic.h"
+#include "vmm_msr.h"
+
+#define VMM_MSR_F_EMULATE 0x01
+#define VMM_MSR_F_READONLY 0x02
+#define VMM_MSR_F_INVALID 0x04 /* guest_msr_valid() can override this */
+
+struct vmm_msr {
+ int num;
+ int flags;
+ uint64_t hostval;
+};
+
+static struct vmm_msr vmm_msr[] = {
+ { MSR_LSTAR, 0 },
+ { MSR_CSTAR, 0 },
+ { MSR_STAR, 0 },
+ { MSR_SF_MASK, 0 },
+ { MSR_PAT, VMM_MSR_F_EMULATE | VMM_MSR_F_INVALID },
+ { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE },
+ { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
+};
+
+#define vmm_msr_num (sizeof(vmm_msr) / sizeof(vmm_msr[0]))
+CTASSERT(VMM_MSR_NUM >= vmm_msr_num);
+
+#define readonly_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0)
+
+#define emulated_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0)
+
+#define invalid_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_INVALID) != 0)
+
+void
+vmm_msr_init(void)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ /*
+ * XXX this assumes that the value of the host msr does not
+ * change after we have cached it.
+ */
+ vmm_msr[i].hostval = rdmsr(vmm_msr[i].num);
+ }
+}
+
+void
+guest_msrs_init(struct vm *vm, int cpu)
+{
+ int i;
+ uint64_t *guest_msrs;
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ switch (vmm_msr[i].num) {
+ case MSR_LSTAR:
+ case MSR_CSTAR:
+ case MSR_STAR:
+ case MSR_SF_MASK:
+ case MSR_BIOS_SIGN:
+ case MSR_MCG_CAP:
+ guest_msrs[i] = 0;
+ break;
+ case MSR_PAT:
+ guest_msrs[i] = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+ break;
+ default:
+ panic("guest_msrs_init: missing initialization for msr "
+ "0x%0x", vmm_msr[i].num);
+ }
+ }
+}
+
+static int
+msr_num_to_idx(u_int num)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++)
+ if (vmm_msr[i].num == num)
+ return (i);
+
+ return (-1);
+}
+
+int
+emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val)
+{
+ int idx;
+ uint64_t *guest_msrs;
+
+ if (lapic_msr(num))
+ return (lapic_wrmsr(vm, cpu, num, val));
+
+ idx = msr_num_to_idx(num);
+ if (idx < 0 || invalid_msr(idx))
+ return (EINVAL);
+
+ if (!readonly_msr(idx)) {
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ /* Stash the value */
+ guest_msrs[idx] = val;
+
+ /* Update processor state for non-emulated MSRs */
+ if (!emulated_msr(idx))
+ wrmsr(vmm_msr[idx].num, val);
+ }
+
+ return (0);
+}
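+
+/*
+ * For example, a guest wrmsr to MSR_LSTAR is stashed in guest_msrs[]
+ * and also written to the hardware msr (the host value is put back by
+ * restore_host_msrs()), while a wrmsr to the emulated MSR_BIOS_SIGN
+ * only updates the cached value and never touches the hardware
+ * register.
+ */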
+
+int
+emulate_rdmsr(struct vm *vm, int cpu, u_int num)
+{
+ int error, idx;
+ uint32_t eax, edx;
+ uint64_t result, *guest_msrs;
+
+ if (lapic_msr(num)) {
+ error = lapic_rdmsr(vm, cpu, num, &result);
+ goto done;
+ }
+
+ idx = msr_num_to_idx(num);
+ if (idx < 0 || invalid_msr(idx)) {
+ error = EINVAL;
+ goto done;
+ }
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+ result = guest_msrs[idx];
+
+ /*
+	 * If this is not an emulated MSR, make sure that the processor
+	 * state matches our cached state.
+ */
+ if (!emulated_msr(idx) && (rdmsr(num) != result)) {
+ panic("emulate_rdmsr: msr 0x%0x has inconsistent cached "
+ "(0x%016lx) and actual (0x%016lx) values", num,
+ result, rdmsr(num));
+ }
+
+ error = 0;
+
+done:
+ if (error == 0) {
+ eax = result;
+ edx = result >> 32;
+ error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax);
+ if (error)
+ panic("vm_set_register(rax) error %d", error);
+ error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx);
+ if (error)
+ panic("vm_set_register(rdx) error %d", error);
+ }
+ return (error);
+}
+
+void
+restore_guest_msrs(struct vm *vm, int cpu)
+{
+ int i;
+ uint64_t *guest_msrs;
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ else
+ wrmsr(vmm_msr[i].num, guest_msrs[i]);
+ }
+}
+
+void
+restore_host_msrs(struct vm *vm, int cpu)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ else
+ wrmsr(vmm_msr[i].num, vmm_msr[i].hostval);
+ }
+}
+
+/*
+ * Must be called by the CPU-specific code before any guests are
+ * created
+ */
+void
+guest_msr_valid(int msr)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (vmm_msr[i].num == msr && invalid_msr(i)) {
+ vmm_msr[i].flags &= ~VMM_MSR_F_INVALID;
+ }
+ }
+}
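+
+/*
+ * For instance, CPU-specific code that knows how to context-switch the
+ * PAT msr would call guest_msr_valid(MSR_PAT) from its init routine,
+ * clearing VMM_MSR_F_INVALID so that guest accesses to it succeed.
+ */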
diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h
new file mode 100644
index 0000000..8a1fda3
--- /dev/null
+++ b/sys/amd64/vmm/vmm_msr.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_MSR_H_
+#define _VMM_MSR_H_
+
+#define VMM_MSR_NUM 16
+struct vm;
+
+void vmm_msr_init(void);
+int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val);
+int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr);
+void guest_msrs_init(struct vm *vm, int cpu);
+void guest_msr_valid(int msr);
+void restore_host_msrs(struct vm *vm, int cpu);
+void restore_guest_msrs(struct vm *vm, int cpu);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c
new file mode 100644
index 0000000..ae60979
--- /dev/null
+++ b/sys/amd64/vmm/vmm_stat.c
@@ -0,0 +1,104 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/smp.h>
+
+#include <machine/vmm.h>
+#include "vmm_stat.h"
+
+static int vstnum;
+static struct vmm_stat_type *vsttab[MAX_VMM_STAT_TYPES];
+
+static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat");
+
+void
+vmm_stat_init(void *arg)
+{
+ struct vmm_stat_type *vst = arg;
+
+ /* We require all stats to identify themselves with a description */
+ if (vst->desc == NULL)
+ return;
+
+ if (vstnum >= MAX_VMM_STAT_TYPES) {
+		printf("Cannot accommodate vmm stat type \"%s\"!\n", vst->desc);
+ return;
+ }
+
+ vst->index = vstnum;
+ vsttab[vstnum++] = vst;
+}
+
+int
+vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf)
+{
+ int i;
+ uint64_t *stats;
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ stats = vcpu_stats(vm, vcpu);
+ for (i = 0; i < vstnum; i++)
+ buf[i] = stats[i];
+ *num_stats = vstnum;
+ return (0);
+}
+
+void *
+vmm_stat_alloc(void)
+{
+ u_long size;
+
+ size = vstnum * sizeof(uint64_t);
+
+ return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK));
+}
+
+void
+vmm_stat_free(void *vp)
+{
+ free(vp, M_VMM_STAT);
+}
+
+const char *
+vmm_stat_desc(int index)
+{
+
+ if (index >= 0 && index < vstnum)
+ return (vsttab[index]->desc);
+ else
+ return (NULL);
+}
diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h
new file mode 100644
index 0000000..7c075a6
--- /dev/null
+++ b/sys/amd64/vmm/vmm_stat.h
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_STAT_H_
+#define _VMM_STAT_H_
+
+struct vm;
+
+#define MAX_VMM_STAT_TYPES 64 /* arbitrary */
+
+struct vmm_stat_type {
+ const char *desc; /* description of statistic */
+ int index; /* position in the stats buffer */
+};
+
+void vmm_stat_init(void *arg);
+
+#define VMM_STAT_DEFINE(type, desc) \
+ struct vmm_stat_type type[1] = { \
+ { desc, -1 } \
+ }; \
+ SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type)
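+
+/*
+ * Illustrative definition and use (the stat name is hypothetical):
+ *
+ *	VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus");
+ *	...
+ *	vmm_stat_incr(vm, vcpuid, VCPU_MIGRATIONS, 1);
+ */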
+
+void *vmm_stat_alloc(void);
+void vmm_stat_free(void *vp);
+
+/*
+ * 'buf' should be large enough to hold at least 'MAX_VMM_STAT_TYPES' entries
+ */
+int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf);
+const char *vmm_stat_desc(int index);
+
+static void __inline
+vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x)
+{
+#ifdef VMM_KEEP_STATS
+ uint64_t *stats = vcpu_stats(vm, vcpu);
+ if (vst->index >= 0)
+ stats[vst->index] += x;
+#endif
+}
+
+#endif
diff --git a/sys/amd64/vmm/vmm_support.S b/sys/amd64/vmm/vmm_support.S
new file mode 100644
index 0000000..2afc608
--- /dev/null
+++ b/sys/amd64/vmm/vmm_support.S
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#define LOCORE
+
+#include <machine/asmacros.h>
+
+#define LA_EOI 0xB0
+
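+/*
+ * Minimal interrupt handler installed at the vmm IPI vector: it acks
+ * the local APIC by storing 0 to its EOI register (offset LA_EOI) and
+ * returns, preserving all registers. 'lapic' is the kernel's mapping
+ * of the local APIC page.
+ */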
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(justreturn)
+ pushq %rax
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax)
+ popq %rax
+ iretq
diff --git a/sys/amd64/vmm/vmm_util.c b/sys/amd64/vmm/vmm_util.c
new file mode 100644
index 0000000..f245f92
--- /dev/null
+++ b/sys/amd64/vmm/vmm_util.c
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/libkern.h>
+
+#include <machine/md_var.h>
+
+#include "vmm_util.h"
+
+boolean_t
+vmm_is_intel(void)
+{
+
+ if (strcmp(cpu_vendor, "GenuineIntel") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_is_amd(void)
+{
+ if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_supports_1G_pages(void)
+{
+ unsigned int regs[4];
+
+ /*
+ * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages
+ *
+ * Both Intel and AMD support this bit.
+ */
+ if (cpu_exthigh >= 0x80000001) {
+ do_cpuid(0x80000001, regs);
+ if (regs[3] & (1 << 26))
+ return (TRUE);
+ }
+ return (FALSE);
+}
+
+#include <sys/proc.h>
+#include <machine/frame.h>
+#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x))
+#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x))
+void
+dump_trapframe(struct trapframe *tf)
+{
+ DUMP_REG(rdi);
+ DUMP_REG(rsi);
+ DUMP_REG(rdx);
+ DUMP_REG(rcx);
+ DUMP_REG(r8);
+ DUMP_REG(r9);
+ DUMP_REG(rax);
+ DUMP_REG(rbx);
+ DUMP_REG(rbp);
+ DUMP_REG(r10);
+ DUMP_REG(r11);
+ DUMP_REG(r12);
+ DUMP_REG(r13);
+ DUMP_REG(r14);
+ DUMP_REG(r15);
+ DUMP_REG(trapno);
+ DUMP_REG(addr);
+ DUMP_REG(flags);
+ DUMP_REG(err);
+ DUMP_REG(rip);
+ DUMP_REG(rflags);
+ DUMP_REG(rsp);
+ DUMP_SEG(cs);
+ DUMP_SEG(ss);
+ DUMP_SEG(fs);
+ DUMP_SEG(gs);
+ DUMP_SEG(es);
+ DUMP_SEG(ds);
+}
diff --git a/sys/amd64/vmm/vmm_util.h b/sys/amd64/vmm/vmm_util.h
new file mode 100644
index 0000000..7f82332
--- /dev/null
+++ b/sys/amd64/vmm/vmm_util.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_UTIL_H_
+#define _VMM_UTIL_H_
+
+struct trapframe;
+
+boolean_t vmm_is_intel(void);
+boolean_t vmm_is_amd(void);
+boolean_t vmm_supports_1G_pages(void);
+
+void dump_trapframe(struct trapframe *tf);
+
+#endif
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
new file mode 100644
index 0000000..94abe09
--- /dev/null
+++ b/sys/amd64/vmm/x86.c
@@ -0,0 +1,202 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+#include <machine/vmm.h>
+
+#include "x86.h"
+
+#define CPUID_VM_HIGH 0x40000000
+
+static const char bhyve_id[12] = "BHyVE BHyVE ";
+
+int
+x86_emulate_cpuid(struct vm *vm, int vcpu_id,
+ uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+ int error;
+ unsigned int func, regs[4];
+ enum x2apic_state x2apic_state;
+
+ func = *eax;
+
+ /*
+ * Requests for invalid CPUID levels should map to the highest
+ * available level instead.
+ */
+ if (cpu_exthigh != 0 && *eax >= 0x80000000) {
+ if (*eax > cpu_exthigh)
+ *eax = cpu_exthigh;
+ } else if (*eax >= 0x40000000) {
+ if (*eax > CPUID_VM_HIGH)
+ *eax = CPUID_VM_HIGH;
+ } else if (*eax > cpu_high) {
+ *eax = cpu_high;
+ }
+
+ /*
+	 * In general the approach used for CPU topology is to advertise
+	 * a flat topology in which each vCPU is its own package, with no
+	 * multi-core or SMT.
+ */
+ switch (func) {
+ case CPUID_0000_0000:
+ case CPUID_0000_0002:
+ case CPUID_0000_0003:
+ case CPUID_0000_000A:
+ cpuid_count(*eax, *ecx, regs);
+ break;
+
+ case CPUID_8000_0000:
+ case CPUID_8000_0001:
+ case CPUID_8000_0002:
+ case CPUID_8000_0003:
+ case CPUID_8000_0004:
+ case CPUID_8000_0006:
+ case CPUID_8000_0007:
+ case CPUID_8000_0008:
+ cpuid_count(*eax, *ecx, regs);
+ break;
+
+ case CPUID_0000_0001:
+ do_cpuid(1, regs);
+
+ error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
+ if (error) {
+ panic("x86_emulate_cpuid: error %d "
+ "fetching x2apic state", error);
+ }
+
+ /*
+ * Override the APIC ID only in ebx
+ */
+ regs[1] &= ~(CPUID_LOCAL_APIC_ID);
+ regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
+
+ /*
+		 * Don't expose VMX, SpeedStep (EST) or Thermal Monitor 2
+		 * (TM2) capability.
+ * Advertise x2APIC capability and Hypervisor guest.
+ */
+ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
+
+ regs[2] |= CPUID2_HV;
+
+ if (x2apic_state != X2APIC_DISABLED)
+ regs[2] |= CPUID2_X2APIC;
+
+ /*
+ * Hide xsave/osxsave/avx until the FPU save/restore
+ * issues are resolved
+ */
+ regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE |
+ CPUID2_AVX);
+
+ /*
+ * Hide monitor/mwait until we know how to deal with
+ * these instructions.
+ */
+ regs[2] &= ~CPUID2_MON;
+
+ /*
+ * Hide thermal monitoring
+ */
+ regs[3] &= ~(CPUID_ACPI | CPUID_TM);
+
+ /*
+ * Machine check handling is done in the host.
+ * Hide MTRR capability.
+ */
+ regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
+
+ /*
+ * Disable multi-core.
+ */
+ regs[1] &= ~CPUID_HTT_CORES;
+ regs[3] &= ~CPUID_HTT;
+ break;
+
+ case CPUID_0000_0004:
+ do_cpuid(4, regs);
+
+ /*
+ * Do not expose topology.
+ */
+ regs[0] &= 0xffff8000;
+ regs[0] |= 0x04008000;
+ break;
+
+ case CPUID_0000_0006:
+ case CPUID_0000_0007:
+ /*
+ * Handle the access, but report 0 for
+ * all options
+ */
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = 0;
+ regs[3] = 0;
+ break;
+
+ case CPUID_0000_000B:
+ /*
+ * Processor topology enumeration
+ */
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = *ecx & 0xff;
+ regs[3] = vcpu_id;
+ break;
+
+ case 0x40000000:
+ regs[0] = CPUID_VM_HIGH;
+ bcopy(bhyve_id, &regs[1], 4);
+		bcopy(bhyve_id + 4, &regs[2], 4);
+		bcopy(bhyve_id + 8, &regs[3], 4);
+ break;
+ default:
+ /* XXX: Leaf 5? */
+ return (0);
+ }
+
+ *eax = regs[0];
+ *ebx = regs[1];
+ *ecx = regs[2];
+ *edx = regs[3];
+ return (1);
+}
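+
+/*
+ * Example: a guest cpuid with %eax = 0x40000000 returns CPUID_VM_HIGH
+ * in %eax and the 12-byte signature "BHyVE BHyVE " split across
+ * %ebx/%ecx/%edx ("BHyV", "E BH", "yVE "), the conventional way for a
+ * hypervisor to identify itself.
+ */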
diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h
new file mode 100644
index 0000000..368e967
--- /dev/null
+++ b/sys/amd64/vmm/x86.h
@@ -0,0 +1,64 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _X86_H_
+#define _X86_H_
+
+#define CPUID_0000_0000 (0x0)
+#define CPUID_0000_0001 (0x1)
+#define CPUID_0000_0002 (0x2)
+#define CPUID_0000_0003 (0x3)
+#define CPUID_0000_0004 (0x4)
+#define CPUID_0000_0006 (0x6)
+#define CPUID_0000_0007 (0x7)
+#define CPUID_0000_000A (0xA)
+#define CPUID_0000_000B (0xB)
+#define CPUID_8000_0000 (0x80000000)
+#define CPUID_8000_0001 (0x80000001)
+#define CPUID_8000_0002 (0x80000002)
+#define CPUID_8000_0003 (0x80000003)
+#define CPUID_8000_0004 (0x80000004)
+#define CPUID_8000_0006 (0x80000006)
+#define CPUID_8000_0007 (0x80000007)
+#define CPUID_8000_0008 (0x80000008)
+
+/*
+ * CPUID instruction Fn0000_0001:
+ */
+#define CPUID_0000_0001_APICID_MASK (0xff<<24)
+#define CPUID_0000_0001_APICID_SHIFT 24
+
+/*
+ * CPUID instruction Fn0000_0001 ECX
+ */
+#define CPUID_0000_0001_FEAT0_VMX (1<<5)
+
+int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx,
+ uint32_t *ecx, uint32_t *edx);
+
+#endif