author      neel <neel@FreeBSD.org>    2013-01-19 04:18:52 +0000
committer   neel <neel@FreeBSD.org>    2013-01-19 04:18:52 +0000
commit      363335d53e3c955602378aa434d0054c48a6e0d6 (patch)
tree        5af6fe77acd4da3002c907484fd64133deb95c8d
parent      3600c83b820e00959d61600e67e9dcb32ef6b518 (diff)
parent      dde8bf641fc7c8e9541167cd7c01523973d0b569 (diff)
download    FreeBSD-src-363335d53e3c955602378aa434d0054c48a6e0d6.zip
            FreeBSD-src-363335d53e3c955602378aa434d0054c48a6e0d6.tar.gz
Merge projects/bhyve to head.
'bhyve' was developed by grehan@ and myself at NetApp (thanks!).

Special thanks to Peter Snyder, Joe Caradonna and Michael Dexter for their
support and encouragement.

Obtained from:  NetApp
-rw-r--r--  lib/Makefile | 5
-rw-r--r--  lib/libvmmapi/Makefile | 11
-rw-r--r--  lib/libvmmapi/vmmapi.c | 723
-rw-r--r--  lib/libvmmapi/vmmapi.h | 105
-rw-r--r--  lib/libvmmapi/vmmapi_freebsd.c | 183
-rw-r--r--  share/man/man4/bhyve.4 | 68
-rw-r--r--  share/mk/bsd.libnames.mk | 1
-rw-r--r--  sys/amd64/include/vmm.h | 293
-rw-r--r--  sys/amd64/include/vmm_dev.h | 215
-rw-r--r--  sys/amd64/include/vmm_instruction_emul.h | 113
-rw-r--r--  sys/amd64/vmm/amd/amdv.c | 265
-rw-r--r--  sys/amd64/vmm/intel/ept.c | 392
-rw-r--r--  sys/amd64/vmm/intel/ept.h | 43
-rw-r--r--  sys/amd64/vmm/intel/vmcs.c | 551
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h | 338
-rw-r--r--  sys/amd64/vmm/intel/vmx.c | 1845
-rw-r--r--  sys/amd64/vmm/intel/vmx.h | 120
-rw-r--r--  sys/amd64/vmm/intel/vmx_controls.h | 92
-rw-r--r--  sys/amd64/vmm/intel/vmx_cpufunc.h | 218
-rw-r--r--  sys/amd64/vmm/intel/vmx_genassym.c | 89
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.c | 172
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.h | 78
-rw-r--r--  sys/amd64/vmm/intel/vmx_support.S | 246
-rw-r--r--  sys/amd64/vmm/intel/vtd.c | 677
-rw-r--r--  sys/amd64/vmm/io/iommu.c | 277
-rw-r--r--  sys/amd64/vmm/io/iommu.h | 75
-rw-r--r--  sys/amd64/vmm/io/ppt.c | 610
-rw-r--r--  sys/amd64/vmm/io/ppt.h | 41
-rw-r--r--  sys/amd64/vmm/io/vdev.c | 270
-rw-r--r--  sys/amd64/vmm/io/vdev.h | 84
-rw-r--r--  sys/amd64/vmm/io/vlapic.c | 901
-rw-r--r--  sys/amd64/vmm/io/vlapic.h | 111
-rw-r--r--  sys/amd64/vmm/vmm.c | 1022
-rw-r--r--  sys/amd64/vmm/vmm_dev.c | 538
-rw-r--r--  sys/amd64/vmm/vmm_host.c | 124
-rw-r--r--  sys/amd64/vmm/vmm_host.h | 75
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c | 810
-rw-r--r--  sys/amd64/vmm/vmm_ipi.c | 93
-rw-r--r--  sys/amd64/vmm/vmm_ipi.h | 39
-rw-r--r--  sys/amd64/vmm/vmm_ktr.h | 51
-rw-r--r--  sys/amd64/vmm/vmm_lapic.c | 201
-rw-r--r--  sys/amd64/vmm/vmm_lapic.h | 71
-rw-r--r--  sys/amd64/vmm/vmm_mem.c | 135
-rw-r--r--  sys/amd64/vmm/vmm_mem.h | 37
-rw-r--r--  sys/amd64/vmm/vmm_msr.c | 254
-rw-r--r--  sys/amd64/vmm/vmm_msr.h | 43
-rw-r--r--  sys/amd64/vmm/vmm_stat.c | 104
-rw-r--r--  sys/amd64/vmm/vmm_stat.h | 71
-rw-r--r--  sys/amd64/vmm/vmm_support.S | 42
-rw-r--r--  sys/amd64/vmm/vmm_util.c | 111
-rw-r--r--  sys/amd64/vmm/vmm_util.h | 40
-rw-r--r--  sys/amd64/vmm/x86.c | 202
-rw-r--r--  sys/amd64/vmm/x86.h | 64
-rw-r--r--  sys/conf/files.amd64 | 5
-rw-r--r--  sys/dev/blackhole/blackhole.c | 129
-rw-r--r--  sys/dev/bvm/bvm_console.c | 240
-rw-r--r--  sys/dev/bvm/bvm_dbg.c | 100
-rw-r--r--  sys/modules/Makefile | 4
-rw-r--r--  sys/modules/blackhole/Makefile | 9
-rw-r--r--  sys/modules/vmm/Makefile | 62
-rw-r--r--  usr.sbin/Makefile.amd64 | 3
-rw-r--r--  usr.sbin/bhyve/Makefile | 27
-rw-r--r--  usr.sbin/bhyve/acpi.c | 844
-rw-r--r--  usr.sbin/bhyve/acpi.h | 34
-rw-r--r--  usr.sbin/bhyve/atpic.c | 68
-rw-r--r--  usr.sbin/bhyve/bhyverun.c | 788
-rw-r--r--  usr.sbin/bhyve/bhyverun.h | 53
-rw-r--r--  usr.sbin/bhyve/consport.c | 140
-rw-r--r--  usr.sbin/bhyve/dbgport.c | 138
-rw-r--r--  usr.sbin/bhyve/dbgport.h | 36
-rw-r--r--  usr.sbin/bhyve/elcr.c | 65
-rw-r--r--  usr.sbin/bhyve/inout.c | 151
-rw-r--r--  usr.sbin/bhyve/inout.h | 67
-rw-r--r--  usr.sbin/bhyve/ioapic.c | 324
-rw-r--r--  usr.sbin/bhyve/ioapic.h | 38
-rw-r--r--  usr.sbin/bhyve/mem.c | 218
-rw-r--r--  usr.sbin/bhyve/mem.h | 57
-rw-r--r--  usr.sbin/bhyve/mevent.c | 432
-rw-r--r--  usr.sbin/bhyve/mevent.h | 49
-rw-r--r--  usr.sbin/bhyve/mevent_test.c | 180
-rw-r--r--  usr.sbin/bhyve/mptbl.c | 398
-rw-r--r--  usr.sbin/bhyve/mptbl.h | 35
-rw-r--r--  usr.sbin/bhyve/pci_emul.c | 1117
-rw-r--r--  usr.sbin/bhyve/pci_emul.h | 216
-rw-r--r--  usr.sbin/bhyve/pci_hostbridge.c | 52
-rw-r--r--  usr.sbin/bhyve/pci_passthru.c | 724
-rw-r--r--  usr.sbin/bhyve/pci_uart.c | 626
-rw-r--r--  usr.sbin/bhyve/pci_virtio_block.c | 534
-rw-r--r--  usr.sbin/bhyve/pci_virtio_net.c | 781
-rw-r--r--  usr.sbin/bhyve/pit_8254.c | 198
-rw-r--r--  usr.sbin/bhyve/pit_8254.h | 45
-rw-r--r--  usr.sbin/bhyve/pmtmr.c | 108
-rw-r--r--  usr.sbin/bhyve/post.c | 51
-rw-r--r--  usr.sbin/bhyve/rtc.c | 274
-rw-r--r--  usr.sbin/bhyve/spinup_ap.c | 119
-rw-r--r--  usr.sbin/bhyve/spinup_ap.h | 34
-rw-r--r--  usr.sbin/bhyve/uart.c | 60
-rw-r--r--  usr.sbin/bhyve/virtio.h | 85
-rw-r--r--  usr.sbin/bhyve/xmsr.c | 48
-rw-r--r--  usr.sbin/bhyve/xmsr.h | 34
-rw-r--r--  usr.sbin/bhyvectl/Makefile | 17
-rw-r--r--  usr.sbin/bhyvectl/bhyvectl.c | 1524
-rw-r--r--  usr.sbin/bhyveload/Makefile | 14
-rw-r--r--  usr.sbin/bhyveload/bhyveload.8 | 130
-rw-r--r--  usr.sbin/bhyveload/bhyveload.c | 652
105 files changed, 25476 insertions, 0 deletions
diff --git a/lib/Makefile b/lib/Makefile
index 3dd274e..132302e 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -115,6 +115,7 @@ SUBDIR= ${SUBDIR_ORDERED} \
${_libusbhid} \
${_libusb} \
${_libvgl} \
+ ${_libvmmapi} \
libwrap \
liby \
libz \
@@ -198,6 +199,10 @@ _libproc= libproc
_librtld_db= librtld_db
.endif
+.if ${MACHINE_CPUARCH} == "amd64"
+_libvmmapi= libvmmapi
+.endif
+
.if ${MACHINE_CPUARCH} == "ia64"
_libefi= libefi
.endif
diff --git a/lib/libvmmapi/Makefile b/lib/libvmmapi/Makefile
new file mode 100644
index 0000000..93d3c85
--- /dev/null
+++ b/lib/libvmmapi/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+LIB= vmmapi
+SRCS= vmmapi.c vmmapi_freebsd.c
+INCS= vmmapi.h
+
+WARNS?= 2
+
+CFLAGS+= -I${.CURDIR}
+
+.include <bsd.lib.mk>
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
new file mode 100644
index 0000000..cfb42d0
--- /dev/null
+++ b/lib/libvmmapi/vmmapi.c
@@ -0,0 +1,723 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <machine/specialreg.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+
+#include "vmmapi.h"
+
+struct vmctx {
+ int fd;
+ char *name;
+};
+
+#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
+#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))
+
+static int
+vm_device_open(const char *name)
+{
+ int fd, len;
+ char *vmfile;
+
+ len = strlen("/dev/vmm/") + strlen(name) + 1;
+ vmfile = malloc(len);
+ assert(vmfile != NULL);
+ snprintf(vmfile, len, "/dev/vmm/%s", name);
+
+ /* Open the device file */
+ fd = open(vmfile, O_RDWR, 0);
+
+ free(vmfile);
+ return (fd);
+}
+
+int
+vm_create(const char *name)
+{
+
+ return (CREATE((char *)name));
+}
+
+struct vmctx *
+vm_open(const char *name)
+{
+ struct vmctx *vm;
+
+ vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
+ assert(vm != NULL);
+
+ vm->fd = -1;
+ vm->name = (char *)(vm + 1);
+ strcpy(vm->name, name);
+
+ if ((vm->fd = vm_device_open(vm->name)) < 0)
+ goto err;
+
+ return (vm);
+err:
+ vm_destroy(vm);
+ return (NULL);
+}
+
+void
+vm_destroy(struct vmctx *vm)
+{
+ assert(vm != NULL);
+
+ if (vm->fd >= 0)
+ close(vm->fd);
+ DESTROY(vm->name);
+
+ free(vm);
+}
+
+size_t
+vmm_get_mem_total(void)
+{
+ size_t mem_total = 0;
+ size_t oldlen = sizeof(mem_total);
+ int error;
+ error = sysctlbyname("hw.vmm.mem_total", &mem_total, &oldlen, NULL, 0);
+ if (error)
+ return -1;
+ return mem_total;
+}
+
+size_t
+vmm_get_mem_free(void)
+{
+ size_t mem_free = 0;
+ size_t oldlen = sizeof(mem_free);
+ int error;
+ error = sysctlbyname("hw.vmm.mem_free", &mem_free, &oldlen, NULL, 0);
+ if (error)
+ return -1;
+ return mem_free;
+}
+
+int
+vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
+{
+ int error;
+ struct vm_memory_segment seg;
+
+ bzero(&seg, sizeof(seg));
+ seg.gpa = gpa;
+ error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
+ *ret_len = seg.len;
+ return (error);
+}
+
+int
+vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **mapaddr)
+{
+ int error;
+ struct vm_memory_segment seg;
+
+ /*
+ * Create and optionally map 'len' bytes of memory at guest
+ * physical address 'gpa'
+ */
+ bzero(&seg, sizeof(seg));
+ seg.gpa = gpa;
+ seg.len = len;
+ error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg);
+ if (error == 0 && mapaddr != NULL) {
+ *mapaddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
+ ctx->fd, gpa);
+ }
+ return (error);
+}
+
+char *
+vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
+{
+
+ /* Map 'len' bytes of memory at guest physical address 'gpa' */
+ return ((char *)mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
+ ctx->fd, gpa));
+}
+
+int
+vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t base, uint32_t limit, uint32_t access)
+{
+ int error;
+ struct vm_seg_desc vmsegdesc;
+
+ bzero(&vmsegdesc, sizeof(vmsegdesc));
+ vmsegdesc.cpuid = vcpu;
+ vmsegdesc.regnum = reg;
+ vmsegdesc.desc.base = base;
+ vmsegdesc.desc.limit = limit;
+ vmsegdesc.desc.access = access;
+
+ error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc);
+ return (error);
+}
+
+int
+vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t *base, uint32_t *limit, uint32_t *access)
+{
+ int error;
+ struct vm_seg_desc vmsegdesc;
+
+ bzero(&vmsegdesc, sizeof(vmsegdesc));
+ vmsegdesc.cpuid = vcpu;
+ vmsegdesc.regnum = reg;
+
+ error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc);
+ if (error == 0) {
+ *base = vmsegdesc.desc.base;
+ *limit = vmsegdesc.desc.limit;
+ *access = vmsegdesc.desc.access;
+ }
+ return (error);
+}
+
+int
+vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
+{
+ int error;
+ struct vm_register vmreg;
+
+ bzero(&vmreg, sizeof(vmreg));
+ vmreg.cpuid = vcpu;
+ vmreg.regnum = reg;
+ vmreg.regval = val;
+
+ error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg);
+ return (error);
+}
+
+int
+vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val)
+{
+ int error;
+ struct vm_register vmreg;
+
+ bzero(&vmreg, sizeof(vmreg));
+ vmreg.cpuid = vcpu;
+ vmreg.regnum = reg;
+
+ error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg);
+ *ret_val = vmreg.regval;
+ return (error);
+}
+
+int
+vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid)
+{
+ int error;
+ struct vm_pin vmpin;
+
+ bzero(&vmpin, sizeof(vmpin));
+ vmpin.vm_cpuid = vcpu;
+
+ error = ioctl(ctx->fd, VM_GET_PINNING, &vmpin);
+ *host_cpuid = vmpin.host_cpuid;
+ return (error);
+}
+
+int
+vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid)
+{
+ int error;
+ struct vm_pin vmpin;
+
+ bzero(&vmpin, sizeof(vmpin));
+ vmpin.vm_cpuid = vcpu;
+ vmpin.host_cpuid = host_cpuid;
+
+ error = ioctl(ctx->fd, VM_SET_PINNING, &vmpin);
+ return (error);
+}
+
+int
+vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit)
+{
+ int error;
+ struct vm_run vmrun;
+
+ bzero(&vmrun, sizeof(vmrun));
+ vmrun.cpuid = vcpu;
+ vmrun.rip = rip;
+
+ error = ioctl(ctx->fd, VM_RUN, &vmrun);
+ bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit));
+ return (error);
+}
+
+static int
+vm_inject_event_real(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector, int error_code, int error_code_valid)
+{
+ struct vm_event ev;
+
+ bzero(&ev, sizeof(ev));
+ ev.cpuid = vcpu;
+ ev.type = type;
+ ev.vector = vector;
+ ev.error_code = error_code;
+ ev.error_code_valid = error_code_valid;
+
+ return (ioctl(ctx->fd, VM_INJECT_EVENT, &ev));
+}
+
+int
+vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector)
+{
+
+ return (vm_inject_event_real(ctx, vcpu, type, vector, 0, 0));
+}
+
+int
+vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector, int error_code)
+{
+
+ return (vm_inject_event_real(ctx, vcpu, type, vector, error_code, 1));
+}
+
+int
+vm_apicid2vcpu(struct vmctx *ctx, int apicid)
+{
+ /*
+ * The apic id associated with the 'vcpu' has the same numerical value
+ * as the 'vcpu' itself.
+ */
+ return (apicid);
+}
+
+int
+vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector)
+{
+ struct vm_lapic_irq vmirq;
+
+ bzero(&vmirq, sizeof(vmirq));
+ vmirq.cpuid = vcpu;
+ vmirq.vector = vector;
+
+ return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq));
+}
+
+int
+vm_inject_nmi(struct vmctx *ctx, int vcpu)
+{
+ struct vm_nmi vmnmi;
+
+ bzero(&vmnmi, sizeof(vmnmi));
+ vmnmi.cpuid = vcpu;
+
+ return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi));
+}
+
+static struct {
+ const char *name;
+ int type;
+} capstrmap[] = {
+ { "hlt_exit", VM_CAP_HALT_EXIT },
+ { "mtrap_exit", VM_CAP_MTRAP_EXIT },
+ { "pause_exit", VM_CAP_PAUSE_EXIT },
+ { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST },
+ { 0 }
+};
+
+int
+vm_capability_name2type(const char *capname)
+{
+ int i;
+
+ for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) {
+ if (strcmp(capstrmap[i].name, capname) == 0)
+ return (capstrmap[i].type);
+ }
+
+ return (-1);
+}
+
+const char *
+vm_capability_type2name(int type)
+{
+ int i;
+
+ for (i = 0; capstrmap[i].name != NULL; i++) {
+ if (capstrmap[i].type == type)
+ return (capstrmap[i].name);
+ }
+
+ return (NULL);
+}
+
+int
+vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
+ int *retval)
+{
+ int error;
+ struct vm_capability vmcap;
+
+ bzero(&vmcap, sizeof(vmcap));
+ vmcap.cpuid = vcpu;
+ vmcap.captype = cap;
+
+ error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap);
+ *retval = vmcap.capval;
+ return (error);
+}
+
+int
+vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val)
+{
+ struct vm_capability vmcap;
+
+ bzero(&vmcap, sizeof(vmcap));
+ vmcap.cpuid = vcpu;
+ vmcap.captype = cap;
+ vmcap.capval = val;
+
+ return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap));
+}
+
+int
+vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
+{
+ struct vm_pptdev pptdev;
+
+ bzero(&pptdev, sizeof(pptdev));
+ pptdev.bus = bus;
+ pptdev.slot = slot;
+ pptdev.func = func;
+
+ return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev));
+}
+
+int
+vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
+{
+ struct vm_pptdev pptdev;
+
+ bzero(&pptdev, sizeof(pptdev));
+ pptdev.bus = bus;
+ pptdev.slot = slot;
+ pptdev.func = func;
+
+ return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev));
+}
+
+int
+vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ struct vm_pptdev_mmio pptmmio;
+
+ bzero(&pptmmio, sizeof(pptmmio));
+ pptmmio.bus = bus;
+ pptmmio.slot = slot;
+ pptmmio.func = func;
+ pptmmio.gpa = gpa;
+ pptmmio.len = len;
+ pptmmio.hpa = hpa;
+
+ return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio));
+}
+
+int
+vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec)
+{
+ struct vm_pptdev_msi pptmsi;
+
+ bzero(&pptmsi, sizeof(pptmsi));
+ pptmsi.vcpu = vcpu;
+ pptmsi.bus = bus;
+ pptmsi.slot = slot;
+ pptmsi.func = func;
+ pptmsi.destcpu = destcpu;
+ pptmsi.vector = vector;
+ pptmsi.numvec = numvec;
+
+ return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
+}
+
+int
+vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
+{
+ struct vm_pptdev_msix pptmsix;
+
+ bzero(&pptmsix, sizeof(pptmsix));
+ pptmsix.vcpu = vcpu;
+ pptmsix.bus = bus;
+ pptmsix.slot = slot;
+ pptmsix.func = func;
+ pptmsix.idx = idx;
+ pptmsix.msg = msg;
+ pptmsix.addr = addr;
+ pptmsix.vector_control = vector_control;
+
+ return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix);
+}
+
+uint64_t *
+vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
+ int *ret_entries)
+{
+ int error;
+
+ static struct vm_stats vmstats;
+
+ vmstats.cpuid = vcpu;
+
+ error = ioctl(ctx->fd, VM_STATS, &vmstats);
+ if (error == 0) {
+ if (ret_entries)
+ *ret_entries = vmstats.num_entries;
+ if (ret_tv)
+ *ret_tv = vmstats.tv;
+ return (vmstats.statbuf);
+ } else
+ return (NULL);
+}
+
+const char *
+vm_get_stat_desc(struct vmctx *ctx, int index)
+{
+ static struct vm_stat_desc statdesc;
+
+ statdesc.index = index;
+ if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
+ return (statdesc.desc);
+ else
+ return (NULL);
+}
+
+int
+vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state)
+{
+ int error;
+ struct vm_x2apic x2apic;
+
+ bzero(&x2apic, sizeof(x2apic));
+ x2apic.cpuid = vcpu;
+
+ error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic);
+ *state = x2apic.state;
+ return (error);
+}
+
+int
+vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state)
+{
+ int error;
+ struct vm_x2apic x2apic;
+
+ bzero(&x2apic, sizeof(x2apic));
+ x2apic.cpuid = vcpu;
+ x2apic.state = state;
+
+ error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic);
+
+ return (error);
+}
+
+/*
+ * From Intel Vol 3a:
+ * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT
+ */
+int
+vcpu_reset(struct vmctx *vmctx, int vcpu)
+{
+ int error;
+ uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx;
+ uint32_t desc_access, desc_limit;
+ uint16_t sel;
+
+ zero = 0;
+
+ rflags = 0x2;
+ error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
+ if (error)
+ goto done;
+
+ rip = 0xfff0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
+ goto done;
+
+ cr0 = CR0_NE;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0)
+ goto done;
+
+ cr4 = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
+ goto done;
+
+ /*
+ * CS: present, r/w, accessed, 16-bit, byte granularity, usable
+ */
+ desc_base = 0xffff0000;
+ desc_limit = 0xffff;
+ desc_access = 0x0093;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0xf000;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0)
+ goto done;
+
+ /*
+ * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity
+ */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0x0093;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0)
+ goto done;
+
+ /* General purpose registers */
+ rdx = 0xf00;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0)
+ goto done;
+
+ /* GDTR, IDTR */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
+ desc_base, desc_limit, desc_access);
+ if (error != 0)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR,
+ desc_base, desc_limit, desc_access);
+ if (error != 0)
+ goto done;
+
+ /* TR */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0x0000008b;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0)
+ goto done;
+
+ /* LDTR */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0x00000082;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base,
+ desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
+ goto done;
+
+ /* XXX cr2, debug registers */
+
+ error = 0;
+done:
+ return (error);
+}
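
The library calls implemented above compose into a short lifecycle: create the VM, open its device node, give it guest memory, reset the vcpu, then loop on vm_run(). A minimal sketch of such a client follows; it is not part of this commit, the VM name, memory size, and entry point are assumptions, and a real consumer (see usr.sbin/bhyve/bhyverun.c in this merge) does far more work per exit.

    #include <sys/types.h>
    #include <stdlib.h>

    #include <machine/vmm.h>

    #include "vmmapi.h"

    int
    main(void)
    {
            struct vmctx *ctx;
            struct vm_exit vmexit;
            char *membase;
            uint64_t rip = 0x100000;        /* assumed guest entry point */
            int error, vcpu = 0;

            /* Create the in-kernel VM and open its /dev/vmm/<name> node. */
            if (vm_create("sketchvm") != 0)
                    exit(1);
            if ((ctx = vm_open("sketchvm")) == NULL)
                    exit(1);

            /* Back guest-physical [0, 16MB) and map it into this process. */
            if (vm_setup_memory(ctx, 0, 16 * 1024 * 1024, &membase) != 0)
                    exit(1);
            (void)membase;  /* a real loader copies the guest image here */

            /* Establish power-on register state for the vcpu. */
            error = vcpu_reset(ctx, vcpu);

            /* Run the vcpu, resuming after each exit we can ignore. */
            while (error == 0) {
                    error = vm_run(ctx, vcpu, rip, &vmexit);
                    if (error != 0 || vmexit.exitcode == VM_EXITCODE_HLT)
                            break;
                    rip = vmexit.rip + vmexit.inst_length;
            }

            vm_destroy(ctx);
            return (0);
    }
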
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
new file mode 100644
index 0000000..de04252
--- /dev/null
+++ b/lib/libvmmapi/vmmapi.h
@@ -0,0 +1,105 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMMAPI_H_
+#define _VMMAPI_H_
+
+struct vmctx;
+enum x2apic_state;
+
+int vm_create(const char *name);
+struct vmctx *vm_open(const char *name);
+void vm_destroy(struct vmctx *ctx);
+size_t vmm_get_mem_total(void);
+size_t vmm_get_mem_free(void);
+int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len);
+/*
+ * Create a memory segment of 'len' bytes in the guest physical address space
+ * at offset 'gpa'.
+ *
+ * If 'mapaddr' is not NULL then this region is mmap'ed into the address
+ * space of the calling process. If there is an mmap error then *mapaddr
+ * will be set to MAP_FAILED.
+ */
+
+int vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len,
+ char **mapaddr);
+char * vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len);
+int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t base, uint32_t limit, uint32_t access);
+int vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t *base, uint32_t *limit, uint32_t *access);
+int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
+int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
+int vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid);
+int vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid);
+int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
+ struct vm_exit *ret_vmexit);
+int vm_apicid2vcpu(struct vmctx *ctx, int apicid);
+int vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector);
+int vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector, int error_code);
+int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
+int vm_inject_nmi(struct vmctx *ctx, int vcpu);
+int vm_capability_name2type(const char *capname);
+const char *vm_capability_type2name(int type);
+int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
+ int *retval);
+int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
+ int val);
+int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
+int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
+int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+ int dest, int vector, int numvec);
+int vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
+
+/*
+ * Return a pointer to the statistics buffer. Note that this is not MT-safe.
+ */
+uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
+ int *ret_entries);
+const char *vm_get_stat_desc(struct vmctx *ctx, int index);
+
+int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s);
+int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s);
+
+/* Reset vcpu register state */
+int vcpu_reset(struct vmctx *ctx, int vcpu);
+
+/*
+ * FreeBSD specific APIs
+ */
+int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu,
+ uint64_t rip, uint64_t cr3, uint64_t gdtbase,
+ uint64_t rsp);
+void vm_setup_freebsd_gdt(uint64_t *gdtr);
+#endif /* _VMMAPI_H_ */
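
The capability helpers declared above are meant to be chained: translate a user-supplied name to a type, set it, and read it back. A hedged sketch, with the vcpu number and error handling chosen for brevity:

    /* Enable an optional capability by name on vcpu 0; 0 on success. */
    static int
    enable_cap_byname(struct vmctx *ctx, const char *capname)
    {
            int type, val;

            type = vm_capability_name2type(capname);  /* e.g. "hlt_exit" */
            if (type < 0)
                    return (-1);                      /* unknown capability */
            if (vm_set_capability(ctx, 0, type, 1) != 0)
                    return (-1);
            /* Read it back to confirm the kernel accepted the toggle. */
            if (vm_get_capability(ctx, 0, type, &val) != 0 || val == 0)
                    return (-1);
            return (0);
    }
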
diff --git a/lib/libvmmapi/vmmapi_freebsd.c b/lib/libvmmapi/vmmapi_freebsd.c
new file mode 100644
index 0000000..9bd2988
--- /dev/null
+++ b/lib/libvmmapi/vmmapi_freebsd.c
@@ -0,0 +1,183 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <machine/specialreg.h>
+#include <machine/segments.h>
+#include <machine/vmm.h>
+
+#include "vmmapi.h"
+
+#define DESC_UNUSABLE 0x00010000
+
+#define GUEST_NULL_SEL 0
+#define GUEST_CODE_SEL 1
+#define GUEST_DATA_SEL 2
+#define GUEST_GDTR_LIMIT (3 * 8 - 1)
+
+void
+vm_setup_freebsd_gdt(uint64_t *gdtr)
+{
+ gdtr[GUEST_NULL_SEL] = 0;
+ gdtr[GUEST_CODE_SEL] = 0x0020980000000000;
+ gdtr[GUEST_DATA_SEL] = 0x0000900000000000;
+}
+
+/*
+ * Setup the 'vcpu' register set such that it will begin execution at
+ * 'rip' in long mode.
+ */
+int
+vm_setup_freebsd_registers(struct vmctx *vmctx, int vcpu,
+ uint64_t rip, uint64_t cr3, uint64_t gdtbase,
+ uint64_t rsp)
+{
+ int error;
+ uint64_t cr0, cr4, efer, rflags, desc_base;
+ uint32_t desc_access, desc_limit;
+ uint16_t gsel;
+
+ cr0 = CR0_PE | CR0_PG | CR0_NE;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
+ goto done;
+
+ cr4 = CR4_PAE;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
+ goto done;
+
+ efer = EFER_LME | EFER_LMA;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, efer)))
+ goto done;
+
+ rflags = 0x2;
+ error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
+ if (error)
+ goto done;
+
+ desc_base = 0;
+ desc_limit = 0;
+ desc_access = 0x0000209B;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ desc_access = 0x00000093;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ /*
+ * XXX TR is pointing to null selector even though we set the
+ * TSS segment to be usable with a base address and limit of 0.
+ */
+ desc_access = 0x0000008b;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0,
+ DESC_UNUSABLE);
+ if (error)
+ goto done;
+
+ gsel = GSEL(GUEST_CODE_SEL, SEL_KPL);
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0)
+ goto done;
+
+ gsel = GSEL(GUEST_DATA_SEL, SEL_KPL);
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0)
+ goto done;
+
+ /* XXX TR is pointing to the null selector */
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, 0)) != 0)
+ goto done;
+
+ /* LDTR is pointing to the null selector */
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
+ goto done;
+
+ /* entry point */
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
+ goto done;
+
+ /* page table base */
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, cr3)) != 0)
+ goto done;
+
+ desc_base = gdtbase;
+ desc_limit = GUEST_GDTR_LIMIT;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
+ desc_base, desc_limit, 0);
+ if (error != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, rsp)) != 0)
+ goto done;
+
+ error = 0;
+done:
+ return (error);
+}
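
Combined with vm_setup_memory() from vmmapi.c, the two helpers above are all that is needed to aim a vcpu at a 64-bit FreeBSD entry point. A sketch under stated assumptions: 'membase' is the host mapping of guest-physical address 0, the GDT location is an arbitrary illustrative choice, and bhyveload is the real consumer of this pattern.

    static int
    start_freebsd_vcpu(struct vmctx *ctx, char *membase, uint64_t rip,
        uint64_t cr3, uint64_t rsp)
    {
            uint64_t gdtbase = 0x5000;      /* illustrative guest-physical addr */

            /* Write the null/code/data descriptors into guest memory. */
            vm_setup_freebsd_gdt((uint64_t *)(membase + gdtbase));

            /* Long-mode register state: vcpu 0 starts executing at 'rip'. */
            return (vm_setup_freebsd_registers(ctx, 0, rip, cr3, gdtbase, rsp));
    }
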
diff --git a/share/man/man4/bhyve.4 b/share/man/man4/bhyve.4
new file mode 100644
index 0000000..cdfc1e2
--- /dev/null
+++ b/share/man/man4/bhyve.4
@@ -0,0 +1,68 @@
+.\"
+.\" Copyright (c) 2012 NetApp Inc
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd January 5, 2013
+.Dt BHYVE 4
+.Os
+.Sh NAME
+.Nm bhyve
+.Nd virtual machine monitor
+.Sh SYNOPSIS
+.Cd "/usr/sbin/bhyve"
+.Cd "/usr/sbin/bhyveload"
+.Cd "/usr/sbin/bhyvectl"
+.Cd "/boot/kernel/vmm.ko"
+.Sh DESCRIPTION
+.Nm
+is a virtual machine monitor that is hosted by FreeBSD.
+It is used to host unmodified guest operating systems on top of FreeBSD.
+.Pp
+.Nm
+relies heavily on hardware assist provided by the CPU and chipset to virtualize
+processor and memory resources.
+.Sh SEE ALSO
+.Xr bhyve 8 ,
+.Xr bhyveload 8 ,
+.Xr bhyvectl 8 ,
+.Xr vmm 4
+.Sh HISTORY
+.Nm
+first appeared in
+.Fx 10.0 ,
+and was developed at NetApp Inc.
+.Sh AUTHORS
+.Nm
+was developed by
+.An -nosplit
+.An "Peter Grehan" Aq grehan@FreeBSD.org
+and
+.An "Neel Natu" Aq neel@FreeBSD.org
+at NetApp Inc.
+.Sh BUGS
+.Nm
+is considered experimental in
+.Fx .
diff --git a/share/mk/bsd.libnames.mk b/share/mk/bsd.libnames.mk
index 4f8bedd..95f9064 100644
--- a/share/mk/bsd.libnames.mk
+++ b/share/mk/bsd.libnames.mk
@@ -162,6 +162,7 @@ LIBULOG?= ${DESTDIR}${LIBDIR}/libulog.a
LIBUTIL?= ${DESTDIR}${LIBDIR}/libutil.a
LIBUUTIL?= ${DESTDIR}${LIBDIR}/libuutil.a
LIBVGL?= ${DESTDIR}${LIBDIR}/libvgl.a
+LIBVMMAPI?= ${DESTDIR}${LIBDIR}/libvmmapi.a
LIBWIND?= ${DESTDIR}${LIBDIR}/libwind.a
LIBWRAP?= ${DESTDIR}${LIBDIR}/libwrap.a
LIBXPG4?= ${DESTDIR}${LIBDIR}/libxpg4.a
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
new file mode 100644
index 0000000..024c30e
--- /dev/null
+++ b/sys/amd64/include/vmm.h
@@ -0,0 +1,293 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $
+ */
+
+#ifndef _VMM_H_
+#define _VMM_H_
+
+#ifdef _KERNEL
+
+#define VM_MAX_NAMELEN 32
+
+struct vm;
+struct vm_memory_segment;
+struct seg_desc;
+struct vm_exit;
+struct vm_run;
+struct vlapic;
+
+enum x2apic_state;
+
+typedef int (*vmm_init_func_t)(void);
+typedef int (*vmm_cleanup_func_t)(void);
+typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
+typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip);
+typedef void (*vmi_cleanup_func_t)(void *vmi);
+typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa,
+ vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot,
+ boolean_t superpages_ok);
+typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa);
+typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
+ uint64_t *retval);
+typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
+ uint64_t val);
+typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
+ int type, int vector,
+ uint32_t code, int code_valid);
+typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
+typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
+
+struct vmm_ops {
+ vmm_init_func_t init; /* module wide initialization */
+ vmm_cleanup_func_t cleanup;
+
+ vmi_init_func_t vminit; /* vm-specific initialization */
+ vmi_run_func_t vmrun;
+ vmi_cleanup_func_t vmcleanup;
+ vmi_mmap_set_func_t vmmmap_set;
+ vmi_mmap_get_func_t vmmmap_get;
+ vmi_get_register_t vmgetreg;
+ vmi_set_register_t vmsetreg;
+ vmi_get_desc_t vmgetdesc;
+ vmi_set_desc_t vmsetdesc;
+ vmi_inject_event_t vminject;
+ vmi_get_cap_t vmgetcap;
+ vmi_set_cap_t vmsetcap;
+};
+
+extern struct vmm_ops vmm_ops_intel;
+extern struct vmm_ops vmm_ops_amd;
+
+struct vm *vm_create(const char *name);
+void vm_destroy(struct vm *vm);
+const char *vm_name(struct vm *vm);
+int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
+int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
+vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
+int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg);
+int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
+int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
+int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *ret_desc);
+int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc);
+int vm_get_pinning(struct vm *vm, int vcpu, int *cpuid);
+int vm_set_pinning(struct vm *vm, int vcpu, int cpuid);
+int vm_run(struct vm *vm, struct vm_run *vmrun);
+int vm_inject_event(struct vm *vm, int vcpu, int type,
+ int vector, uint32_t error_code, int error_code_valid);
+int vm_inject_nmi(struct vm *vm, int vcpu);
+int vm_nmi_pending(struct vm *vm, int vcpuid);
+void vm_nmi_clear(struct vm *vm, int vcpuid);
+uint64_t *vm_guest_msrs(struct vm *vm, int cpu);
+struct vlapic *vm_lapic(struct vm *vm, int cpu);
+int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
+int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
+int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
+int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
+void vm_activate_cpu(struct vm *vm, int vcpu);
+cpuset_t vm_active_cpus(struct vm *vm);
+struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
+
+/*
+ * Return 1 if device indicated by bus/slot/func is supposed to be a
+ * pci passthrough device.
+ *
+ * Return 0 otherwise.
+ */
+int vmm_is_pptdev(int bus, int slot, int func);
+
+void *vm_iommu_domain(struct vm *vm);
+
+enum vcpu_state {
+ VCPU_IDLE,
+ VCPU_RUNNING,
+ VCPU_CANNOT_RUN,
+};
+
+int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state);
+enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu);
+
+static int __inline
+vcpu_is_running(struct vm *vm, int vcpu)
+{
+ return (vcpu_get_state(vm, vcpu) == VCPU_RUNNING);
+}
+
+void *vcpu_stats(struct vm *vm, int vcpu);
+void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
+
+#endif /* KERNEL */
+
+#include <machine/vmm_instruction_emul.h>
+
+#define VM_MAXCPU 8 /* maximum virtual cpus */
+
+/*
+ * Identifiers for events that can be injected into the VM
+ */
+enum vm_event_type {
+ VM_EVENT_NONE,
+ VM_HW_INTR,
+ VM_NMI,
+ VM_HW_EXCEPTION,
+ VM_SW_INTR,
+ VM_PRIV_SW_EXCEPTION,
+ VM_SW_EXCEPTION,
+ VM_EVENT_MAX
+};
+
+/*
+ * Identifiers for architecturally defined registers.
+ */
+enum vm_reg_name {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15,
+ VM_REG_GUEST_CR0,
+ VM_REG_GUEST_CR3,
+ VM_REG_GUEST_CR4,
+ VM_REG_GUEST_DR7,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RIP,
+ VM_REG_GUEST_RFLAGS,
+ VM_REG_GUEST_ES,
+ VM_REG_GUEST_CS,
+ VM_REG_GUEST_SS,
+ VM_REG_GUEST_DS,
+ VM_REG_GUEST_FS,
+ VM_REG_GUEST_GS,
+ VM_REG_GUEST_LDTR,
+ VM_REG_GUEST_TR,
+ VM_REG_GUEST_IDTR,
+ VM_REG_GUEST_GDTR,
+ VM_REG_GUEST_EFER,
+ VM_REG_LAST
+};
+
+/*
+ * Identifiers for optional vmm capabilities
+ */
+enum vm_cap_type {
+ VM_CAP_HALT_EXIT,
+ VM_CAP_MTRAP_EXIT,
+ VM_CAP_PAUSE_EXIT,
+ VM_CAP_UNRESTRICTED_GUEST,
+ VM_CAP_MAX
+};
+
+enum x2apic_state {
+ X2APIC_ENABLED,
+ X2APIC_AVAILABLE,
+ X2APIC_DISABLED,
+ X2APIC_STATE_LAST
+};
+
+/*
+ * The 'access' field has the format specified in Table 21-2 of the Intel
+ * Architecture Manual vol 3b.
+ *
+ * XXX The contents of the 'access' field are architecturally defined except
+ * bit 16 - Segment Unusable.
+ */
+struct seg_desc {
+ uint64_t base;
+ uint32_t limit;
+ uint32_t access;
+};
+
+enum vm_exitcode {
+ VM_EXITCODE_INOUT,
+ VM_EXITCODE_VMX,
+ VM_EXITCODE_BOGUS,
+ VM_EXITCODE_RDMSR,
+ VM_EXITCODE_WRMSR,
+ VM_EXITCODE_HLT,
+ VM_EXITCODE_MTRAP,
+ VM_EXITCODE_PAUSE,
+ VM_EXITCODE_PAGING,
+ VM_EXITCODE_SPINUP_AP,
+ VM_EXITCODE_MAX
+};
+
+struct vm_exit {
+ enum vm_exitcode exitcode;
+ int inst_length; /* 0 means unknown */
+ uint64_t rip;
+ union {
+ struct {
+ uint16_t bytes:3; /* 1 or 2 or 4 */
+ uint16_t in:1; /* out is 0, in is 1 */
+ uint16_t string:1;
+ uint16_t rep:1;
+ uint16_t port;
+ uint32_t eax; /* valid for out */
+ } inout;
+ struct {
+ uint64_t gpa;
+ struct vie vie;
+ } paging;
+ /*
+ * VMX specific payload. Used when there is no "better"
+ * exitcode to represent the VM-exit.
+ */
+ struct {
+ int error; /* vmx inst error */
+ uint32_t exit_reason;
+ uint64_t exit_qualification;
+ } vmx;
+ struct {
+ uint32_t code; /* ecx value */
+ uint64_t wval;
+ } msr;
+ struct {
+ int vcpu;
+ uint64_t rip;
+ } spinup_ap;
+ } u;
+};
+
+#endif /* _VMM_H_ */
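
The union inside struct vm_exit is discriminated by 'exitcode', so a userspace handler reduces to a switch over enum vm_exitcode. An illustrative skeleton (the full dispatcher added by this commit is the vmexit handling in usr.sbin/bhyve/bhyverun.c):

    static int
    handle_vmexit(struct vm_exit *vme)
    {
            switch (vme->exitcode) {
            case VM_EXITCODE_INOUT:
                    /* vme->u.inout: port, bytes, in/out, rep, string */
                    return (0);
            case VM_EXITCODE_RDMSR:
            case VM_EXITCODE_WRMSR:
                    /* vme->u.msr.code is the MSR number (guest %ecx) */
                    return (0);
            case VM_EXITCODE_PAGING:
                    /* vme->u.paging.gpa faulted; 'vie' is the decoded insn */
                    return (0);
            case VM_EXITCODE_HLT:
                    return (1);     /* guest executed HLT */
            default:
                    return (-1);    /* bogus or unhandled exit */
            }
    }
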
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
new file mode 100644
index 0000000..79f893d
--- /dev/null
+++ b/sys/amd64/include/vmm_dev.h
@@ -0,0 +1,215 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $
+ */
+
+#ifndef _VMM_DEV_H_
+#define _VMM_DEV_H_
+
+#ifdef _KERNEL
+void vmmdev_init(void);
+int vmmdev_cleanup(void);
+#endif
+
+struct vm_memory_segment {
+ vm_paddr_t gpa; /* in */
+ size_t len; /* in */
+};
+
+struct vm_register {
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ uint64_t regval;
+};
+
+struct vm_seg_desc { /* data or code segment */
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ struct seg_desc desc;
+};
+
+struct vm_pin {
+ int vm_cpuid;
+ int host_cpuid; /* -1 to unpin */
+};
+
+struct vm_run {
+ int cpuid;
+ uint64_t rip; /* start running here */
+ struct vm_exit vm_exit;
+};
+
+struct vm_event {
+ int cpuid;
+ enum vm_event_type type;
+ int vector;
+ uint32_t error_code;
+ int error_code_valid;
+};
+
+struct vm_lapic_irq {
+ int cpuid;
+ int vector;
+};
+
+struct vm_capability {
+ int cpuid;
+ enum vm_cap_type captype;
+ int capval;
+ int allcpus;
+};
+
+struct vm_pptdev {
+ int bus;
+ int slot;
+ int func;
+};
+
+struct vm_pptdev_mmio {
+ int bus;
+ int slot;
+ int func;
+ vm_paddr_t gpa;
+ vm_paddr_t hpa;
+ size_t len;
+};
+
+struct vm_pptdev_msi {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int numvec; /* 0 means disabled */
+ int vector;
+ int destcpu;
+};
+
+struct vm_pptdev_msix {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int idx;
+ uint32_t msg;
+ uint32_t vector_control;
+ uint64_t addr;
+};
+
+struct vm_nmi {
+ int cpuid;
+};
+
+#define MAX_VM_STATS 64
+struct vm_stats {
+ int cpuid; /* in */
+ int num_entries; /* out */
+ struct timeval tv;
+ uint64_t statbuf[MAX_VM_STATS];
+};
+
+struct vm_stat_desc {
+ int index; /* in */
+ char desc[128]; /* out */
+};
+
+struct vm_x2apic {
+ int cpuid;
+ enum x2apic_state state;
+};
+
+enum {
+ IOCNUM_RUN,
+ IOCNUM_SET_PINNING,
+ IOCNUM_GET_PINNING,
+ IOCNUM_MAP_MEMORY,
+ IOCNUM_GET_MEMORY_SEG,
+ IOCNUM_SET_REGISTER,
+ IOCNUM_GET_REGISTER,
+ IOCNUM_SET_SEGMENT_DESCRIPTOR,
+ IOCNUM_GET_SEGMENT_DESCRIPTOR,
+ IOCNUM_INJECT_EVENT,
+ IOCNUM_LAPIC_IRQ,
+ IOCNUM_SET_CAPABILITY,
+ IOCNUM_GET_CAPABILITY,
+ IOCNUM_BIND_PPTDEV,
+ IOCNUM_UNBIND_PPTDEV,
+ IOCNUM_MAP_PPTDEV_MMIO,
+ IOCNUM_PPTDEV_MSI,
+ IOCNUM_PPTDEV_MSIX,
+ IOCNUM_INJECT_NMI,
+ IOCNUM_VM_STATS,
+ IOCNUM_VM_STAT_DESC,
+ IOCNUM_SET_X2APIC_STATE,
+ IOCNUM_GET_X2APIC_STATE,
+};
+
+#define VM_RUN \
+ _IOWR('v', IOCNUM_RUN, struct vm_run)
+#define VM_SET_PINNING \
+ _IOW('v', IOCNUM_SET_PINNING, struct vm_pin)
+#define VM_GET_PINNING \
+ _IOWR('v', IOCNUM_GET_PINNING, struct vm_pin)
+#define VM_MAP_MEMORY \
+ _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment)
+#define VM_GET_MEMORY_SEG \
+ _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment)
+#define VM_SET_REGISTER \
+ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
+#define VM_GET_REGISTER \
+ _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register)
+#define VM_SET_SEGMENT_DESCRIPTOR \
+ _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_GET_SEGMENT_DESCRIPTOR \
+ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_INJECT_EVENT \
+ _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event)
+#define VM_LAPIC_IRQ \
+ _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
+#define VM_SET_CAPABILITY \
+ _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability)
+#define VM_GET_CAPABILITY \
+ _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability)
+#define VM_BIND_PPTDEV \
+ _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev)
+#define VM_UNBIND_PPTDEV \
+ _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev)
+#define VM_MAP_PPTDEV_MMIO \
+ _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
+#define VM_PPTDEV_MSI \
+ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
+#define VM_PPTDEV_MSIX \
+ _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix)
+#define VM_INJECT_NMI \
+ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
+#define VM_STATS \
+ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats)
+#define VM_STAT_DESC \
+ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc)
+#define VM_SET_X2APIC_STATE \
+ _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
+#define VM_GET_X2APIC_STATE \
+ _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
+#endif
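
Everything libvmmapi does funnels through these ioctls, so they can also be driven directly. A hedged sketch that reads vcpu 0's %rip straight from the device node (the VM name is an assumption):

    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <fcntl.h>
    #include <unistd.h>

    #include <machine/vmm.h>
    #include <machine/vmm_dev.h>

    static int
    read_guest_rip(uint64_t *rip)
    {
            struct vm_register vmreg = {
                    .cpuid = 0,
                    .regnum = VM_REG_GUEST_RIP,
            };
            int error, fd;

            if ((fd = open("/dev/vmm/sketchvm", O_RDWR)) < 0)
                    return (-1);
            error = ioctl(fd, VM_GET_REGISTER, &vmreg);
            if (error == 0)
                    *rip = vmreg.regval;
            close(fd);
            return (error);
    }
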
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
new file mode 100644
index 0000000..4cc494b
--- /dev/null
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -0,0 +1,113 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_INSTRUCTION_EMUL_H_
+#define _VMM_INSTRUCTION_EMUL_H_
+
+/*
+ * The data structures 'vie' and 'vie_op' are meant to be opaque to the
+ * consumers of instruction decoding. The only reason why their contents
+ * need to be exposed is because they are part of the 'vm_exit' structure.
+ */
+struct vie_op {
+ uint8_t op_byte; /* actual opcode byte */
+ uint8_t op_type; /* type of operation (e.g. MOV) */
+ uint16_t op_flags;
+};
+
+#define VIE_INST_SIZE 15
+struct vie {
+ uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
+ uint8_t num_valid; /* size of the instruction */
+ uint8_t num_processed;
+
+ uint8_t rex_w:1, /* REX prefix */
+ rex_r:1,
+ rex_x:1,
+ rex_b:1;
+
+ uint8_t mod:2, /* ModRM byte */
+ reg:4,
+ rm:4;
+
+ uint8_t ss:2, /* SIB byte */
+ index:4,
+ base:4;
+
+ uint8_t disp_bytes;
+ uint8_t imm_bytes;
+
+ uint8_t scale;
+ int base_register; /* VM_REG_GUEST_xyz */
+ int index_register; /* VM_REG_GUEST_xyz */
+
+ int64_t displacement; /* optional addr displacement */
+ int64_t immediate; /* optional immediate operand */
+
+ uint8_t decoded; /* set to 1 if successfully decoded */
+
+ struct vie_op op; /* opcode description */
+};
+
+/*
+ * Callback functions to read and write memory regions.
+ */
+typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t *rval, int rsize, void *arg);
+
+typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t wval, int wsize, void *arg);
+
+/*
+ * Emulate the decoded 'vie' instruction.
+ *
+ * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
+ * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
+ * callback functions.
+ *
+ * 'void *vm' should be 'struct vm *' when called from kernel context and
+ * 'struct vmctx *' when called from user context.
+ */
+int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t mrr, mem_region_write_t mrw,
+ void *mrarg);
+
+#ifdef _KERNEL
+/*
+ * APIs to fetch and decode the instruction from nested page fault handler.
+ */
+int vmm_fetch_instruction(struct vm *vm, int cpuid,
+ uint64_t rip, int inst_length, uint64_t cr3,
+ struct vie *vie);
+
+int vmm_decode_instruction(struct vm *vm, int cpuid,
+ uint64_t gla, struct vie *vie);
+#endif /* _KERNEL */
+
+#endif /* _VMM_INSTRUCTION_EMUL_H_ */
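
The callback pair lets vmm_emulate_instruction() complete a faulting access without knowing anything about the device behind it. A sketch backing one hypothetical 4-byte MMIO register; the address, names, and semantics are illustrative:

    #include <errno.h>

    #define MMIO_REG_GPA    0xd0000000UL    /* hypothetical device register */

    static uint32_t mmio_reg;               /* device state behind the region */

    static int
    mmio_read(void *vm, int cpuid, uint64_t gpa, uint64_t *rval, int rsize,
        void *arg)
    {
            if (gpa != MMIO_REG_GPA || rsize != 4)
                    return (EINVAL);
            *rval = mmio_reg;
            return (0);
    }

    static int
    mmio_write(void *vm, int cpuid, uint64_t gpa, uint64_t wval, int wsize,
        void *arg)
    {
            if (gpa != MMIO_REG_GPA || wsize != 4)
                    return (EINVAL);
            mmio_reg = (uint32_t)wval;
            return (0);
    }

Once a paging exit has been fetched and decoded into a 'vie', the two functions are passed as the 'mrr' and 'mrw' arguments, with any per-device state carried through 'mrarg'.
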
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
new file mode 100644
index 0000000..dc071d3
--- /dev/null
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -0,0 +1,265 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/smp.h>
+
+#include <machine/vmm.h>
+#include "io/iommu.h"
+
+static int
+amdv_init(void)
+{
+
+ printf("amdv_init: not implemented\n");
+ return (ENXIO);
+}
+
+static int
+amdv_cleanup(void)
+{
+
+ printf("amdv_cleanup: not implemented\n");
+ return (ENXIO);
+}
+
+static void *
+amdv_vminit(struct vm *vm)
+{
+
+ printf("amdv_vminit: not implemented\n");
+ return (NULL);
+}
+
+static int
+amdv_vmrun(void *arg, int vcpu, register_t rip)
+{
+
+ printf("amdv_vmrun: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amdv_vmcleanup(void *arg)
+{
+
+ printf("amdv_vmcleanup: not implemented\n");
+ return;
+}
+
+static int
+amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+
+ printf("amdv_vmmmap_set: not implemented\n");
+ return (EINVAL);
+}
+
+static vm_paddr_t
+amdv_vmmmap_get(void *arg, vm_paddr_t gpa)
+{
+
+ printf("amdv_vmmmap_get: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
+{
+
+ printf("amdv_getreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val)
+{
+
+ printf("amdv_setreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_inject_event(void *vmi, int vcpu, int type, int vector,
+ uint32_t error_code, int error_code_valid)
+{
+
+ printf("amdv_inject_event: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getcap(void *arg, int vcpu, int type, int *retval)
+{
+
+ printf("amdv_getcap: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setcap(void *arg, int vcpu, int type, int val)
+{
+
+ printf("amdv_setcap: not implemented\n");
+ return (EINVAL);
+}
+
+struct vmm_ops vmm_ops_amd = {
+ amdv_init,
+ amdv_cleanup,
+ amdv_vminit,
+ amdv_vmrun,
+ amdv_vmcleanup,
+ amdv_vmmmap_set,
+ amdv_vmmmap_get,
+ amdv_getreg,
+ amdv_setreg,
+ amdv_getdesc,
+ amdv_setdesc,
+ amdv_inject_event,
+ amdv_getcap,
+ amdv_setcap
+};
+
+static int
+amd_iommu_init(void)
+{
+
+ printf("amd_iommu_init: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amd_iommu_cleanup(void)
+{
+
+ printf("amd_iommu_cleanup: not implemented\n");
+}
+
+static void
+amd_iommu_enable(void)
+{
+
+ printf("amd_iommu_enable: not implemented\n");
+}
+
+static void
+amd_iommu_disable(void)
+{
+
+ printf("amd_iommu_disable: not implemented\n");
+}
+
+static void *
+amd_iommu_create_domain(vm_paddr_t maxaddr)
+{
+
+ printf("amd_iommu_create_domain: not implemented\n");
+ return (NULL);
+}
+
+static void
+amd_iommu_destroy_domain(void *domain)
+{
+
+ printf("amd_iommu_destroy_domain: not implemented\n");
+}
+
+static uint64_t
+amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa,
+ uint64_t len)
+{
+
+ printf("amd_iommu_create_mapping: not implemented\n");
+ return (0);
+}
+
+static uint64_t
+amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len)
+{
+
+ printf("amd_iommu_remove_mapping: not implemented\n");
+ return (0);
+}
+
+static void
+amd_iommu_add_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_add_device: not implemented\n");
+}
+
+static void
+amd_iommu_remove_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_remove_device: not implemented\n");
+}
+
+static void
+amd_iommu_invalidate_tlb(void *domain)
+{
+
+ printf("amd_iommu_invalidate_tlb: not implemented\n");
+}
+
+struct iommu_ops iommu_ops_amd = {
+ amd_iommu_init,
+ amd_iommu_cleanup,
+ amd_iommu_enable,
+ amd_iommu_disable,
+ amd_iommu_create_domain,
+ amd_iommu_destroy_domain,
+ amd_iommu_create_mapping,
+ amd_iommu_remove_mapping,
+ amd_iommu_add_device,
+ amd_iommu_remove_device,
+ amd_iommu_invalidate_tlb,
+};
diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c
new file mode 100644
index 0000000..4f91601
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.c
@@ -0,0 +1,392 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/param.h>
+#include <machine/cpufunc.h>
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmx_cpufunc.h"
+#include "vmx_msr.h"
+#include "vmx.h"
+#include "ept.h"
+
+#define EPT_PWL4(cap) ((cap) & (1UL << 6))
+#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
+#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
+#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
+#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
+#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
+
+#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
+#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
+
+#define INVEPT_ALL_TYPES_MASK 0x6000000UL
+#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
+
+#define EPT_PG_RD (1 << 0)
+#define EPT_PG_WR (1 << 1)
+#define EPT_PG_EX (1 << 2)
+#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
+#define EPT_PG_IGNORE_PAT (1 << 6)
+#define EPT_PG_SUPERPAGE (1 << 7)
+
+#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
+
+MALLOC_DECLARE(M_VMX);
+
+static uint64_t page_sizes_mask;
+
+int
+ept_init(void)
+{
+ int page_shift;
+ uint64_t cap;
+
+ cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
+
+ /*
+ * Verify that:
+ * - page walk length is 4 steps
+ * - extended page tables can be laid out in write-back memory
+ * - invvpid instruction with all possible types is supported
+ * - invept instruction with all possible types is supported
+ */
+ if (!EPT_PWL4(cap) ||
+ !EPT_MEMORY_TYPE_WB(cap) ||
+ !INVVPID_SUPPORTED(cap) ||
+ !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
+ !INVEPT_SUPPORTED(cap) ||
+ !INVEPT_ALL_TYPES_SUPPORTED(cap))
+ return (EINVAL);
+
+ /* Set bits in 'page_sizes_mask' for each valid page size */
+ page_shift = PAGE_SHIFT;
+ page_sizes_mask = 1UL << page_shift; /* 4KB page */
+
+ page_shift += 9;
+ if (EPT_PDE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
+
+ page_shift += 9;
+ if (EPT_PDPTE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
+
+ return (0);
+}
+
+#if 0
+static void
+ept_dump(uint64_t *ptp, int nlevels)
+{
+ int i, t, tabs;
+ uint64_t *ptpnext, ptpval;
+
+ if (--nlevels < 0)
+ return;
+
+ tabs = 3 - nlevels;
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("PTP = %p\n", ptp);
+
+ for (i = 0; i < 512; i++) {
+ ptpval = ptp[i];
+
+ if (ptpval == 0)
+ continue;
+
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("%3d 0x%016lx\n", i, ptpval);
+
+ if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
+ ptpnext = (uint64_t *)
+ PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+ ept_dump(ptpnext, nlevels);
+ }
+ }
+}
+#endif
+
+static size_t
+ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
+{
+ int spshift, ptpshift, ptpindex, nlevels;
+
+ /*
+ * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - superpage sizes supported by the processor
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'length'
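+ *
+ * For example, with 2MB superpages supported and gpa = hpa =
+ * 0x40200000 with length = 4MB, the larger candidates fail the
+ * size-mask or alignment checks but the 2MB candidate passes all
+ * three, so the loop below settles on spshift = 21 (a 2MB leaf).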
+ */
+ spshift = PAGE_SHIFT;
+ if (spok)
+ spshift += (EPT_PWLEVELS - 1) * 9;
+ while (spshift >= PAGE_SHIFT) {
+ uint64_t spsize = 1UL << spshift;
+ if ((page_sizes_mask & spsize) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ length >= spsize) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ if (spshift < PAGE_SHIFT) {
+ panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
+ "length 0x%016lx, page_sizes_mask 0x%016lx",
+ gpa, hpa, length, page_sizes_mask);
+ }
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) {
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift)
+ break;
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create the next level page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
+ ptp[ptpindex] = vtophys(nlp);
+ ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
+ }
+
+ /* Work our way down to the next level page table page */
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
+ panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
+ "mismatch\n", gpa, ptpshift);
+ }
+
+ if (prot != VM_PROT_NONE) {
+ /* Do the mapping */
+ ptp[ptpindex] = hpa;
+
+ /* Apply the access controls */
+ if (prot & VM_PROT_READ)
+ ptp[ptpindex] |= EPT_PG_RD;
+ if (prot & VM_PROT_WRITE)
+ ptp[ptpindex] |= EPT_PG_WR;
+ if (prot & VM_PROT_EXECUTE)
+ ptp[ptpindex] |= EPT_PG_EX;
+
+ /*
+ * XXX should we enforce this memory type by setting the
+ * 'ignore PAT' bit to 1?
+ */
+ ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
+
+ if (nlevels > 0)
+ ptp[ptpindex] |= EPT_PG_SUPERPAGE;
+ } else {
+ /* Remove the mapping */
+ ptp[ptpindex] = 0;
+ }
+
+ return (1UL << ptpshift);
+}
+
+static vm_paddr_t
+ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
+{
+ int nlevels, ptpshift, ptpindex;
+ uint64_t ptpval, hpabase, pgmask;
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) {
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ ptpval = ptp[ptpindex];
+
+ /* Cannot make progress beyond this point */
+ if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
+ break;
+
+ if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
+ pgmask = (1UL << ptpshift) - 1;
+ hpabase = ptpval & ~pgmask;
+ return (hpabase | (gpa & pgmask));
+ }
+
+ /* Work our way down to the next level page table page */
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+ }
+
+ return ((vm_paddr_t)-1);
+}
+
+static void
+ept_free_pt_entry(pt_entry_t pte)
+{
+ if (pte == 0)
+ return;
+
+ /* sanity check */
+ if ((pte & EPT_PG_SUPERPAGE) != 0)
+ panic("ept_free_pt_entry: pte cannot have superpage bit");
+
+ return;
+}
+
+static void
+ept_free_pd_entry(pd_entry_t pde)
+{
+ pt_entry_t *pt;
+ int i;
+
+ if (pde == 0)
+ return;
+
+ if ((pde & EPT_PG_SUPERPAGE) == 0) {
+ pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
+ for (i = 0; i < NPTEPG; i++)
+ ept_free_pt_entry(pt[i]);
+ free(pt, M_VMX); /* free the page table page */
+ }
+}
+
+static void
+ept_free_pdp_entry(pdp_entry_t pdpe)
+{
+ pd_entry_t *pd;
+ int i;
+
+ if (pdpe == 0)
+ return;
+
+ if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
+ pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
+ for (i = 0; i < NPDEPG; i++)
+ ept_free_pd_entry(pd[i]);
+ free(pd, M_VMX); /* free the page directory page */
+ }
+}
+
+static void
+ept_free_pml4_entry(pml4_entry_t pml4e)
+{
+ pdp_entry_t *pdp;
+ int i;
+
+ if (pml4e == 0)
+ return;
+
+ if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
+ for (i = 0; i < NPDPEPG; i++)
+ ept_free_pdp_entry(pdp[i]);
+ free(pdp, M_VMX); /* free the page directory ptr page */
+ }
+}
+
+void
+ept_vmcleanup(struct vmx *vmx)
+{
+ int i;
+
+ for (i = 0; i < NPML4EPG; i++)
+ ept_free_pml4_entry(vmx->pml4ept[i]);
+}
+
+int
+ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+ size_t n;
+ struct vmx *vmx = arg;
+
+ while (len > 0) {
+ n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
+ prot, spok);
+ len -= n;
+ gpa += n;
+ hpa += n;
+ }
+
+ return (0);
+}
+
+vm_paddr_t
+ept_vmmmap_get(void *arg, vm_paddr_t gpa)
+{
+ vm_paddr_t hpa;
+ struct vmx *vmx;
+
+ vmx = arg;
+ hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
+ return (hpa);
+}
+
+static void
+invept_single_context(void *arg)
+{
+ struct invept_desc desc = *(struct invept_desc *)arg;
+
+ invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
+}
+
+void
+ept_invalidate_mappings(u_long pml4ept)
+{
+ struct invept_desc invept_desc = { 0 };
+
+ invept_desc.eptp = EPTP(pml4ept);
+
+ smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
+}
diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h
new file mode 100644
index 0000000..2d7258d
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _EPT_H_
+#define _EPT_H_
+
+struct vmx;
+
+#define EPT_PWLEVELS 4 /* page walk levels */
+#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
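+/*
+ * The low EPTP bits encode properties of the walk: bits 2:0 hold the
+ * EPT paging-structure memory type (PAT_WRITE_BACK == 6) and bits 5:3
+ * hold the page-walk length minus one, hence (EPT_PWLEVELS - 1) above.
+ */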
+
+int ept_init(void);
+int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
+vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa);
+void ept_invalidate_mappings(u_long ept_pml4);
+void ept_vmcleanup(struct vmx *vmx);
+#endif
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
new file mode 100644
index 0000000..a5784dd
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -0,0 +1,551 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/pcpu.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/segments.h>
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmcs.h"
+#include "vmx_cpufunc.h"
+#include "ept.h"
+#include "vmx.h"
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+static uint64_t
+vmcs_fix_regval(uint32_t encoding, uint64_t val)
+{
+
+ switch (encoding) {
+ case VMCS_GUEST_CR0:
+ val = vmx_fix_cr0(val);
+ break;
+ case VMCS_GUEST_CR4:
+ val = vmx_fix_cr4(val);
+ break;
+ default:
+ break;
+ }
+ return (val);
+}
+
+static uint32_t
+vmcs_field_encoding(int ident)
+{
+ switch (ident) {
+ case VM_REG_GUEST_CR0:
+ return (VMCS_GUEST_CR0);
+ case VM_REG_GUEST_CR3:
+ return (VMCS_GUEST_CR3);
+ case VM_REG_GUEST_CR4:
+ return (VMCS_GUEST_CR4);
+ case VM_REG_GUEST_DR7:
+ return (VMCS_GUEST_DR7);
+ case VM_REG_GUEST_RSP:
+ return (VMCS_GUEST_RSP);
+ case VM_REG_GUEST_RIP:
+ return (VMCS_GUEST_RIP);
+ case VM_REG_GUEST_RFLAGS:
+ return (VMCS_GUEST_RFLAGS);
+ case VM_REG_GUEST_ES:
+ return (VMCS_GUEST_ES_SELECTOR);
+ case VM_REG_GUEST_CS:
+ return (VMCS_GUEST_CS_SELECTOR);
+ case VM_REG_GUEST_SS:
+ return (VMCS_GUEST_SS_SELECTOR);
+ case VM_REG_GUEST_DS:
+ return (VMCS_GUEST_DS_SELECTOR);
+ case VM_REG_GUEST_FS:
+ return (VMCS_GUEST_FS_SELECTOR);
+ case VM_REG_GUEST_GS:
+ return (VMCS_GUEST_GS_SELECTOR);
+ case VM_REG_GUEST_TR:
+ return (VMCS_GUEST_TR_SELECTOR);
+ case VM_REG_GUEST_LDTR:
+ return (VMCS_GUEST_LDTR_SELECTOR);
+ case VM_REG_GUEST_EFER:
+ return (VMCS_GUEST_IA32_EFER);
+ default:
+ return (-1);
+ }
+}
+
+static int
+vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc)
+{
+
+ switch (seg) {
+ case VM_REG_GUEST_ES:
+ *base = VMCS_GUEST_ES_BASE;
+ *lim = VMCS_GUEST_ES_LIMIT;
+ *acc = VMCS_GUEST_ES_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_CS:
+ *base = VMCS_GUEST_CS_BASE;
+ *lim = VMCS_GUEST_CS_LIMIT;
+ *acc = VMCS_GUEST_CS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_SS:
+ *base = VMCS_GUEST_SS_BASE;
+ *lim = VMCS_GUEST_SS_LIMIT;
+ *acc = VMCS_GUEST_SS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_DS:
+ *base = VMCS_GUEST_DS_BASE;
+ *lim = VMCS_GUEST_DS_LIMIT;
+ *acc = VMCS_GUEST_DS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_FS:
+ *base = VMCS_GUEST_FS_BASE;
+ *lim = VMCS_GUEST_FS_LIMIT;
+ *acc = VMCS_GUEST_FS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_GS:
+ *base = VMCS_GUEST_GS_BASE;
+ *lim = VMCS_GUEST_GS_LIMIT;
+ *acc = VMCS_GUEST_GS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_TR:
+ *base = VMCS_GUEST_TR_BASE;
+ *lim = VMCS_GUEST_TR_LIMIT;
+ *acc = VMCS_GUEST_TR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_LDTR:
+ *base = VMCS_GUEST_LDTR_BASE;
+ *lim = VMCS_GUEST_LDTR_LIMIT;
+ *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_IDTR:
+ *base = VMCS_GUEST_IDTR_BASE;
+ *lim = VMCS_GUEST_IDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ case VM_REG_GUEST_GDTR:
+ *base = VMCS_GUEST_GDTR_BASE;
+ *lim = VMCS_GUEST_GDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+int
+vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval)
+{
+ int error;
+ uint32_t encoding;
+
+ /*
+ * If we need to get at vmx-specific state in the VMCS we can bypass
+ * the translation of 'ident' to 'encoding' by simply setting the
+ * sign bit. As it so happens, the upper 16 bits are reserved (i.e.
+ * set to 0) in the encodings for the VMCS so we are free to use the
+ * sign bit.
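+ *
+ * For example, VMCS_IDENT(VMCS_GUEST_ACTIVITY) yields 0x80004826,
+ * which is negative when viewed as an 'int'; masking off the sign
+ * bit below recovers the raw encoding 0x4826.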
+ */
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ VMPTRLD(vmcs);
+ error = vmread(encoding, retval);
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val)
+{
+ int error;
+ uint32_t encoding;
+
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ val = vmcs_fix_regval(encoding, val);
+
+ VMPTRLD(vmcs);
+ error = vmwrite(encoding, val);
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_setdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs);
+ if ((error = vmwrite(base, desc->base)) != 0)
+ goto done;
+
+ if ((error = vmwrite(limit, desc->limit)) != 0)
+ goto done;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmwrite(access, desc->access)) != 0)
+ goto done;
+ }
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+ uint64_t u64;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_getdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs);
+ if ((error = vmread(base, &u64)) != 0)
+ goto done;
+ desc->base = u64;
+
+ if ((error = vmread(limit, &u64)) != 0)
+ goto done;
+ desc->limit = u64;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmread(access, &u64)) != 0)
+ goto done;
+ desc->access = u64;
+ }
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
+{
+ int error;
+
+ VMPTRLD(vmcs);
+
+ /*
+ * Guest MSRs are saved in the VM-exit MSR-store area.
+ * Guest MSRs are loaded from the VM-entry MSR-load area.
+ * Both areas point to the same location in memory.
+ */
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0)
+ goto done;
+
+ error = 0;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_set_defaults(struct vmcs *vmcs,
+ u_long host_rip, u_long host_rsp, u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
+{
+ int error, codesel, datasel, tsssel;
+ u_long cr0, cr4, efer;
+ uint64_t eptp, pat, fsbase, idtrbase;
+ uint32_t exc_bitmap;
+
+ codesel = vmm_get_host_codesel();
+ datasel = vmm_get_host_datasel();
+ tsssel = vmm_get_host_tsssel();
+
+ /*
+ * Make sure we have a "current" VMCS to work with.
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * Load the VMX controls
+ */
+ if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0)
+ goto done;
+
+ /* Guest state */
+
+ /* Initialize guest IA32_PAT MSR with the default value */
+ pat = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+ if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Host state */
+
+ /* Initialize host IA32_PAT MSR */
+ pat = vmm_get_host_pat();
+ if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Load the IA32_EFER MSR */
+ efer = vmm_get_host_efer();
+ if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0)
+ goto done;
+
+ /* Load the control registers */
+
+ cr0 = vmm_get_host_cr0();
+ if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0)
+ goto done;
+
+ cr4 = vmm_get_host_cr4() | CR4_VMXE;
+ if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0)
+ goto done;
+
+ /* Load the segment selectors */
+ if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0)
+ goto done;
+
+ /*
+ * Load the Base-Address for %fs and idtr.
+ *
+ * Note that we exclude %gs, tss and gdtr here because their base
+ * address is pcpu specific.
+ */
+ fsbase = vmm_get_host_fsbase();
+ if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0)
+ goto done;
+
+ idtrbase = vmm_get_host_idtrbase();
+ if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0)
+ goto done;
+
+ /* instruction pointer */
+ if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0)
+ goto done;
+
+ /* stack pointer */
+ if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0)
+ goto done;
+
+ /* eptp */
+ eptp = EPTP(ept_pml4);
+ if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
+ goto done;
+
+ /* vpid */
+ if ((error = vmwrite(VMCS_VPID, vpid)) != 0)
+ goto done;
+
+ /* msr bitmap */
+ if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0)
+ goto done;
+
+ /* exception bitmap */
+ exc_bitmap = 1 << IDT_MC;
+ if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0)
+ goto done;
+
+ /* link pointer */
+ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0)
+ goto done;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+uint64_t
+vmcs_read(uint32_t encoding)
+{
+ int error;
+ uint64_t val;
+
+ error = vmread(encoding, &val);
+ if (error != 0)
+ panic("vmcs_read(%u) error %d", encoding, error);
+
+ return (val);
+}
+
+#ifdef DDB
+extern int vmxon_enabled[];
+
+DB_SHOW_COMMAND(vmcs, db_show_vmcs)
+{
+ uint64_t cur_vmcs, val;
+ uint32_t exit;
+
+ if (!vmxon_enabled[curcpu]) {
+ db_printf("VMX not enabled\n");
+ return;
+ }
+
+ if (have_addr) {
+ db_printf("Only current VMCS supported\n");
+ return;
+ }
+
+ vmptrst(&cur_vmcs);
+ if (cur_vmcs == VMCS_INITIAL) {
+ db_printf("No current VM context\n");
+ return;
+ }
+ db_printf("VMCS: %jx\n", cur_vmcs);
+ db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID));
+ db_printf("Activity: ");
+ val = vmcs_read(VMCS_GUEST_ACTIVITY);
+ switch (val) {
+ case 0:
+ db_printf("Active");
+ break;
+ case 1:
+ db_printf("HLT");
+ break;
+ case 2:
+ db_printf("Shutdown");
+ break;
+ case 3:
+ db_printf("Wait for SIPI");
+ break;
+ default:
+ db_printf("Unknown: %#lx", val);
+ }
+ db_printf("\n");
+ exit = vmcs_read(VMCS_EXIT_REASON);
+ if (exit & 0x80000000)
+ db_printf("Entry Failure Reason: %u\n", exit & 0xffff);
+ else
+ db_printf("Exit Reason: %u\n", exit & 0xffff);
+ db_printf("Qualification: %#lx\n", vmcs_exit_qualification());
+ db_printf("Guest Linear Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_LINEAR_ADDRESS));
+ switch (exit & 0x8000ffff) {
+ case EXIT_REASON_EXCEPTION:
+ case EXIT_REASON_EXT_INTR:
+ val = vmcs_read(VMCS_EXIT_INTERRUPTION_INFO);
+ db_printf("Interrupt Type: ");
+ switch (val >> 8 & 0x7) {
+ case 0:
+ db_printf("external");
+ break;
+ case 2:
+ db_printf("NMI");
+ break;
+ case 3:
+ db_printf("HW exception");
+ break;
+ case 4:
+ db_printf("SW exception");
+ break;
+ default:
+ db_printf("?? %lu", val >> 8 & 0x7);
+ break;
+ }
+ db_printf(" Vector: %lu", val & 0xff);
+ if (val & 0x800)
+ db_printf(" Error Code: %lx",
+ vmcs_read(VMCS_EXIT_INTERRUPTION_ERROR));
+ db_printf("\n");
+ break;
+ case EXIT_REASON_EPT_FAULT:
+ case EXIT_REASON_EPT_MISCONFIG:
+ db_printf("Guest Physical Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS));
+ break;
+ }
+ db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error());
+}
+#endif
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
new file mode 100644
index 0000000..f39eed2
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -0,0 +1,338 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMCS_H_
+#define _VMCS_H_
+
+#ifdef _KERNEL
+struct vmcs {
+ uint32_t identifier;
+ uint32_t abort_code;
+ char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2];
+};
+CTASSERT(sizeof(struct vmcs) == PAGE_SIZE);
+
+/* MSR save region is composed of an array of 'struct msr_entry' */
+struct msr_entry {
+ uint32_t index;
+ uint32_t reserved;
+ uint64_t val;
+};
+
+int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
+int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
+ u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap,
+ uint16_t vpid);
+int vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval);
+int vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val);
+int vmcs_getdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+int vmcs_setdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+uint64_t vmcs_read(uint32_t encoding);
+
+#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH)
+#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP)
+#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR)
+#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff)
+#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
+#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
+#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
+#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
+
+#endif /* _KERNEL */
+
+#define VMCS_INITIAL 0xffffffffffffffff
+
+#define VMCS_IDENT(encoding) ((encoding) | 0x80000000)
+/*
+ * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B.
+ */
+#define VMCS_INVALID_ENCODING 0xffffffff
+
+/* 16-bit control fields */
+#define VMCS_VPID 0x00000000
+
+/* 16-bit guest-state fields */
+#define VMCS_GUEST_ES_SELECTOR 0x00000800
+#define VMCS_GUEST_CS_SELECTOR 0x00000802
+#define VMCS_GUEST_SS_SELECTOR 0x00000804
+#define VMCS_GUEST_DS_SELECTOR 0x00000806
+#define VMCS_GUEST_FS_SELECTOR 0x00000808
+#define VMCS_GUEST_GS_SELECTOR 0x0000080A
+#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C
+#define VMCS_GUEST_TR_SELECTOR 0x0000080E
+
+/* 16-bit host-state fields */
+#define VMCS_HOST_ES_SELECTOR 0x00000C00
+#define VMCS_HOST_CS_SELECTOR 0x00000C02
+#define VMCS_HOST_SS_SELECTOR 0x00000C04
+#define VMCS_HOST_DS_SELECTOR 0x00000C06
+#define VMCS_HOST_FS_SELECTOR 0x00000C08
+#define VMCS_HOST_GS_SELECTOR 0x00000C0A
+#define VMCS_HOST_TR_SELECTOR 0x00000C0C
+
+/* 64-bit control fields */
+#define VMCS_IO_BITMAP_A 0x00002000
+#define VMCS_IO_BITMAP_B 0x00002002
+#define VMCS_MSR_BITMAP 0x00002004
+#define VMCS_EXIT_MSR_STORE 0x00002006
+#define VMCS_EXIT_MSR_LOAD 0x00002008
+#define VMCS_ENTRY_MSR_LOAD 0x0000200A
+#define VMCS_EXECUTIVE_VMCS 0x0000200C
+#define VMCS_TSC_OFFSET 0x00002010
+#define VMCS_VIRTUAL_APIC 0x00002012
+#define VMCS_APIC_ACCESS 0x00002014
+#define VMCS_EPTP 0x0000201A
+
+/* 64-bit read-only fields */
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
+
+/* 64-bit guest-state fields */
+#define VMCS_LINK_POINTER 0x00002800
+#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802
+#define VMCS_GUEST_IA32_PAT 0x00002804
+#define VMCS_GUEST_IA32_EFER 0x00002806
+#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808
+#define VMCS_GUEST_PDPTE0 0x0000280A
+#define VMCS_GUEST_PDPTE1 0x0000280C
+#define VMCS_GUEST_PDPTE2 0x0000280E
+#define VMCS_GUEST_PDPTE3 0x00002810
+
+/* 64-bit host-state fields */
+#define VMCS_HOST_IA32_PAT 0x00002C00
+#define VMCS_HOST_IA32_EFER 0x00002C02
+#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04
+
+/* 32-bit control fields */
+#define VMCS_PIN_BASED_CTLS 0x00004000
+#define VMCS_PRI_PROC_BASED_CTLS 0x00004002
+#define VMCS_EXCEPTION_BITMAP 0x00004004
+#define VMCS_PF_ERROR_MASK 0x00004006
+#define VMCS_PF_ERROR_MATCH 0x00004008
+#define VMCS_CR3_TARGET_COUNT 0x0000400A
+#define VMCS_EXIT_CTLS 0x0000400C
+#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E
+#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010
+#define VMCS_ENTRY_CTLS 0x00004012
+#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014
+#define VMCS_ENTRY_INTR_INFO 0x00004016
+#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018
+#define VMCS_ENTRY_INST_LENGTH 0x0000401A
+#define VMCS_TPR_THRESHOLD 0x0000401C
+#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E
+#define VMCS_PLE_GAP 0x00004020
+#define VMCS_PLE_WINDOW 0x00004022
+
+/* 32-bit read-only data fields */
+#define VMCS_INSTRUCTION_ERROR 0x00004400
+#define VMCS_EXIT_REASON 0x00004402
+#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404
+#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406
+#define VMCS_IDT_VECTORING_INFO 0x00004408
+#define VMCS_IDT_VECTORING_ERROR 0x0000440A
+#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C
+#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E
+
+/* 32-bit guest-state fields */
+#define VMCS_GUEST_ES_LIMIT 0x00004800
+#define VMCS_GUEST_CS_LIMIT 0x00004802
+#define VMCS_GUEST_SS_LIMIT 0x00004804
+#define VMCS_GUEST_DS_LIMIT 0x00004806
+#define VMCS_GUEST_FS_LIMIT 0x00004808
+#define VMCS_GUEST_GS_LIMIT 0x0000480A
+#define VMCS_GUEST_LDTR_LIMIT 0x0000480C
+#define VMCS_GUEST_TR_LIMIT 0x0000480E
+#define VMCS_GUEST_GDTR_LIMIT 0x00004810
+#define VMCS_GUEST_IDTR_LIMIT 0x00004812
+#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814
+#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816
+#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818
+#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A
+#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C
+#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E
+#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820
+#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822
+#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824
+#define VMCS_GUEST_ACTIVITY 0x00004826
+#define VMCS_GUEST_SMBASE 0x00004828
+#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A
+#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E
+
+/* 32-bit host state fields */
+#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00
+
+/* Natural Width control fields */
+#define VMCS_CR0_MASK 0x00006000
+#define VMCS_CR4_MASK 0x00006002
+#define VMCS_CR0_SHADOW 0x00006004
+#define VMCS_CR4_SHADOW 0x00006006
+#define VMCS_CR3_TARGET0 0x00006008
+#define VMCS_CR3_TARGET1 0x0000600A
+#define VMCS_CR3_TARGET2 0x0000600C
+#define VMCS_CR3_TARGET3 0x0000600E
+
+/* Natural Width read-only fields */
+#define VMCS_EXIT_QUALIFICATION 0x00006400
+#define VMCS_IO_RCX 0x00006402
+#define VMCS_IO_RSI 0x00006404
+#define VMCS_IO_RDI 0x00006406
+#define VMCS_IO_RIP 0x00006408
+#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A
+
+/* Natural Width guest-state fields */
+#define VMCS_GUEST_CR0 0x00006800
+#define VMCS_GUEST_CR3 0x00006802
+#define VMCS_GUEST_CR4 0x00006804
+#define VMCS_GUEST_ES_BASE 0x00006806
+#define VMCS_GUEST_CS_BASE 0x00006808
+#define VMCS_GUEST_SS_BASE 0x0000680A
+#define VMCS_GUEST_DS_BASE 0x0000680C
+#define VMCS_GUEST_FS_BASE 0x0000680E
+#define VMCS_GUEST_GS_BASE 0x00006810
+#define VMCS_GUEST_LDTR_BASE 0x00006812
+#define VMCS_GUEST_TR_BASE 0x00006814
+#define VMCS_GUEST_GDTR_BASE 0x00006816
+#define VMCS_GUEST_IDTR_BASE 0x00006818
+#define VMCS_GUEST_DR7 0x0000681A
+#define VMCS_GUEST_RSP 0x0000681C
+#define VMCS_GUEST_RIP 0x0000681E
+#define VMCS_GUEST_RFLAGS 0x00006820
+#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822
+#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824
+#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826
+
+/* Natural Width host-state fields */
+#define VMCS_HOST_CR0 0x00006C00
+#define VMCS_HOST_CR3 0x00006C02
+#define VMCS_HOST_CR4 0x00006C04
+#define VMCS_HOST_FS_BASE 0x00006C06
+#define VMCS_HOST_GS_BASE 0x00006C08
+#define VMCS_HOST_TR_BASE 0x00006C0A
+#define VMCS_HOST_GDTR_BASE 0x00006C0C
+#define VMCS_HOST_IDTR_BASE 0x00006C0E
+#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10
+#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12
+#define VMCS_HOST_RSP 0x00006C14
+#define VMCS_HOST_RIP 0x00006c16
+
+/*
+ * VM instruction error numbers
+ */
+#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5
+
+/*
+ * VMCS exit reasons
+ */
+#define EXIT_REASON_EXCEPTION 0
+#define EXIT_REASON_EXT_INTR 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+#define EXIT_REASON_INIT 3
+#define EXIT_REASON_SIPI 4
+#define EXIT_REASON_IO_SMI 5
+#define EXIT_REASON_SMI 6
+#define EXIT_REASON_INTR_WINDOW 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_GETSEC 11
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_RSM 17
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMXOFF 26
+#define EXIT_REASON_VMXON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_INOUT 30
+#define EXIT_REASON_RDMSR 31
+#define EXIT_REASON_WRMSR 32
+#define EXIT_REASON_INVAL_VMCS 33
+#define EXIT_REASON_INVAL_MSR 34
+#define EXIT_REASON_MWAIT 36
+#define EXIT_REASON_MTF 37
+#define EXIT_REASON_MONITOR 39
+#define EXIT_REASON_PAUSE 40
+#define EXIT_REASON_MCE 41
+#define EXIT_REASON_TPR 43
+#define EXIT_REASON_APIC 44
+#define EXIT_REASON_GDTR_IDTR 46
+#define EXIT_REASON_LDTR_TR 47
+#define EXIT_REASON_EPT_FAULT 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_VMX_PREEMPT 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+
+/*
+ * VMCS interrupt information fields
+ */
+#define VMCS_INTERRUPTION_INFO_VALID (1U << 31)
+#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
+#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
+
+/*
+ * VMCS Guest interruptibility field
+ */
+#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0)
+#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1)
+#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2)
+#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3)
+
+/*
+ * Exit qualification for EXIT_REASON_INVAL_VMCS
+ */
+#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3
+
+/*
+ * Exit qualification for EPT violation
+ */
+#define EPT_VIOLATION_DATA_READ (1UL << 0)
+#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
+#define EPT_VIOLATION_INST_FETCH (1UL << 2)
+#define EPT_VIOLATION_GLA_VALID (1UL << 7)
+#define EPT_VIOLATION_XLAT_VALID (1UL << 8)
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
new file mode 100644
index 0000000..4f267bb
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -0,0 +1,1845 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/psl.h>
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/pmap.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+#include <machine/vmparam.h>
+
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmm_lapic.h"
+#include "vmm_msr.h"
+#include "vmm_ktr.h"
+#include "vmm_stat.h"
+
+#include "vmx_msr.h"
+#include "ept.h"
+#include "vmx_cpufunc.h"
+#include "vmx.h"
+#include "x86.h"
+#include "vmx_controls.h"
+
+#define PINBASED_CTLS_ONE_SETTING \
+ (PINBASED_EXTINT_EXITING | \
+ PINBASED_NMI_EXITING | \
+ PINBASED_VIRTUAL_NMI)
+#define PINBASED_CTLS_ZERO_SETTING 0
+
+#define PROCBASED_CTLS_WINDOW_SETTING \
+ (PROCBASED_INT_WINDOW_EXITING | \
+ PROCBASED_NMI_WINDOW_EXITING)
+
+#define PROCBASED_CTLS_ONE_SETTING \
+ (PROCBASED_SECONDARY_CONTROLS | \
+ PROCBASED_IO_EXITING | \
+ PROCBASED_MSR_BITMAPS | \
+ PROCBASED_CTLS_WINDOW_SETTING)
+#define PROCBASED_CTLS_ZERO_SETTING \
+ (PROCBASED_CR3_LOAD_EXITING | \
+ PROCBASED_CR3_STORE_EXITING | \
+ PROCBASED_IO_BITMAPS)
+
+#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT
+#define PROCBASED_CTLS2_ZERO_SETTING 0
+
+#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \
+ (VM_EXIT_HOST_LMA | \
+ VM_EXIT_SAVE_EFER | \
+ VM_EXIT_LOAD_EFER)
+
+#define VM_EXIT_CTLS_ONE_SETTING \
+ (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \
+ VM_EXIT_SAVE_PAT | \
+ VM_EXIT_LOAD_PAT)
+#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS
+
+#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER
+
+#define VM_ENTRY_CTLS_ONE_SETTING \
+ (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \
+ VM_ENTRY_LOAD_PAT)
+#define VM_ENTRY_CTLS_ZERO_SETTING \
+ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_INTO_SMM | \
+ VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
+
+#define guest_msr_rw(vmx, msr) \
+ msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
+
+#define HANDLED 1
+#define UNHANDLED 0
+
+MALLOC_DEFINE(M_VMX, "vmx", "vmx");
+
+int vmxon_enabled[MAXCPU];
+static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
+
+static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
+static uint32_t exit_ctls, entry_ctls;
+
+static uint64_t cr0_ones_mask, cr0_zeros_mask;
+static uint64_t cr4_ones_mask, cr4_zeros_mask;
+
+static volatile u_int nextvpid;
+
+static int vmx_no_patmsr;
+
+/*
+ * Virtual NMI blocking conditions.
+ *
+ * Some processor implementations also require NMI to be blocked if
+ * the STI_BLOCKING bit is set. It is possible to detect this at runtime
+ * based on the (exit_reason,exit_qual) tuple being set to
+ * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
+ *
+ * We take the easy way out and also include STI_BLOCKING as one of the
+ * gating items for vNMI injection.
+ */
+static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
+ VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_STI_BLOCKING;
+
+/*
+ * Optional capabilities
+ */
+static int cap_halt_exit;
+static int cap_pause_exit;
+static int cap_unrestricted_guest;
+static int cap_monitor_trap;
+
+/* statistics */
+static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus");
+static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt");
+static VMM_STAT_DEFINE(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");
+static VMM_STAT_DEFINE(VMEXIT_HLT, "number of times hlt was intercepted");
+
+#ifdef KTR
+static const char *
+exit_reason_to_str(int reason)
+{
+ static char reasonbuf[32];
+
+ switch (reason) {
+ case EXIT_REASON_EXCEPTION:
+ return "exception";
+ case EXIT_REASON_EXT_INTR:
+ return "extint";
+ case EXIT_REASON_TRIPLE_FAULT:
+ return "triplefault";
+ case EXIT_REASON_INIT:
+ return "init";
+ case EXIT_REASON_SIPI:
+ return "sipi";
+ case EXIT_REASON_IO_SMI:
+ return "iosmi";
+ case EXIT_REASON_SMI:
+ return "smi";
+ case EXIT_REASON_INTR_WINDOW:
+ return "intrwindow";
+ case EXIT_REASON_NMI_WINDOW:
+ return "nmiwindow";
+ case EXIT_REASON_TASK_SWITCH:
+ return "taskswitch";
+ case EXIT_REASON_CPUID:
+ return "cpuid";
+ case EXIT_REASON_GETSEC:
+ return "getsec";
+ case EXIT_REASON_HLT:
+ return "hlt";
+ case EXIT_REASON_INVD:
+ return "invd";
+ case EXIT_REASON_INVLPG:
+ return "invlpg";
+ case EXIT_REASON_RDPMC:
+ return "rdpmc";
+ case EXIT_REASON_RDTSC:
+ return "rdtsc";
+ case EXIT_REASON_RSM:
+ return "rsm";
+ case EXIT_REASON_VMCALL:
+ return "vmcall";
+ case EXIT_REASON_VMCLEAR:
+ return "vmclear";
+ case EXIT_REASON_VMLAUNCH:
+ return "vmlaunch";
+ case EXIT_REASON_VMPTRLD:
+ return "vmptrld";
+ case EXIT_REASON_VMPTRST:
+ return "vmptrst";
+ case EXIT_REASON_VMREAD:
+ return "vmread";
+ case EXIT_REASON_VMRESUME:
+ return "vmresume";
+ case EXIT_REASON_VMWRITE:
+ return "vmwrite";
+ case EXIT_REASON_VMXOFF:
+ return "vmxoff";
+ case EXIT_REASON_VMXON:
+ return "vmxon";
+ case EXIT_REASON_CR_ACCESS:
+ return "craccess";
+ case EXIT_REASON_DR_ACCESS:
+ return "draccess";
+ case EXIT_REASON_INOUT:
+ return "inout";
+ case EXIT_REASON_RDMSR:
+ return "rdmsr";
+ case EXIT_REASON_WRMSR:
+ return "wrmsr";
+ case EXIT_REASON_INVAL_VMCS:
+ return "invalvmcs";
+ case EXIT_REASON_INVAL_MSR:
+ return "invalmsr";
+ case EXIT_REASON_MWAIT:
+ return "mwait";
+ case EXIT_REASON_MTF:
+ return "mtf";
+ case EXIT_REASON_MONITOR:
+ return "monitor";
+ case EXIT_REASON_PAUSE:
+ return "pause";
+ case EXIT_REASON_MCE:
+ return "mce";
+ case EXIT_REASON_TPR:
+ return "tpr";
+ case EXIT_REASON_APIC:
+ return "apic";
+ case EXIT_REASON_GDTR_IDTR:
+ return "gdtridtr";
+ case EXIT_REASON_LDTR_TR:
+ return "ldtrtr";
+ case EXIT_REASON_EPT_FAULT:
+ return "eptfault";
+ case EXIT_REASON_EPT_MISCONFIG:
+ return "eptmisconfig";
+ case EXIT_REASON_INVEPT:
+ return "invept";
+ case EXIT_REASON_RDTSCP:
+ return "rdtscp";
+ case EXIT_REASON_VMX_PREEMPT:
+ return "vmxpreempt";
+ case EXIT_REASON_INVVPID:
+ return "invvpid";
+ case EXIT_REASON_WBINVD:
+ return "wbinvd";
+ case EXIT_REASON_XSETBV:
+ return "xsetbv";
+ default:
+ snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
+ return (reasonbuf);
+ }
+}
+
+#ifdef SETJMP_TRACE
+static const char *
+vmx_setjmp_rc2str(int rc)
+{
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ return "direct";
+ case VMX_RETURN_LONGJMP:
+ return "longjmp";
+ case VMX_RETURN_VMRESUME:
+ return "vmresume";
+ case VMX_RETURN_VMLAUNCH:
+ return "vmlaunch";
+ case VMX_RETURN_AST:
+ return "ast";
+ default:
+ return "unknown";
+ }
+}
+
+#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \
+ VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
+ (vmxctx)->regname)
+
+static void
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ uint64_t host_rip, host_rsp;
+
+ if (vmxctx != &vmx->ctx[vcpu])
+ panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
+ vmxctx, &vmx->ctx[vcpu]);
+
+ VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
+ VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
+ vmx_setjmp_rc2str(rc), rc);
+
+ host_rsp = host_rip = ~0;
+ vmread(VMCS_HOST_RIP, &host_rip);
+ vmread(VMCS_HOST_RSP, &host_rsp);
+ VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
+ host_rip, host_rsp);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
+}
+#endif
+#else
+static void __inline
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ return;
+}
+#endif /* KTR */
+
+u_long
+vmx_fix_cr0(u_long cr0)
+{
+
+ return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
+}
+
+u_long
+vmx_fix_cr4(u_long cr4)
+{
+
+ return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
+}
+
+static void
+msr_save_area_init(struct msr_entry *g_area, int *g_count)
+{
+ int cnt;
+
+ static struct msr_entry guest_msrs[] = {
+ { MSR_KGSBASE, 0, 0 },
+ };
+
+ cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
+ if (cnt > GUEST_MSR_MAX_ENTRIES)
+ panic("guest msr save area overrun");
+ bcopy(guest_msrs, g_area, sizeof(guest_msrs));
+ *g_count = cnt;
+}
+
+static void
+vmx_disable(void *arg __unused)
+{
+ struct invvpid_desc invvpid_desc = { 0 };
+ struct invept_desc invept_desc = { 0 };
+
+ if (vmxon_enabled[curcpu]) {
+ /*
+ * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
+ *
+ * VMXON or VMXOFF are not required to invalidate any TLB
+ * caching structures, so invalidate all EPT and VPID mappings
+ * explicitly. This prevents potential retention of cached
+ * information in the TLB between distinct VMX episodes.
+ */
+ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
+ invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
+ vmxoff();
+ }
+ load_cr4(rcr4() & ~CR4_VMXE);
+}
+
+static int
+vmx_cleanup(void)
+{
+
+ smp_rendezvous(NULL, vmx_disable, NULL, NULL);
+
+ return (0);
+}
+
+static void
+vmx_enable(void *arg __unused)
+{
+ int error;
+
+ load_cr4(rcr4() | CR4_VMXE);
+
+ *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
+ error = vmxon(vmxon_region[curcpu]);
+ if (error == 0)
+ vmxon_enabled[curcpu] = 1;
+}
+
+static int
+vmx_init(void)
+{
+ int error;
+ uint64_t fixed0, fixed1, feature_control;
+ uint32_t tmp;
+
+ /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
+ if (!(cpu_feature2 & CPUID2_VMX)) {
+ printf("vmx_init: processor does not support VMX operation\n");
+ return (ENXIO);
+ }
+
+ /*
+ * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
+ * are set (bits 0 and 2 respectively).
+ */
+ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if ((feature_control & 0x5) != 0x5) {
+ printf("vmx_init: VMX operation disabled by BIOS\n");
+ return (ENXIO);
+ }
+
+ /* Check support for primary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_CTLS_ONE_SETTING,
+ PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired primary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Clear the processor-based ctl bits that are set on demand */
+ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
+
+ /* Check support for secondary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED_CTLS2_ONE_SETTING,
+ PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
+ if (error) {
+ printf("vmx_init: processor does not support desired secondary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VPID */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_ENABLE_VPID, 0, &tmp);
+ if (error == 0)
+ procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
+
+ /* Check support for pin-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
+ MSR_VMX_TRUE_PINBASED_CTLS,
+ PINBASED_CTLS_ONE_SETTING,
+ PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "pin-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VM-exit controls */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ /* Try again without the PAT MSR bits */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
+ MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "exit controls\n");
+ return (error);
+ } else {
+ if (bootverbose)
+ printf("vmm: PAT MSR access not supported\n");
+ guest_msr_valid(MSR_PAT);
+ vmx_no_patmsr = 1;
+ }
+ }
+
+ /* Check support for VM-entry controls */
+ if (!vmx_no_patmsr) {
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
+ MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING,
+ VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ } else {
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
+ MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
+ VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ }
+
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "entry controls\n");
+ return (error);
+ }
+
+ /*
+ * Check support for optional features by testing them
+ * as individual bits
+ */
+ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_HLT_EXITING, 0,
+ &tmp) == 0);
+
+ cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_PROCBASED_CTLS,
+ PROCBASED_MTF, 0,
+ &tmp) == 0);
+
+ cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_PAUSE_EXITING, 0,
+ &tmp) == 0);
+
+ cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_UNRESTRICTED_GUEST, 0,
+ &tmp) == 0);
+
+ /* Initialize EPT */
+ error = ept_init();
+ if (error) {
+ printf("vmx_init: ept initialization failed (%d)\n", error);
+ return (error);
+ }
+
+ /*
+ * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
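+ *
+ * A bit that is 1 in both the FIXED0 and FIXED1 MSRs must be 1 in
+ * the control register, a bit that is 0 in both must be 0, and
+ * bits that differ between the two are left under guest control.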
+ */
+ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
+ cr0_ones_mask = fixed0 & fixed1;
+ cr0_zeros_mask = ~fixed0 & ~fixed1;
+
+ /*
+ * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
+ * if unrestricted guest execution is allowed.
+ */
+ if (cap_unrestricted_guest)
+ cr0_ones_mask &= ~(CR0_PG | CR0_PE);
+
+ /*
+ * Do not allow the guest to set CR0_NW or CR0_CD.
+ */
+ cr0_zeros_mask |= (CR0_NW | CR0_CD);
+
+ fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
+ cr4_ones_mask = fixed0 & fixed1;
+ cr4_zeros_mask = ~fixed0 & ~fixed1;
+
+ /* enable VMX operation */
+ smp_rendezvous(NULL, vmx_enable, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * If this processor does not support VPIDs then simply return 0.
+ *
+ * Otherwise generate the next value of VPID to use. Any value is alright
+ * as long as it is non-zero.
+ *
+ * We always execute in VMX non-root context with EPT enabled. Thus all
+ * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This
+ * in turn means that multiple VMs can share the same VPID as long as
+ * they have distinct EPT page tables.
+ *
+ * XXX
+ * We should optimize this so that it returns VPIDs that are not in
+ * use. Then we will not unnecessarily invalidate mappings in
+ * vmx_set_pcpu_defaults() just because two or more vcpus happen to
+ * use the same 'vpid'.
+ */
+static uint16_t
+vmx_vpid(void)
+{
+ uint16_t vpid = 0;
+
+ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) {
+ do {
+ vpid = atomic_fetchadd_int(&nextvpid, 1);
+ } while (vpid == 0);
+ }
+
+ return (vpid);
+}
+
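+/*
+ * %cr0 and %cr4 use a guest/host mask and read shadow: guest reads of
+ * a masked bit return the shadow bit, and guest writes that differ
+ * from the shadow in a masked bit cause a VM exit. The mask is
+ * programmed with every fixed bit and the shadow reports the
+ * must-be-one bits, so the guest reads the values it expects.
+ */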
+static int
+vmx_setup_cr_shadow(int which, struct vmcs *vmcs)
+{
+ int error, mask_ident, shadow_ident;
+ uint64_t mask_value, shadow_value;
+
+ if (which != 0 && which != 4)
+ panic("vmx_setup_cr_shadow: unknown cr%d", which);
+
+ if (which == 0) {
+ mask_ident = VMCS_CR0_MASK;
+ mask_value = cr0_ones_mask | cr0_zeros_mask;
+ shadow_ident = VMCS_CR0_SHADOW;
+ shadow_value = cr0_ones_mask;
+ } else {
+ mask_ident = VMCS_CR4_MASK;
+ mask_value = cr4_ones_mask | cr4_zeros_mask;
+ shadow_ident = VMCS_CR4_SHADOW;
+ shadow_value = cr4_ones_mask;
+ }
+
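+ /*
+ * Guest reads of the masked cr bits return the value from the
+ * read shadow, and guest writes that would modify them cause a
+ * VM exit. Seed the shadow with the bits forced to 1.
+ */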
+ error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value);
+ if (error)
+ return (error);
+
+ error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value);
+ if (error)
+ return (error);
+
+ return (0);
+}
+#define vmx_setup_cr0_shadow(vmcs) vmx_setup_cr_shadow(0, (vmcs))
+#define vmx_setup_cr4_shadow(vmcs) vmx_setup_cr_shadow(4, (vmcs))
+
+static void *
+vmx_vminit(struct vm *vm)
+{
+ uint16_t vpid;
+ int i, error, guest_msr_count;
+ struct vmx *vmx;
+
+ vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
+ if ((uintptr_t)vmx & PAGE_MASK) {
+ panic("malloc of struct vmx not aligned on %d byte boundary",
+ PAGE_SIZE);
+ }
+ vmx->vm = vm;
+
+ /*
+ * Clean up EPTP-tagged guest physical and combined mappings
+ *
+ * VMX transitions are not required to invalidate any guest physical
+ * mappings. So, it may be possible for stale guest physical mappings
+ * to be present in the processor TLBs.
+ *
+ * Combined mappings for this EP4TA are also invalidated for all VPIDs.
+ */
+ ept_invalidate_mappings(vtophys(vmx->pml4ept));
+
+ msr_bitmap_initialize(vmx->msr_bitmap);
+
+ /*
+ * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
+ * The guest FSBASE and GSBASE are saved and restored during
+ * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
+ * always restored from the vmcs host state area on vm-exit.
+ *
+ * Guest KGSBASE is saved and restored in the guest MSR save area.
+ * Host KGSBASE is restored before returning to userland from the pcb.
+ * There will be a window of time when we are executing in the host
+ * kernel context with a value of KGSBASE from the guest. This is ok
+ * because the value of KGSBASE is inconsequential in kernel context.
+ *
+ * MSR_EFER is saved and restored in the guest VMCS area on a
+ * VM exit and entry respectively. It is also restored from the
+ * host VMCS area on a VM exit.
+ */
+ if (guest_msr_rw(vmx, MSR_GSBASE) ||
+ guest_msr_rw(vmx, MSR_FSBASE) ||
+ guest_msr_rw(vmx, MSR_KGSBASE) ||
+ guest_msr_rw(vmx, MSR_EFER))
+ panic("vmx_vminit: error setting guest msr access");
+
+ /*
+ * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
+ * and entry respectively. It is also restored from the host VMCS
+ * area on a VM exit. However, if running on a system with no
+ * MSR_PAT save/restore support, leave access disabled so accesses
+ * will be trapped.
+ */
+ if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
+ panic("vmx_vminit: error setting guest pat msr access");
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vmx->vmcs[i].identifier = vmx_revision();
+ error = vmclear(&vmx->vmcs[i]);
+ if (error != 0) {
+ panic("vmx_vminit: vmclear error %d on vcpu %d\n",
+ error, i);
+ }
+
+ vpid = vmx_vpid();
+
+ error = vmcs_set_defaults(&vmx->vmcs[i],
+ (u_long)vmx_longjmp,
+ (u_long)&vmx->ctx[i],
+ vtophys(vmx->pml4ept),
+ pinbased_ctls,
+ procbased_ctls,
+ procbased_ctls2,
+ exit_ctls, entry_ctls,
+ vtophys(vmx->msr_bitmap),
+ vpid);
+
+ if (error != 0)
+ panic("vmx_vminit: vmcs_set_defaults error %d", error);
+
+ vmx->cap[i].set = 0;
+ vmx->cap[i].proc_ctls = procbased_ctls;
+
+ vmx->state[i].lastcpu = -1;
+ vmx->state[i].vpid = vpid;
+
+ msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
+
+ error = vmcs_set_msr_save(&vmx->vmcs[i],
+ vtophys(vmx->guest_msrs[i]),
+ guest_msr_count);
+ if (error != 0)
+ panic("vmcs_set_msr_save error %d", error);
+
+ error = vmx_setup_cr0_shadow(&vmx->vmcs[i]);
+ if (error != 0)
+ panic("vmx_setup_cr0_shadow %d", error);
+
+ error = vmx_setup_cr4_shadow(&vmx->vmcs[i]);
+ if (error != 0)
+ panic("vmx_setup_cr4_shadow %d", error);
+ }
+
+ return (vmx);
+}
+
+static int
+vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
+{
+ int handled, func;
+
+ func = vmxctx->guest_rax;
+
+ handled = x86_emulate_cpuid(vm, vcpu,
+ (uint32_t*)(&vmxctx->guest_rax),
+ (uint32_t*)(&vmxctx->guest_rbx),
+ (uint32_t*)(&vmxctx->guest_rcx),
+ (uint32_t*)(&vmxctx->guest_rdx));
+ return (handled);
+}
+
+static __inline void
+vmx_run_trace(struct vmx *vmx, int vcpu)
+{
+#ifdef KTR
+ VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
+#endif
+}
+
+static __inline void
+vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
+ int handled)
+{
+#ifdef KTR
+ VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
+ handled ? "handled" : "unhandled",
+ exit_reason_to_str(exit_reason), rip);
+#endif
+}
+
+static __inline void
+vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
+{
+#ifdef KTR
+ VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
+#endif
+}
+
+static int
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
+{
+ int error, lastcpu;
+ struct vmxstate *vmxstate;
+ struct invvpid_desc invvpid_desc = { 0 };
+
+ vmxstate = &vmx->state[vcpu];
+ lastcpu = vmxstate->lastcpu;
+ vmxstate->lastcpu = curcpu;
+
+ if (lastcpu == curcpu) {
+ error = 0;
+ goto done;
+ }
+
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+
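+ /*
+ * The host TR, GDTR and GS bases differ on each physical cpu,
+ * so refresh them in the VMCS host-state area after a migration.
+ */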
+ error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
+ if (error != 0)
+ goto done;
+
+ error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
+ if (error != 0)
+ goto done;
+
+ error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+ if (error != 0)
+ goto done;
+
+ /*
+ * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
+ *
+ * We do this because this vcpu was executing on a different host
+ * cpu when it last ran. We do not track whether it invalidated
+ * mappings associated with its 'vpid' during that run. So we must
+ * assume that the mappings associated with 'vpid' on 'curcpu' are
+ * stale and invalidate them.
+ *
+ * Note that we incur this penalty only when the scheduler chooses to
+ * move the thread associated with this vcpu between host cpus.
+ *
+ * Note also that this will invalidate mappings tagged with 'vpid'
+ * for "all" EP4TAs.
+ */
+ if (vmxstate->vpid != 0) {
+ invvpid_desc.vpid = vmxstate->vpid;
+ invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+ }
+done:
+ return (error);
+}
+
+static void
+vm_exit_update_rip(struct vm_exit *vmexit)
+{
+ int error;
+
+ error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
+ if (error)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+}
+
+/*
+ * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
+ */
+CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
+
+static void __inline
+vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_set_int_window_exiting: vmwrite error %d", error);
+}
+
+static void __inline
+vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
+}
+
+static void __inline
+vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
+}
+
+static void __inline
+vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
+}
+
+static int
+vmx_inject_nmi(struct vmx *vmx, int vcpu)
+{
+ int error;
+ uint64_t info, interruptibility;
+
+ /* Bail out if no NMI requested */
+ if (!vm_nmi_pending(vmx->vm, vcpu))
+ return (0);
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_nmi: vmread(interruptibility) %d",
+ error);
+ }
+ if (interruptibility & nmi_blocking_bits)
+ goto nmiblocked;
+
+ /*
+ * Inject the virtual NMI. The vector must be the NMI IDT entry
+ * or the VMCS entry check will fail.
+ */
+ info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
+ info |= IDT_NMI;
+
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);
+
+ VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");
+
+ /* Clear the request */
+ vm_nmi_clear(vmx->vm, vcpu);
+ return (1);
+
+nmiblocked:
+ /*
+ * Set the NMI Window Exiting execution control so we can inject
+ * the virtual NMI as soon as the blocking condition goes away.
+ */
+ vmx_set_nmi_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
+ return (1);
+}
+
+static void
+vmx_inject_interrupts(struct vmx *vmx, int vcpu)
+{
+ int error, vector;
+ uint64_t info, rflags, interruptibility;
+
+ const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;
+
+ /*
+ * If there is already an interrupt pending then just return.
+ *
+ * This could happen if an interrupt was injected on a prior
+ * VM entry but the actual entry into guest mode was aborted
+ * because of a pending AST.
+ */
+ error = vmread(VMCS_ENTRY_INTR_INFO, &info);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
+ if (info & VMCS_INTERRUPTION_INFO_VALID)
+ return;
+
+ /*
+ * NMI injection has priority so deal with those first
+ */
+ if (vmx_inject_nmi(vmx, vcpu))
+ return;
+
+ /* Ask the local apic for a vector to inject */
+ vector = lapic_pending_intr(vmx->vm, vcpu);
+ if (vector < 0)
+ return;
+
+ if (vector < 32 || vector > 255)
+ panic("vmx_inject_interrupts: invalid vector %d\n", vector);
+
+ /* Check RFLAGS.IF and the interruptibility state of the guest */
+ error = vmread(VMCS_GUEST_RFLAGS, &rflags);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(rflags) %d", error);
+
+ if ((rflags & PSL_I) == 0)
+ goto cantinject;
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_interrupts: vmread(interruptibility) %d",
+ error);
+ }
+ if (interruptibility & HWINTR_BLOCKED)
+ goto cantinject;
+
+ /* Inject the interrupt */
+ info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
+ info |= vector;
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);
+
+ /* Update the Local APIC ISR */
+ lapic_intr_accepted(vmx->vm, vcpu, vector);
+
+ VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
+
+ return;
+
+cantinject:
+ /*
+ * Set the Interrupt Window Exiting execution control so we can inject
+ * the interrupt as soon as the blocking condition goes away.
+ */
+ vmx_set_int_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
+}
+
+static int
+vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+{
+ int error, cr, vmcs_guest_cr;
+ uint64_t regval, ones_mask, zeros_mask;
+ const struct vmxctx *vmxctx;
+
+ /* We only handle mov to %cr0 or %cr4 at this time */
+ if ((exitqual & 0xf0) != 0x00)
+ return (UNHANDLED);
+
+ cr = exitqual & 0xf;
+ if (cr != 0 && cr != 4)
+ return (UNHANDLED);
+
+ vmxctx = &vmx->ctx[vcpu];
+
+ /*
+ * We must use vmwrite() directly here because vmcs_setreg() will
+ * call vmclear(vmcs) as a side-effect which we certainly don't want.
+ */
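+ /* Bits 11:8 of the exit qualification identify the source register */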
+ switch ((exitqual >> 8) & 0xf) {
+ case 0:
+ regval = vmxctx->guest_rax;
+ break;
+ case 1:
+ regval = vmxctx->guest_rcx;
+ break;
+ case 2:
+ regval = vmxctx->guest_rdx;
+ break;
+ case 3:
+ regval = vmxctx->guest_rbx;
+ break;
+ case 4:
+ error = vmread(VMCS_GUEST_RSP, &regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: "
+ "error %d reading guest rsp", error);
+ }
+ break;
+ case 5:
+ regval = vmxctx->guest_rbp;
+ break;
+ case 6:
+ regval = vmxctx->guest_rsi;
+ break;
+ case 7:
+ regval = vmxctx->guest_rdi;
+ break;
+ case 8:
+ regval = vmxctx->guest_r8;
+ break;
+ case 9:
+ regval = vmxctx->guest_r9;
+ break;
+ case 10:
+ regval = vmxctx->guest_r10;
+ break;
+ case 11:
+ regval = vmxctx->guest_r11;
+ break;
+ case 12:
+ regval = vmxctx->guest_r12;
+ break;
+ case 13:
+ regval = vmxctx->guest_r13;
+ break;
+ case 14:
+ regval = vmxctx->guest_r14;
+ break;
+ case 15:
+ regval = vmxctx->guest_r15;
+ break;
+ }
+
+ if (cr == 0) {
+ ones_mask = cr0_ones_mask;
+ zeros_mask = cr0_zeros_mask;
+ vmcs_guest_cr = VMCS_GUEST_CR0;
+ } else {
+ ones_mask = cr4_ones_mask;
+ zeros_mask = cr4_zeros_mask;
+ vmcs_guest_cr = VMCS_GUEST_CR4;
+ }
+ regval |= ones_mask;
+ regval &= ~zeros_mask;
+ error = vmwrite(vmcs_guest_cr, regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: error %d writing cr%d",
+ error, cr);
+ }
+
+ return (HANDLED);
+}
+
+static int
+vmx_ept_fault(struct vm *vm, int cpu,
+ uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
+ uint64_t cr3, uint64_t ept_qual, struct vie *vie)
+{
+ int read, write, error;
+
+ /* EPT violation on an instruction fetch doesn't make sense here */
+ if (ept_qual & EPT_VIOLATION_INST_FETCH)
+ return (UNHANDLED);
+
+ /* EPT violation must be a read fault or a write fault */
+ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
+ write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
+ if ((read | write) == 0)
+ return (UNHANDLED);
+
+ /*
+ * The EPT violation must have been caused by accessing a
+ * guest-physical address that is a translation of a guest-linear
+ * address.
+ */
+ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
+ (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
+ return (UNHANDLED);
+ }
+
+ /* Fetch, decode and emulate the faulting instruction */
+ if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
+ return (UNHANDLED);
+
+ if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
+ return (UNHANDLED);
+
+ /*
+ * Check if this is a local apic access
+ */
+ if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
+ return (UNHANDLED);
+
+ error = vmm_emulate_instruction(vm, cpu, gpa, vie,
+ lapic_mmio_read, lapic_mmio_write, 0);
+
+ return (error ? UNHANDLED : HANDLED);
+}
+
+static int
+vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+ int error, handled;
+ struct vmcs *vmcs;
+ struct vmxctx *vmxctx;
+ uint32_t eax, ecx, edx;
+ uint64_t qual, gla, gpa, cr3, intr_info;
+
+ handled = 0;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ qual = vmexit->u.vmx.exit_qualification;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+
+ switch (vmexit->u.vmx.exit_reason) {
+ case EXIT_REASON_CR_ACCESS:
+ handled = vmx_emulate_cr_access(vmx, vcpu, qual);
+ break;
+ case EXIT_REASON_RDMSR:
+ ecx = vmxctx->guest_rcx;
+ error = emulate_rdmsr(vmx->vm, vcpu, ecx);
+ if (error) {
+ vmexit->exitcode = VM_EXITCODE_RDMSR;
+ vmexit->u.msr.code = ecx;
+ } else
+ handled = 1;
+ break;
+ case EXIT_REASON_WRMSR:
+ eax = vmxctx->guest_rax;
+ ecx = vmxctx->guest_rcx;
+ edx = vmxctx->guest_rdx;
+ error = emulate_wrmsr(vmx->vm, vcpu, ecx,
+ (uint64_t)edx << 32 | eax);
+ if (error) {
+ vmexit->exitcode = VM_EXITCODE_WRMSR;
+ vmexit->u.msr.code = ecx;
+ vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
+ } else
+ handled = 1;
+ break;
+ case EXIT_REASON_HLT:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
+ /*
+ * If there is an event waiting to be injected then there is
+ * no need to 'hlt'.
+ */
+ error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
+ if (error)
+ panic("vmx_exit_process: vmread(intrinfo) %d", error);
+
+ if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
+ handled = 1;
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
+ } else
+ vmexit->exitcode = VM_EXITCODE_HLT;
+ break;
+ case EXIT_REASON_MTF:
+ vmexit->exitcode = VM_EXITCODE_MTRAP;
+ break;
+ case EXIT_REASON_PAUSE:
+ vmexit->exitcode = VM_EXITCODE_PAUSE;
+ break;
+ case EXIT_REASON_INTR_WINDOW:
+ vmx_clear_int_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
+ /* FALLTHRU */
+ case EXIT_REASON_EXT_INTR:
+ /*
+ * External interrupts serve only to cause VM exits and allow
+ * the host interrupt handler to run.
+ *
+ * If this external interrupt triggers a virtual interrupt
+ * to a VM, then that state will be recorded by the
+ * host interrupt handler in the VM's softc. We will inject
+ * this virtual interrupt during the subsequent VM enter.
+ */
+
+ /*
+ * This is special. We want to treat this as a 'handled'
+ * VM-exit but not increment the instruction pointer.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
+ return (1);
+ case EXIT_REASON_NMI_WINDOW:
+ /* Exit to allow the pending virtual NMI to be injected */
+ vmx_clear_nmi_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
+ return (1);
+ case EXIT_REASON_INOUT:
+ vmexit->exitcode = VM_EXITCODE_INOUT;
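+ /*
+ * Decode the exit qualification for the I/O instruction:
+ * bits 2:0 hold (size - 1), bit 3 the direction, bit 4 a
+ * string operation, bit 5 a REP prefix and bits 31:16 the
+ * port number.
+ */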
+ vmexit->u.inout.bytes = (qual & 0x7) + 1;
+ vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
+ vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
+ vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
+ vmexit->u.inout.port = (uint16_t)(qual >> 16);
+ vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
+ break;
+ case EXIT_REASON_CPUID:
+ handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
+ break;
+ case EXIT_REASON_EPT_FAULT:
+ gla = vmcs_gla();
+ gpa = vmcs_gpa();
+ cr3 = vmcs_guest_cr3();
+ handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
+ vmexit->rip, vmexit->inst_length,
+ cr3, qual, &vmexit->u.paging.vie);
+ if (!handled) {
+ vmexit->exitcode = VM_EXITCODE_PAGING;
+ vmexit->u.paging.gpa = gpa;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (handled) {
+ /*
+ * It is possible that control is returned to userland
+ * even though we were able to handle the VM exit in the
+ * kernel.
+ *
+ * In such a case we want to make sure that the userland
+ * restarts guest execution at the instruction *after*
+ * the one we just processed. Therefore we update the
+ * guest rip in the VMCS and in 'vmexit'.
+ */
+ vm_exit_update_rip(vmexit);
+ vmexit->rip += vmexit->inst_length;
+ vmexit->inst_length = 0;
+
+ /*
+ * Special case for spinning up an AP - exit to userspace to
+ * give the controlling process a chance to intercept and
+ * spin up a thread for the AP.
+ */
+ if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
+ handled = 0;
+ } else {
+ if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
+ /*
+ * If this VM exit was not claimed by anybody then
+ * treat it as a generic VMX exit.
+ */
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.error = 0;
+ } else {
+ /*
+ * The exitcode and collateral have been populated.
+ * The VM exit will be processed further in userland.
+ */
+ }
+ }
+ return (handled);
+}
+
+static int
+vmx_run(void *arg, int vcpu, register_t rip)
+{
+ int error, vie, rc, handled, astpending;
+ uint32_t exit_reason;
+ struct vmx *vmx;
+ struct vmxctx *vmxctx;
+ struct vmcs *vmcs;
+ struct vm_exit *vmexit;
+
+ vmx = arg;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ vmxctx->launched = 0;
+
+ astpending = 0;
+ vmexit = vm_exitinfo(vmx->vm, vcpu);
+
+ /*
+ * XXX Can we avoid doing this every time we do a vm run?
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * XXX
+ * We do this every time because we may setup the virtual machine
+ * from a different process than the one that actually runs it.
+ *
+ * If the life of a virtual machine was spent entirely in the context
+ * of a single process we could do this once in vmcs_set_defaults().
+ */
+ if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
+ panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);
+
+ if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+
+ if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
+ panic("vmx_run: error %d setting up pcpu defaults", error);
+
+ do {
+ lapic_timer_tick(vmx->vm, vcpu);
+ vmx_inject_interrupts(vmx, vcpu);
+ vmx_run_trace(vmx, vcpu);
+ rc = vmx_setjmp(vmxctx);
+#ifdef SETJMP_TRACE
+ vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
+#endif
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ if (vmxctx->launched == 0) {
+ vmxctx->launched = 1;
+ vmx_launch(vmxctx);
+ } else
+ vmx_resume(vmxctx);
+ panic("vmx_launch/resume should not return");
+ break;
+ case VMX_RETURN_LONGJMP:
+ break; /* vm exit */
+ case VMX_RETURN_AST:
+ astpending = 1;
+ break;
+ case VMX_RETURN_VMRESUME:
+ vie = vmcs_instruction_error();
+ if (vmxctx->launch_error == VM_FAIL_INVALID ||
+ vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
+ printf("vmresume error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+ goto err_exit;
+ }
+ vmx_launch(vmxctx); /* try to launch the guest */
+ panic("vmx_launch should not return");
+ break;
+ case VMX_RETURN_VMLAUNCH:
+ vie = vmcs_instruction_error();
+#if 1
+ printf("vmlaunch error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+#endif
+ goto err_exit;
+ default:
+ panic("vmx_setjmp returned %d", rc);
+ }
+
+ /*
+ * A VM exit loads RFLAGS with all bits except bit 1 cleared,
+ * so interrupts are disabled at this point; enable them
+ * before processing the exit.
+ */
+ enable_intr();
+
+ /* collect some basic information for VM exit processing */
+ vmexit->rip = rip = vmcs_guest_rip();
+ vmexit->inst_length = vmexit_instruction_length();
+ vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
+ vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
+
+ if (astpending) {
+ handled = 1;
+ vmexit->inst_length = 0;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+ vmx_astpending_trace(vmx, vcpu, rip);
+ break;
+ }
+
+ handled = vmx_exit_process(vmx, vcpu, vmexit);
+ vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
+
+ } while (handled);
+
+ /*
+ * If a VM exit has been handled then the exitcode must be BOGUS.
+ * If a VM exit is not handled then the exitcode must not be BOGUS.
+ */
+ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
+ (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
+ panic("Mismatch between handled (%d) and exitcode (%d)",
+ handled, vmexit->exitcode);
+ }
+
+ VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", vmexit->exitcode);
+
+ /*
+ * XXX
+ * We need to do this to ensure that any VMCS state cached by the
+ * processor is flushed to memory. We need to do this in case the
+ * VM moves to a different cpu the next time it runs.
+ *
+ * Can we avoid doing this?
+ */
+ VMCLEAR(vmcs);
+ return (0);
+
+err_exit:
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.exit_reason = (uint32_t)-1;
+ vmexit->u.vmx.exit_qualification = (uint32_t)-1;
+ vmexit->u.vmx.error = vie;
+ VMCLEAR(vmcs);
+ return (ENOEXEC);
+}
+
+static void
+vmx_vmcleanup(void *arg)
+{
+ int error;
+ struct vmx *vmx = arg;
+
+ /*
+ * XXXSMP we also need to vmclear the VMCSs that are active on the
+ * other cpus.
+ */
+ error = vmclear(&vmx->vmcs[0]);
+ if (error != 0)
+ panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
+
+ ept_vmcleanup(vmx);
+ free(vmx, M_VMX);
+
+ return;
+}
+
+static register_t *
+vmxctx_regptr(struct vmxctx *vmxctx, int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_RAX:
+ return (&vmxctx->guest_rax);
+ case VM_REG_GUEST_RBX:
+ return (&vmxctx->guest_rbx);
+ case VM_REG_GUEST_RCX:
+ return (&vmxctx->guest_rcx);
+ case VM_REG_GUEST_RDX:
+ return (&vmxctx->guest_rdx);
+ case VM_REG_GUEST_RSI:
+ return (&vmxctx->guest_rsi);
+ case VM_REG_GUEST_RDI:
+ return (&vmxctx->guest_rdi);
+ case VM_REG_GUEST_RBP:
+ return (&vmxctx->guest_rbp);
+ case VM_REG_GUEST_R8:
+ return (&vmxctx->guest_r8);
+ case VM_REG_GUEST_R9:
+ return (&vmxctx->guest_r9);
+ case VM_REG_GUEST_R10:
+ return (&vmxctx->guest_r10);
+ case VM_REG_GUEST_R11:
+ return (&vmxctx->guest_r11);
+ case VM_REG_GUEST_R12:
+ return (&vmxctx->guest_r12);
+ case VM_REG_GUEST_R13:
+ return (&vmxctx->guest_r13);
+ case VM_REG_GUEST_R14:
+ return (&vmxctx->guest_r14);
+ case VM_REG_GUEST_R15:
+ return (&vmxctx->guest_r15);
+ default:
+ break;
+ }
+ return (NULL);
+}
+
+static int
+vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *retval = *regp;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *regp = val;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
+{
+ struct vmx *vmx = arg;
+
+ if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
+ return (0);
+
+ /*
+ * If the vcpu is running then don't mess with the VMCS.
+ *
+ * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause
+ * the subsequent vmlaunch/vmresume to fail.
+ */
+ if (vcpu_is_running(vmx->vm, vcpu))
+ panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval));
+}
+
+static int
+vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
+{
+ int error;
+ uint64_t ctls;
+ struct vmx *vmx = arg;
+
+ /*
+ * XXX Allow caller to set contents of the guest registers saved in
+ * the 'vmxctx' even though the vcpu might be running. We need this
+ * specifically to support the rdmsr emulation that will set the
+ * %eax and %edx registers during vm exit processing.
+ */
+ if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
+ return (0);
+
+ /*
+ * If the vcpu is running then don't mess with the VMCS.
+ *
+ * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause
+ * the subsequent vmlaunch/vmresume to fail.
+ */
+ if (vcpu_is_running(vmx->vm, vcpu))
+ panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val);
+
+ if (error == 0) {
+ /*
+ * If the "load EFER" VM-entry control is 1 then the
+ * value of EFER.LMA must be identical to "IA-32e mode guest"
+ * bit in the VM-entry control.
+ */
+ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
+ (reg == VM_REG_GUEST_EFER)) {
+ vmcs_getreg(&vmx->vmcs[vcpu],
+ VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
+ if (val & EFER_LMA)
+ ctls |= VM_ENTRY_GUEST_LMA;
+ else
+ ctls &= ~VM_ENTRY_GUEST_LMA;
+ vmcs_setreg(&vmx->vmcs[vcpu],
+ VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
+ }
+ }
+
+ return (error);
+}
+
+static int
+vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+static int
+vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+static int
+vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
+ int code_valid)
+{
+ int error;
+ uint64_t info;
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+
+ static uint32_t type_map[VM_EVENT_MAX] = {
+ 0x1, /* VM_EVENT_NONE */
+ 0x0, /* VM_HW_INTR */
+ 0x2, /* VM_NMI */
+ 0x3, /* VM_HW_EXCEPTION */
+ 0x4, /* VM_SW_INTR */
+ 0x5, /* VM_PRIV_SW_EXCEPTION */
+ 0x6, /* VM_SW_EXCEPTION */
+ };
+
+ /*
+ * If there is already an exception pending to be delivered to the
+ * vcpu then just return.
+ */
+ error = vmcs_getreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
+ if (error)
+ return (error);
+
+ if (info & VMCS_INTERRUPTION_INFO_VALID)
+ return (EAGAIN);
+
+ info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
+ info |= VMCS_INTERRUPTION_INFO_VALID;
+ error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
+ if (error != 0)
+ return (error);
+
+ if (code_valid) {
+ error = vmcs_setreg(vmcs,
+ VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
+ code);
+ }
+ return (error);
+}
+
+static int
+vmx_getcap(void *arg, int vcpu, int type, int *retval)
+{
+ struct vmx *vmx = arg;
+ int vcap;
+ int ret;
+
+ ret = ENOENT;
+
+ vcap = vmx->cap[vcpu].set;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit)
+ ret = 0;
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit)
+ ret = 0;
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap)
+ ret = 0;
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest)
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+ if (ret == 0)
+ *retval = (vcap & (1 << type)) ? 1 : 0;
+
+ return (ret);
+}
+
+static int
+vmx_setcap(void *arg, int vcpu, int type, int val)
+{
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+ uint32_t baseval;
+ uint32_t *pptr;
+ int error;
+ int flag;
+ int reg;
+ int retval;
+
+ retval = ENOENT;
+ pptr = NULL;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_HLT_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_MTF;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_PAUSE_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest) {
+ retval = 0;
+ baseval = procbased_ctls2;
+ flag = PROCBASED2_UNRESTRICTED_GUEST;
+ reg = VMCS_SEC_PROC_BASED_CTLS;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (retval == 0) {
+ if (val) {
+ baseval |= flag;
+ } else {
+ baseval &= ~flag;
+ }
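+ /*
+ * Make the vmcs current for the vmwrite() and then vmclear
+ * it so a later vmlaunch/vmresume does not fail.
+ */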
+ VMPTRLD(vmcs);
+ error = vmwrite(reg, baseval);
+ VMCLEAR(vmcs);
+
+ if (error) {
+ retval = error;
+ } else {
+ /*
+ * Update optional stored flags, and record
+ * setting
+ */
+ if (pptr != NULL) {
+ *pptr = baseval;
+ }
+
+ if (val) {
+ vmx->cap[vcpu].set |= (1 << type);
+ } else {
+ vmx->cap[vcpu].set &= ~(1 << type);
+ }
+ }
+ }
+
+ return (retval);
+}
+
+struct vmm_ops vmm_ops_intel = {
+ vmx_init,
+ vmx_cleanup,
+ vmx_vminit,
+ vmx_run,
+ vmx_vmcleanup,
+ ept_vmmmap_set,
+ ept_vmmmap_get,
+ vmx_getreg,
+ vmx_setreg,
+ vmx_getdesc,
+ vmx_setdesc,
+ vmx_inject,
+ vmx_getcap,
+ vmx_setcap
+};
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
new file mode 100644
index 0000000..c7cd567
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -0,0 +1,120 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_H_
+#define _VMX_H_
+
+#include "vmcs.h"
+
+#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
+
+struct vmxctx {
+ register_t tmpstk[32]; /* vmx_return() stack */
+ register_t tmpstktop;
+
+ register_t guest_rdi; /* Guest state */
+ register_t guest_rsi;
+ register_t guest_rdx;
+ register_t guest_rcx;
+ register_t guest_r8;
+ register_t guest_r9;
+ register_t guest_rax;
+ register_t guest_rbx;
+ register_t guest_rbp;
+ register_t guest_r10;
+ register_t guest_r11;
+ register_t guest_r12;
+ register_t guest_r13;
+ register_t guest_r14;
+ register_t guest_r15;
+ register_t guest_cr2;
+
+ register_t host_r15; /* Host state */
+ register_t host_r14;
+ register_t host_r13;
+ register_t host_r12;
+ register_t host_rbp;
+ register_t host_rsp;
+ register_t host_rbx;
+ register_t host_rip;
+ /*
+ * XXX todo debug registers and fpu state
+ */
+
+ int launched; /* vmcs launch state */
+ int launch_error;
+};
+
+struct vmxcap {
+ int set;
+ uint32_t proc_ctls;
+};
+
+struct vmxstate {
+ int lastcpu; /* host cpu that this 'vcpu' last ran on */
+ uint16_t vpid;
+};
+
+/* virtual machine softc */
+struct vmx {
+ pml4_entry_t pml4ept[NPML4EPG];
+ struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
+ char msr_bitmap[PAGE_SIZE];
+ struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
+ struct vmxctx ctx[VM_MAXCPU];
+ struct vmxcap cap[VM_MAXCPU];
+ struct vmxstate state[VM_MAXCPU];
+ struct vm *vm;
+};
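+/*
+ * The processor uses the physical addresses of the EPT pml4, the VMCSs
+ * and the MSR bitmap, all of which must be page-aligned.
+ */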
+CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
+
+#define VMX_RETURN_DIRECT 0
+#define VMX_RETURN_LONGJMP 1
+#define VMX_RETURN_VMRESUME 2
+#define VMX_RETURN_VMLAUNCH 3
+#define VMX_RETURN_AST 4
+/*
+ * vmx_setjmp() returns:
+ * - 0 when it returns directly
+ * - 1 when it returns from vmx_longjmp
+ * - 2 when it returns from vmx_resume (which would only be in the error case)
+ * - 3 when it returns from vmx_launch (which would only be in the error case)
+ * - 4 when it returns from vmx_resume or vmx_launch because of AST pending
+ */
+int vmx_setjmp(struct vmxctx *ctx);
+void vmx_longjmp(void); /* returns via vmx_setjmp */
+void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+
+u_long vmx_fix_cr0(u_long cr0);
+u_long vmx_fix_cr4(u_long cr4);
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h
new file mode 100644
index 0000000..31f29f8
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_controls.h
@@ -0,0 +1,92 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CONTROLS_H_
+#define _VMX_CONTROLS_H_
+
+/* Pin-Based VM-Execution Controls */
+#define PINBASED_EXTINT_EXITING (1 << 0)
+#define PINBASED_NMI_EXITING (1 << 3)
+#define PINBASED_VIRTUAL_NMI (1 << 5)
+#define PINBASED_PREMPTION_TIMER (1 << 6)
+
+/* Primary Processor-Based VM-Execution Controls */
+#define PROCBASED_INT_WINDOW_EXITING (1 << 2)
+#define PROCBASED_TSC_OFFSET (1 << 3)
+#define PROCBASED_HLT_EXITING (1 << 7)
+#define PROCBASED_INVLPG_EXITING (1 << 9)
+#define PROCBASED_MWAIT_EXITING (1 << 10)
+#define PROCBASED_RDPMC_EXITING (1 << 11)
+#define PROCBASED_RDTSC_EXITING (1 << 12)
+#define PROCBASED_CR3_LOAD_EXITING (1 << 15)
+#define PROCBASED_CR3_STORE_EXITING (1 << 16)
+#define PROCBASED_CR8_LOAD_EXITING (1 << 19)
+#define PROCBASED_CR8_STORE_EXITING (1 << 20)
+#define PROCBASED_USE_TPR_SHADOW (1 << 21)
+#define PROCBASED_NMI_WINDOW_EXITING (1 << 22)
+#define PROCBASED_MOV_DR_EXITING (1 << 23)
+#define PROCBASED_IO_EXITING (1 << 24)
+#define PROCBASED_IO_BITMAPS (1 << 25)
+#define PROCBASED_MTF (1 << 27)
+#define PROCBASED_MSR_BITMAPS (1 << 28)
+#define PROCBASED_MONITOR_EXITING (1 << 29)
+#define PROCBASED_PAUSE_EXITING (1 << 30)
+#define PROCBASED_SECONDARY_CONTROLS (1 << 31)
+
+/* Secondary Processor-Based VM-Execution Controls */
+#define PROCBASED2_VIRTUALIZE_APIC (1 << 0)
+#define PROCBASED2_ENABLE_EPT (1 << 1)
+#define PROCBASED2_DESC_TABLE_EXITING (1 << 2)
+#define PROCBASED2_ENABLE_RDTSCP (1 << 3)
+#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4)
+#define PROCBASED2_ENABLE_VPID (1 << 5)
+#define PROCBASED2_WBINVD_EXITING (1 << 6)
+#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7)
+#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10)
+
+/* VM Exit Controls */
+#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2)
+#define VM_EXIT_HOST_LMA (1 << 9)
+#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12)
+#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15)
+#define VM_EXIT_SAVE_PAT (1 << 18)
+#define VM_EXIT_LOAD_PAT (1 << 19)
+#define VM_EXIT_SAVE_EFER (1 << 20)
+#define VM_EXIT_LOAD_EFER (1 << 21)
+#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22)
+
+/* VM Entry Controls */
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2)
+#define VM_ENTRY_GUEST_LMA (1 << 9)
+#define VM_ENTRY_INTO_SMM (1 << 10)
+#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11)
+#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13)
+#define VM_ENTRY_LOAD_PAT (1 << 14)
+#define VM_ENTRY_LOAD_EFER (1 << 15)
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h
new file mode 100644
index 0000000..2e66443
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_cpufunc.h
@@ -0,0 +1,218 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CPUFUNC_H_
+#define _VMX_CPUFUNC_H_
+
+struct vmcs;
+
+/*
+ * Section 5.2 "Conventions" from Intel Architecture Manual 2B.
+ *
+ * error
+ * VMsucceed 0
+ * VMFailInvalid 1
+ * VMFailValid 2 see also VMCS VM-Instruction Error Field
+ */
+#define VM_SUCCESS 0
+#define VM_FAIL_INVALID 1
+#define VM_FAIL_VALID 2
+#define VMX_SET_ERROR_CODE \
+ " jnc 1f;" \
+ " mov $1, %[error];" /* CF: error = 1 */ \
+ " jmp 3f;" \
+ "1: jnz 2f;" \
+ " mov $2, %[error];" /* ZF: error = 2 */ \
+ " jmp 3f;" \
+ "2: mov $0, %[error];" \
+ "3:"
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmxon(char *region)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(region);
+ __asm __volatile("vmxon %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+
+ return (error);
+}
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmclear(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmclear %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+static __inline void
+vmxoff(void)
+{
+
+ __asm __volatile("vmxoff");
+}
+
+static __inline void
+vmptrst(uint64_t *addr)
+{
+
+ __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory");
+}
+
+static __inline int
+vmptrld(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmptrld %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+static __inline int
+vmwrite(uint64_t reg, uint64_t val)
+{
+ int error;
+
+ __asm __volatile("vmwrite %[val], %[reg];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [val] "r" (val), [reg] "r" (reg)
+ : "memory");
+
+ return (error);
+}
+
+static __inline int
+vmread(uint64_t r, uint64_t *addr)
+{
+ int error;
+
+ __asm __volatile("vmread %[r], %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [r] "r" (r), [addr] "m" (*addr)
+ : "memory");
+
+ return (error);
+}
+
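+/*
+ * VMPTRLD() and VMCLEAR() bracket a critical section so that the thread
+ * cannot migrate to another cpu while this vmcs is current.
+ */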
+static void __inline
+VMCLEAR(struct vmcs *vmcs)
+{
+ int err;
+
+ err = vmclear(vmcs);
+ if (err != 0)
+ panic("%s: vmclear(%p) error %d", __func__, vmcs, err);
+
+ critical_exit();
+}
+
+static void __inline
+VMPTRLD(struct vmcs *vmcs)
+{
+ int err;
+
+ critical_enter();
+
+ err = vmptrld(vmcs);
+ if (err != 0)
+ panic("%s: vmptrld(%p) error %d", __func__, vmcs, err);
+}
+
+#define INVVPID_TYPE_ADDRESS 0UL
+#define INVVPID_TYPE_SINGLE_CONTEXT 1UL
+#define INVVPID_TYPE_ALL_CONTEXTS 2UL
+
+struct invvpid_desc {
+ uint16_t vpid;
+ uint16_t _res1;
+ uint32_t _res2;
+ uint64_t linear_addr;
+};
+CTASSERT(sizeof(struct invvpid_desc) == 16);
+
+static void __inline
+invvpid(uint64_t type, struct invvpid_desc desc)
+{
+ int error;
+
+ __asm __volatile("invvpid %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invvpid error %d", error);
+}
+
+#define INVEPT_TYPE_SINGLE_CONTEXT 1UL
+#define INVEPT_TYPE_ALL_CONTEXTS 2UL
+struct invept_desc {
+ uint64_t eptp;
+ uint64_t _res;
+};
+CTASSERT(sizeof(struct invept_desc) == 16);
+
+static void __inline
+invept(uint64_t type, struct invept_desc desc)
+{
+ int error;
+
+ __asm __volatile("invept %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invept error %d", error);
+}
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c
new file mode 100644
index 0000000..823a05d
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_genassym.c
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/assym.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmx.h"
+#include "vmx_cpufunc.h"
+
+ASSYM(VMXCTX_TMPSTKTOP, offsetof(struct vmxctx, tmpstktop));
+ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi));
+ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi));
+ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx));
+ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx));
+ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8));
+ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9));
+ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax));
+ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx));
+ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp));
+ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10));
+ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11));
+ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12));
+ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13));
+ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14));
+ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15));
+ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2));
+
+ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15));
+ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14));
+ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13));
+ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12));
+ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp));
+ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp));
+ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
+ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
+
+ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));
+
+ASSYM(VM_SUCCESS, VM_SUCCESS);
+ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
+ASSYM(VM_FAIL_VALID, VM_FAIL_VALID);
+
+ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT);
+ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
+ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
+ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);
+ASSYM(VMX_RETURN_AST, VMX_RETURN_AST);
+
+ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
+ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
+ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
+ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
new file mode 100644
index 0000000..2aba63c
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -0,0 +1,172 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/cpufunc.h>
+
+#include "vmx_msr.h"
+
+static boolean_t
+vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
+{
+
+ if (msr_val & (1UL << (bitpos + 32)))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static boolean_t
+vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
+{
+
+ if ((msr_val & (1UL << bitpos)) == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+uint32_t
+vmx_revision(void)
+{
+
+ return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
+}
+
+/*
+ * Generate a bitmask to be used for the VMCS execution control fields.
+ *
+ * The caller specifies what bits should be set to one in 'ones_mask'
+ * and what bits should be set to zero in 'zeros_mask'. The don't-care
+ * bits are set to the default value. The default values are obtained
+ * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
+ * VMX Capabilities".
+ *
+ * Returns zero on success and non-zero on error.
+ */
+int
+vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval)
+{
+ int i;
+ uint64_t val, trueval;
+ boolean_t true_ctls_avail, one_allowed, zero_allowed;
+
+ /* We cannot ask the same bit to be set to both '1' and '0' */
+ if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
+ return (EINVAL);
+
+ if (rdmsr(MSR_VMX_BASIC) & (1UL << 55))
+ true_ctls_avail = TRUE;
+ else
+ true_ctls_avail = FALSE;
+
+ val = rdmsr(ctl_reg);
+ if (true_ctls_avail)
+ trueval = rdmsr(true_ctl_reg); /* step c */
+ else
+ trueval = val; /* step a */
+
+ for (i = 0; i < 32; i++) {
+ one_allowed = vmx_ctl_allows_one_setting(trueval, i);
+ zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
+
+ KASSERT(one_allowed || zero_allowed,
+ ("invalid zero/one setting for bit %d of ctl 0x%0x, "
+ "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
+
+ if (zero_allowed && !one_allowed) { /* b(i),c(i) */
+ if (ones_mask & (1 << i))
+ return (EINVAL);
+ *retval &= ~(1 << i);
+ } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */
+ if (zeros_mask & (1 << i))
+ return (EINVAL);
+ *retval |= 1 << i;
+ } else {
+ if (zeros_mask & (1 << i)) /* b(ii),c(ii) */
+ *retval &= ~(1 << i);
+ else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
+ *retval |= 1 << i;
+ else if (!true_ctls_avail)
+ *retval &= ~(1 << i); /* b(iii) */
+ else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
+ *retval &= ~(1 << i);
+ else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
+ *retval |= 1 << i;
+ else {
+ panic("vmx_set_ctlreg: unable to determine "
+ "correct value of ctl bit %d for msr "
+ "0x%0x and true msr 0x%0x", i, ctl_reg,
+ true_ctl_reg);
+ }
+ }
+ }
+
+ return (0);
+}
+
+void
+msr_bitmap_initialize(char *bitmap)
+{
+
+ memset(bitmap, 0xff, PAGE_SIZE);
+}
+
+int
+msr_bitmap_change_access(char *bitmap, u_int msr, int access)
+{
+ int byte, bit;
+
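+ /*
+ * The 4KB bitmap consists of 1KB read bitmaps for the low
+ * (0 - 0x1FFF) and high (0xC0000000 - 0xC0001FFF) MSR ranges,
+ * followed by the corresponding write bitmaps at a 2KB offset.
+ * A clear bit allows the guest direct access to the MSR.
+ */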
+ if (msr <= 0x00001FFF)
+ byte = msr / 8;
+ else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
+ byte = 1024 + (msr - 0xC0000000) / 8;
+ else
+ return (EINVAL);
+
+ bit = msr & 0x7;
+
+ if (access & MSR_BITMAP_ACCESS_READ)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ byte += 2048;
+ if (access & MSR_BITMAP_ACCESS_WRITE)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ return (0);
+}
diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h
new file mode 100644
index 0000000..e6379a9
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.h
@@ -0,0 +1,78 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_MSR_H_
+#define _VMX_MSR_H_
+
+#define MSR_VMX_BASIC 0x480
+#define MSR_VMX_EPT_VPID_CAP 0x48C
+
+#define MSR_VMX_PROCBASED_CTLS 0x482
+#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E
+
+#define MSR_VMX_PINBASED_CTLS 0x481
+#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D
+
+#define MSR_VMX_PROCBASED_CTLS2 0x48B
+
+#define MSR_VMX_EXIT_CTLS 0x483
+#define MSR_VMX_TRUE_EXIT_CTLS 0x48F
+
+#define MSR_VMX_ENTRY_CTLS 0x484
+#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
+
+#define MSR_VMX_CR0_FIXED0 0x486
+#define MSR_VMX_CR0_FIXED1 0x487
+
+#define MSR_VMX_CR4_FIXED0 0x488
+#define MSR_VMX_CR4_FIXED1 0x489
+
+uint32_t vmx_revision(void);
+
+int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval);
+
+/*
+ * According to Section 21.10.4 "Software Access to Related Structures",
+ * changes to data structures pointed to by the VMCS must be made only when
+ * there is no logical processor with a current VMCS that points to the
+ * data structure.
+ *
+ * This pretty much limits us to configuring the MSR bitmap before VMCS
+ * initialization for SMP VMs. Unless of course we do it the hard way - which
+ * would involve some form of synchronization between the vcpus to vmclear
+ * all VMCSs' that point to the bitmap.
+ */
+#define MSR_BITMAP_ACCESS_NONE 0x0
+#define MSR_BITMAP_ACCESS_READ 0x1
+#define MSR_BITMAP_ACCESS_WRITE 0x2
+#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE)
+void msr_bitmap_initialize(char *bitmap);
+int msr_bitmap_change_access(char *bitmap, u_int msr, int access);
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
new file mode 100644
index 0000000..4ba582a
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -0,0 +1,246 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+
+#include "vmx_assym.s"
+
+/*
+ * Disable interrupts before updating %rsp in VMX_CHECK_AST or
+ * VMX_GUEST_RESTORE.
+ *
+ * The location that %rsp points to is a 'vmxctx' and not a
+ * real stack, so we don't want an interrupt handler to trash it.
+ */
+#define VMX_DISABLE_INTERRUPTS cli
+
+/*
+ * If the thread hosting the vcpu has an ast pending, then take care of it
+ * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST.
+ *
+ * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts
+ * are disabled.
+ */
+#define VMX_CHECK_AST \
+ movq PCPU(CURTHREAD),%rax; \
+ testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \
+ je 9f; \
+ movq $VMX_RETURN_AST,%rsi; \
+ movq %rdi,%rsp; \
+ addq $VMXCTX_TMPSTKTOP,%rsp; \
+ callq vmx_return; \
+9:
+
+/*
+ * Assumes that %rdi holds a pointer to the 'vmxctx'.
+ *
+ * On "return" all registers are updated to reflect guest state. The two
+ * exceptions are %rip and %rsp. These registers are atomically switched
+ * by hardware from the guest area of the vmcs.
+ *
+ * We modify %rsp to point to the 'vmxctx' so we can use it to restore
+ * host context in case of an error with 'vmlaunch' or 'vmresume'.
+ */
+#define VMX_GUEST_RESTORE \
+ movq %rdi,%rsp; \
+ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \
+ movq %rsi,%cr2; \
+ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \
+ movq VMXCTX_GUEST_RDX(%rdi),%rdx; \
+ movq VMXCTX_GUEST_RCX(%rdi),%rcx; \
+ movq VMXCTX_GUEST_R8(%rdi),%r8; \
+ movq VMXCTX_GUEST_R9(%rdi),%r9; \
+ movq VMXCTX_GUEST_RAX(%rdi),%rax; \
+ movq VMXCTX_GUEST_RBX(%rdi),%rbx; \
+ movq VMXCTX_GUEST_RBP(%rdi),%rbp; \
+ movq VMXCTX_GUEST_R10(%rdi),%r10; \
+ movq VMXCTX_GUEST_R11(%rdi),%r11; \
+ movq VMXCTX_GUEST_R12(%rdi),%r12; \
+ movq VMXCTX_GUEST_R13(%rdi),%r13; \
+ movq VMXCTX_GUEST_R14(%rdi),%r14; \
+ movq VMXCTX_GUEST_R15(%rdi),%r15; \
+ movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore %rdi last */
+
+#define VM_INSTRUCTION_ERROR(reg) \
+ jnc 1f; \
+ movl $VM_FAIL_INVALID,reg; /* CF is set */ \
+ jmp 3f; \
+1: jnz 2f; \
+ movl $VM_FAIL_VALID,reg; /* ZF is set */ \
+ jmp 3f; \
+2: movl $VM_SUCCESS,reg; \
+3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
+
+ .text
+/*
+ * int vmx_setjmp(ctxp)
+ * %rdi = ctxp
+ *
+ * Return value is '0' when it returns directly from here.
+ * Return value is '1' when it returns after a vm exit through vmx_longjmp().
+ */
+ENTRY(vmx_setjmp)
+ movq (%rsp),%rax /* return address */
+ movq %r15,VMXCTX_HOST_R15(%rdi)
+ movq %r14,VMXCTX_HOST_R14(%rdi)
+ movq %r13,VMXCTX_HOST_R13(%rdi)
+ movq %r12,VMXCTX_HOST_R12(%rdi)
+ movq %rbp,VMXCTX_HOST_RBP(%rdi)
+ movq %rsp,VMXCTX_HOST_RSP(%rdi)
+ movq %rbx,VMXCTX_HOST_RBX(%rdi)
+ movq %rax,VMXCTX_HOST_RIP(%rdi)
+
+ /*
+ * XXX save host debug registers
+ */
+ movl $VMX_RETURN_DIRECT,%eax
+ ret
+END(vmx_setjmp)
+
+/*
+ * void vmx_return(struct vmxctx *ctxp, int retval)
+ * %rdi = ctxp
+ * %rsi = retval
+ * Return to vmm context through vmx_setjmp() with a value of 'retval'.
+ */
+ENTRY(vmx_return)
+ /* Restore host context. */
+ movq VMXCTX_HOST_R15(%rdi),%r15
+ movq VMXCTX_HOST_R14(%rdi),%r14
+ movq VMXCTX_HOST_R13(%rdi),%r13
+ movq VMXCTX_HOST_R12(%rdi),%r12
+ movq VMXCTX_HOST_RBP(%rdi),%rbp
+ movq VMXCTX_HOST_RSP(%rdi),%rsp
+ movq VMXCTX_HOST_RBX(%rdi),%rbx
+ movq VMXCTX_HOST_RIP(%rdi),%rax
+ movq %rax,(%rsp) /* return address */
+
+ /*
+ * XXX restore host debug registers
+ */
+ movl %esi,%eax
+ ret
+END(vmx_return)
+
+/*
+ * void vmx_longjmp(void)
+ * %rsp points to the struct vmxctx
+ */
+ENTRY(vmx_longjmp)
+ /*
+ * Save guest state that is not automatically saved in the vmcs.
+ */
+ movq %rdi,VMXCTX_GUEST_RDI(%rsp)
+ movq %rsi,VMXCTX_GUEST_RSI(%rsp)
+ movq %rdx,VMXCTX_GUEST_RDX(%rsp)
+ movq %rcx,VMXCTX_GUEST_RCX(%rsp)
+ movq %r8,VMXCTX_GUEST_R8(%rsp)
+ movq %r9,VMXCTX_GUEST_R9(%rsp)
+ movq %rax,VMXCTX_GUEST_RAX(%rsp)
+ movq %rbx,VMXCTX_GUEST_RBX(%rsp)
+ movq %rbp,VMXCTX_GUEST_RBP(%rsp)
+ movq %r10,VMXCTX_GUEST_R10(%rsp)
+ movq %r11,VMXCTX_GUEST_R11(%rsp)
+ movq %r12,VMXCTX_GUEST_R12(%rsp)
+ movq %r13,VMXCTX_GUEST_R13(%rsp)
+ movq %r14,VMXCTX_GUEST_R14(%rsp)
+ movq %r15,VMXCTX_GUEST_R15(%rsp)
+
+ movq %cr2,%rdi
+ movq %rdi,VMXCTX_GUEST_CR2(%rsp)
+
+ movq %rsp,%rdi
+ movq $VMX_RETURN_LONGJMP,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_longjmp)
+
+/*
+ * void vmx_resume(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is 'void', this function may return indirectly
+ * through vmx_setjmp() with a return value of 2 (VMX_RETURN_VMRESUME).
+ */
+ENTRY(vmx_resume)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmresume
+
+ /*
+ * Capture the reason why vmresume failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMRESUME,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_resume)
+
+/*
+ * void vmx_launch(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is 'void', this function may return indirectly
+ * through vmx_setjmp() with a return value of 3 (VMX_RETURN_VMLAUNCH).
+ */
+ENTRY(vmx_launch)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmlaunch
+
+ /*
+ * Capture the reason why vmlaunch failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMLAUNCH,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_launch)
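
The assembly above implements a setjmp/longjmp-style protocol: vmx_setjmp() snapshots the host's callee-saved registers into the vmxctx and returns VMX_RETURN_DIRECT, while the host RIP programmed into the VMCS points at vmx_longjmp(), so every VM-exit "returns" from that same vmx_setjmp() call a second time. A hedged sketch of the C-side caller (the committed logic lives in vmx.c and is more involved; handle_exit() is an invented placeholder):

    rc = vmx_setjmp(&vmxctx);
    switch (rc) {
    case VMX_RETURN_DIRECT:         /* first entry into the guest */
            vmx_launch(&vmxctx);    /* does not return on success */
            break;
    case VMX_RETURN_LONGJMP:        /* back here after a VM-exit */
            handle_exit(&vmxctx);   /* hypothetical exit handler */
            break;
    case VMX_RETURN_AST:            /* VMX_CHECK_AST bailed out early */
    case VMX_RETURN_VMRESUME:       /* vmresume itself failed */
    case VMX_RETURN_VMLAUNCH:       /* vmlaunch itself failed */
            /* error and retry paths */
            break;
    }

Subsequent entries into the same vcpu would use vmx_resume() rather than vmx_launch(), since the VMCS has already been launched.
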
diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c
new file mode 100644
index 0000000..ef0e9bc
--- /dev/null
+++ b/sys/amd64/vmm/intel/vtd.c
@@ -0,0 +1,677 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+#include <machine/pci_cfgreg.h>
+
+#include "io/iommu.h"
+
+/*
+ * Documented in the "Intel Virtualization Technology for Directed I/O",
+ * Architecture Spec, September 2008.
+ */
+
+/* Section 10.4 "Register Descriptions" */
+struct vtdmap {
+ volatile uint32_t version;
+ volatile uint32_t res0;
+ volatile uint64_t cap;
+ volatile uint64_t ext_cap;
+ volatile uint32_t gcr;
+ volatile uint32_t gsr;
+ volatile uint64_t rta;
+ volatile uint64_t ccr;
+};
+
+#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F)
+#define VTD_CAP_ND(cap) ((cap) & 0x7)
+#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1)
+#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF)
+#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1)
+
+#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1)
+#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
+#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF)
+
+#define VTD_GCR_WBF (1 << 27)
+#define VTD_GCR_SRTP (1 << 30)
+#define VTD_GCR_TE (1 << 31)
+
+#define VTD_GSR_WBFS (1 << 27)
+#define VTD_GSR_RTPS (1 << 30)
+#define VTD_GSR_TES (1 << 31)
+
+#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */
+#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */
+
+#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */
+#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */
+#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */
+#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */
+#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */
+#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */
+#define VTD_IIR_DOMAIN_P 32
+
+#define VTD_ROOT_PRESENT 0x1
+#define VTD_CTX_PRESENT 0x1
+#define VTD_CTX_TT_ALL (1UL << 2)
+
+#define VTD_PTE_RD (1UL << 0)
+#define VTD_PTE_WR (1UL << 1)
+#define VTD_PTE_SUPERPAGE (1UL << 7)
+#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL)
+
+struct domain {
+ uint64_t *ptp; /* first level page table page */
+ int pt_levels; /* number of page table levels */
+ int addrwidth; /* 'AW' field in context entry */
+ int spsmask; /* supported super page sizes */
+ u_int id; /* domain id */
+ vm_paddr_t maxaddr; /* highest address to be mapped */
+ SLIST_ENTRY(domain) next;
+};
+
+static SLIST_HEAD(, domain) domhead;
+
+#define DRHD_MAX_UNITS 8
+static int drhd_num;
+static struct vtdmap *vtdmaps[DRHD_MAX_UNITS];
+static int max_domains;
+typedef int (*drhd_ident_func_t)(void);
+
+static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+
+static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
+
+/*
+ * Config space register definitions from the "Intel 5520 and 5500" datasheet.
+ */
+static int
+tylersburg_vtd_ident(void)
+{
+ int units, nlbus;
+ uint16_t did, vid;
+ uint32_t miscsts, vtbar;
+
+ const int bus = 0;
+ const int slot = 20;
+ const int func = 0;
+
+ units = 0;
+
+ vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2);
+ did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2);
+ if (vid != 0x8086 || did != 0x342E)
+ goto done;
+
+ /*
+ * Check if this is a dual IOH configuration.
+ */
+ miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4);
+ if (miscsts & (1 << 25))
+ nlbus = pci_cfgregread(bus, slot, func, 0x160, 1);
+ else
+ nlbus = -1;
+
+ vtbar = pci_cfgregread(bus, slot, func, 0x180, 4);
+ if (vtbar & 0x1) {
+ vtdmaps[units++] = (struct vtdmap *)
+ PHYS_TO_DMAP(vtbar & 0xffffe000);
+ } else if (bootverbose)
+ printf("VT-d unit in legacy IOH is disabled!\n");
+
+ if (nlbus != -1) {
+ vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4);
+ if (vtbar & 0x1) {
+ vtdmaps[units++] = (struct vtdmap *)
+ PHYS_TO_DMAP(vtbar & 0xffffe000);
+ } else if (bootverbose)
+ printf("VT-d unit in non-legacy IOH is disabled!\n");
+ }
+done:
+ return (units);
+}
+
+static drhd_ident_func_t drhd_ident_funcs[] = {
+ tylersburg_vtd_ident,
+ NULL
+};
+
+static int
+vtd_max_domains(struct vtdmap *vtdmap)
+{
+ int nd;
+
+ nd = VTD_CAP_ND(vtdmap->cap);
+
+ switch (nd) {
+ case 0:
+ return (16);
+ case 1:
+ return (64);
+ case 2:
+ return (256);
+ case 3:
+ return (1024);
+ case 4:
+ return (4 * 1024);
+ case 5:
+ return (16 * 1024);
+ case 6:
+ return (64 * 1024);
+ default:
+ panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
+ }
+}
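
The switch above simply unrolls the encoding of the ND field in the VT-d Capability Register: the hardware supports 2^(4 + 2*ND) domain IDs, so ND = 0 means 2^4 = 16 domains and ND = 6 means 2^16 = 65536.
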
+
+static u_int
+domain_id(void)
+{
+ u_int id;
+ struct domain *dom;
+
+ /* Skip domain id 0 - it is reserved when the Caching Mode field is set */
+ for (id = 1; id < max_domains; id++) {
+ SLIST_FOREACH(dom, &domhead, next) {
+ if (dom->id == id)
+ break;
+ }
+ if (dom == NULL)
+ break; /* found it */
+ }
+
+ if (id >= max_domains)
+ panic("domain ids exhausted");
+
+ return (id);
+}
+
+static void
+vtd_wbflush(struct vtdmap *vtdmap)
+{
+
+ if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
+ pmap_invalidate_cache();
+
+ if (VTD_CAP_RWBF(vtdmap->cap)) {
+ vtdmap->gcr = VTD_GCR_WBF;
+ while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
+ ;
+ }
+}
+
+static void
+vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
+{
+
+ vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
+ while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
+ ;
+}
+
+static void
+vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
+{
+ int offset;
+ volatile uint64_t *iotlb_reg, val;
+
+ vtd_wbflush(vtdmap);
+
+ offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
+ iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
+
+ *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
+ VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
+
+ while (1) {
+ val = *iotlb_reg;
+ if ((val & VTD_IIR_IVT) == 0)
+ break;
+ }
+}
+
+static void
+vtd_translation_enable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = VTD_GCR_TE;
+ while ((vtdmap->gsr & VTD_GSR_TES) == 0)
+ ;
+}
+
+static void
+vtd_translation_disable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = 0;
+ while ((vtdmap->gsr & VTD_GSR_TES) != 0)
+ ;
+}
+
+static int
+vtd_init(void)
+{
+ int i, units;
+ struct vtdmap *vtdmap;
+ vm_paddr_t ctx_paddr;
+
+ for (i = 0; drhd_ident_funcs[i] != NULL; i++) {
+ units = (*drhd_ident_funcs[i])();
+ if (units > 0)
+ break;
+ }
+
+ if (units <= 0)
+ return (ENXIO);
+
+ drhd_num = units;
+ vtdmap = vtdmaps[0];
+
+ if (VTD_CAP_CM(vtdmap->cap) != 0)
+ panic("vtd_init: invalid caching mode");
+
+ max_domains = vtd_max_domains(vtdmap);
+
+ /*
+ * Set up the root-table to point to the context-entry tables
+ */
+ for (i = 0; i < 256; i++) {
+ ctx_paddr = vtophys(ctx_tables[i]);
+ if (ctx_paddr & PAGE_MASK)
+ panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
+
+ root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
+ }
+
+ return (0);
+}
+
+static void
+vtd_cleanup(void)
+{
+}
+
+static void
+vtd_enable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_wbflush(vtdmap);
+
+ /* Update the root table address */
+ vtdmap->rta = vtophys(root_table);
+ vtdmap->gcr = VTD_GCR_SRTP;
+ while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
+ ;
+
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+
+ vtd_translation_enable(vtdmap);
+ }
+}
+
+static void
+vtd_disable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_translation_disable(vtdmap);
+ }
+}
+
+static void
+vtd_add_device(void *arg, int bus, int slot, int func)
+{
+ int idx;
+ uint64_t *ctxp;
+ struct domain *dom = arg;
+ vm_paddr_t pt_paddr;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+ panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ vtdmap = vtdmaps[0];
+ ctxp = ctx_tables[bus];
+ pt_paddr = vtophys(dom->ptp);
+ idx = (slot << 3 | func) * 2;
+
+ if (ctxp[idx] & VTD_CTX_PRESENT) {
+ panic("vtd_add_device: device %d/%d/%d is already owned by "
+ "domain %d", bus, slot, func,
+ (uint16_t)(ctxp[idx + 1] >> 8));
+ }
+
+ /*
+ * Order is important. The 'present' bit is set only after all fields
+ * of the context pointer are initialized.
+ */
+ ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
+
+ if (VTD_ECAP_DI(vtdmap->ext_cap))
+ ctxp[idx] = VTD_CTX_TT_ALL;
+ else
+ ctxp[idx] = 0;
+
+ ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
+
+ /*
+ * 'Not Present' entries are not cached in either the Context Cache
+ * or the IOTLB, so there is no need to invalidate either of them.
+ */
+}
+
+static void
+vtd_remove_device(void *arg, int bus, int slot, int func)
+{
+ int i, idx;
+ uint64_t *ctxp;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+ panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ ctxp = ctx_tables[bus];
+ idx = (slot << 3 | func) * 2;
+
+ /*
+ * Order is important. The 'present' bit must be cleared first.
+ */
+ ctxp[idx] = 0;
+ ctxp[idx + 1] = 0;
+
+ /*
+ * Invalidate the Context Cache and the IOTLB.
+ *
+ * XXX use device-selective invalidation for Context Cache
+ * XXX use domain-selective invalidation for IOTLB
+ */
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+ }
+}
+
+#define CREATE_MAPPING 0
+#define REMOVE_MAPPING 1
+
+static uint64_t
+vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
+ int remove)
+{
+ struct domain *dom;
+ int i, spshift, ptpshift, ptpindex, nlevels;
+ uint64_t spsize, *ptp;
+
+ dom = arg;
+ ptpindex = 0;
+ ptpshift = 0;
+
+ if (gpa & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
+
+ if (hpa & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);
+
+ if (len & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned len 0x%0lx", len);
+
+ /*
+ * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - supported super page size
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'len'
+ */
+ spshift = 48;
+ for (i = 3; i >= 0; i--) {
+ spsize = 1UL << spshift;
+ if ((dom->spsmask & (1 << i)) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ (len >= spsize)) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ ptp = dom->ptp;
+ nlevels = dom->pt_levels;
+ while (--nlevels >= 0) {
+ ptpshift = 12 + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift) {
+ break;
+ }
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create a downstream page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
+ ptp[ptpindex] = vtophys(nlp) | VTD_PTE_RD | VTD_PTE_WR;
+ }
+
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0)
+ panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);
+
+ /*
+ * Update the 'gpa' -> 'hpa' mapping
+ */
+ if (remove) {
+ ptp[ptpindex] = 0;
+ } else {
+ ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;
+
+ if (nlevels > 0)
+ ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
+ }
+
+ return (1UL << ptpshift);
+}
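
A worked example of the mapping-size selection above: if dom->spsmask has bit 0 set (2MB superpages supported), gpa and hpa are both 2MB-aligned, and len >= 2MB, the loop exits at i = 0 with spshift = 21. The page-table walk then stops as soon as ptpshift drops to 21, one level above the 4KB leaves; the entry is written with VTD_PTE_SUPERPAGE (nlevels > 0 at that point), and the function reports 1UL << 21 = 2MB consumed, which the chunking loops in iommu_create_mapping() and iommu_remove_mapping() use to advance gpa and hpa.
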
+
+static uint64_t
+vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
+{
+
+ return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
+}
+
+static uint64_t
+vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
+{
+
+ return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
+}
+
+static void
+vtd_invalidate_tlb(void *dom)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ /*
+ * Invalidate the IOTLB.
+ * XXX use domain-selective invalidation for IOTLB
+ */
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_iotlb_global_invalidate(vtdmap);
+ }
+}
+
+static void *
+vtd_create_domain(vm_paddr_t maxaddr)
+{
+ struct domain *dom;
+ vm_paddr_t addr;
+ int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
+ struct vtdmap *vtdmap;
+
+ if (drhd_num <= 0)
+ panic("vtd_create_domain: no dma remapping hardware available");
+
+ vtdmap = vtdmaps[0];
+
+ /*
+ * Calculate AGAW.
+ * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
+ */
+ addr = 0;
+ for (gaw = 0; addr < maxaddr; gaw++)
+ addr = 1ULL << gaw;
+
+ res = (gaw - 12) % 9;
+ if (res == 0)
+ agaw = gaw;
+ else
+ agaw = gaw + 9 - res;
+
+ if (agaw > 64)
+ agaw = 64;
+
+ /*
+ * Select the smallest Supported AGAW and the corresponding number
+ * of page table levels.
+ */
+ pt_levels = 2;
+ sagaw = 30;
+ addrwidth = 0;
+ tmp = VTD_CAP_SAGAW(vtdmap->cap);
+ for (i = 0; i < 5; i++) {
+ if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
+ break;
+ pt_levels++;
+ addrwidth++;
+ sagaw += 9;
+ if (sagaw > 64)
+ sagaw = 64;
+ }
+
+ if (i >= 5) {
+ panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d",
+ VTD_CAP_SAGAW(vtdmap->cap), agaw);
+ }
+
+ dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
+ dom->pt_levels = pt_levels;
+ dom->addrwidth = addrwidth;
+ dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
+ dom->id = domain_id();
+ dom->maxaddr = maxaddr;
+ dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
+ if ((uintptr_t)dom->ptp & PAGE_MASK)
+ panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
+
+ SLIST_INSERT_HEAD(&domhead, dom, next);
+
+ return (dom);
+}
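
To make the arithmetic concrete: for a guest with maxaddr = 4GB, the loop above exits with gaw = 33 (the post-increment runs once more after addr reaches maxaddr), res = (33 - 12) % 9 = 3, and agaw = 33 + 9 - 3 = 39. Assuming the hardware advertises 3-level (39-bit) support in its SAGAW field, the scan then stops with sagaw = 39, pt_levels = 3, and addrwidth = 1, so a 4GB guest gets a 3-level page table.
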
+
+static void
+vtd_free_ptp(uint64_t *ptp, int level)
+{
+ int i;
+ uint64_t *nlp;
+
+ if (level > 1) {
+ for (i = 0; i < 512; i++) {
+ if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
+ continue;
+ if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
+ continue;
+ nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
+ vtd_free_ptp(nlp, level - 1);
+ }
+ }
+
+ bzero(ptp, PAGE_SIZE);
+ free(ptp, M_VTD);
+}
+
+static void
+vtd_destroy_domain(void *arg)
+{
+ struct domain *dom;
+
+ dom = arg;
+
+ SLIST_REMOVE(&domhead, dom, domain, next);
+ vtd_free_ptp(dom->ptp, dom->pt_levels);
+ free(dom, M_VTD);
+}
+
+struct iommu_ops iommu_ops_intel = {
+ vtd_init,
+ vtd_cleanup,
+ vtd_enable,
+ vtd_disable,
+ vtd_create_domain,
+ vtd_destroy_domain,
+ vtd_create_mapping,
+ vtd_remove_mapping,
+ vtd_add_device,
+ vtd_remove_device,
+ vtd_invalidate_tlb,
+};
diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c
new file mode 100644
index 0000000..c8447cc
--- /dev/null
+++ b/sys/amd64/vmm/io/iommu.c
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/md_var.h>
+
+#include "vmm_util.h"
+#include "vmm_mem.h"
+#include "iommu.h"
+
+static boolean_t iommu_avail;
+static struct iommu_ops *ops;
+static void *host_domain;
+
+static __inline int
+IOMMU_INIT(void)
+{
+ if (ops != NULL)
+ return ((*ops->init)());
+ else
+ return (ENXIO);
+}
+
+static __inline void
+IOMMU_CLEANUP(void)
+{
+ if (ops != NULL && iommu_avail)
+ (*ops->cleanup)();
+}
+
+static __inline void *
+IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->create_domain)(maxaddr));
+ else
+ return (NULL);
+}
+
+static __inline void
+IOMMU_DESTROY_DOMAIN(void *dom)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->destroy_domain)(dom);
+}
+
+static __inline uint64_t
+IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->create_mapping)(domain, gpa, hpa, len));
+ else
+ return (len); /* XXX */
+}
+
+static __inline uint64_t
+IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->remove_mapping)(domain, gpa, len));
+ else
+ return (len); /* XXX */
+}
+
+static __inline void
+IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->add_device)(domain, bus, slot, func);
+}
+
+static __inline void
+IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->remove_device)(domain, bus, slot, func);
+}
+
+static __inline void
+IOMMU_INVALIDATE_TLB(void *domain)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->invalidate_tlb)(domain);
+}
+
+static __inline void
+IOMMU_ENABLE(void)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->enable)();
+}
+
+static __inline void
+IOMMU_DISABLE(void)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->disable)();
+}
+
+void
+iommu_init(void)
+{
+ int error, bus, slot, func;
+ vm_paddr_t maxaddr;
+ const char *name;
+ device_t dev;
+
+ if (vmm_is_intel())
+ ops = &iommu_ops_intel;
+ else if (vmm_is_amd())
+ ops = &iommu_ops_amd;
+ else
+ ops = NULL;
+
+ error = IOMMU_INIT();
+ if (error)
+ return;
+
+ iommu_avail = TRUE;
+
+ /*
+ * Create a domain for the devices owned by the host
+ */
+ maxaddr = vmm_mem_maxaddr();
+ host_domain = IOMMU_CREATE_DOMAIN(maxaddr);
+ if (host_domain == NULL)
+ panic("iommu_init: unable to create a host domain");
+
+ /*
+ * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to
+ * the host
+ */
+ iommu_create_mapping(host_domain, 0, 0, maxaddr);
+
+ for (bus = 0; bus <= PCI_BUSMAX; bus++) {
+ for (slot = 0; slot <= PCI_SLOTMAX; slot++) {
+ for (func = 0; func <= PCI_FUNCMAX; func++) {
+ dev = pci_find_dbsf(0, bus, slot, func);
+ if (dev == NULL)
+ continue;
+
+ /* skip passthrough devices */
+ name = device_get_name(dev);
+ if (name != NULL && strcmp(name, "ppt") == 0)
+ continue;
+
+ /* everything else belongs to the host domain */
+ iommu_add_device(host_domain, bus, slot, func);
+ }
+ }
+ }
+ IOMMU_ENABLE();
+
+}
+
+void
+iommu_cleanup(void)
+{
+ IOMMU_DISABLE();
+ IOMMU_DESTROY_DOMAIN(host_domain);
+ IOMMU_CLEANUP();
+}
+
+void *
+iommu_create_domain(vm_paddr_t maxaddr)
+{
+
+ return (IOMMU_CREATE_DOMAIN(maxaddr));
+}
+
+void
+iommu_destroy_domain(void *dom)
+{
+
+ IOMMU_DESTROY_DOMAIN(dom);
+}
+
+void
+iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len)
+{
+ uint64_t mapped, remaining;
+
+ remaining = len;
+
+ while (remaining > 0) {
+ mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining);
+ gpa += mapped;
+ hpa += mapped;
+ remaining -= mapped;
+ }
+}
+
+void
+iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len)
+{
+ uint64_t unmapped, remaining;
+
+ remaining = len;
+
+ while (remaining > 0) {
+ unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining);
+ gpa += unmapped;
+ remaining -= unmapped;
+ }
+}
+
+void *
+iommu_host_domain(void)
+{
+
+ return (host_domain);
+}
+
+void
+iommu_add_device(void *dom, int bus, int slot, int func)
+{
+
+ IOMMU_ADD_DEVICE(dom, bus, slot, func);
+}
+
+void
+iommu_remove_device(void *dom, int bus, int slot, int func)
+{
+
+ IOMMU_REMOVE_DEVICE(dom, bus, slot, func);
+}
+
+void
+iommu_invalidate_tlb(void *domain)
+{
+
+ IOMMU_INVALIDATE_TLB(domain);
+}
diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h
new file mode 100644
index 0000000..d5c1d6e
--- /dev/null
+++ b/sys/amd64/vmm/io/iommu.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IO_IOMMU_H_
+#define _IO_IOMMU_H_
+
+typedef int (*iommu_init_func_t)(void);
+typedef void (*iommu_cleanup_func_t)(void);
+typedef void (*iommu_enable_func_t)(void);
+typedef void (*iommu_disable_func_t)(void);
+typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr);
+typedef void (*iommu_destroy_domain_t)(void *domain);
+typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa,
+ vm_paddr_t hpa, uint64_t len);
+typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa,
+ uint64_t len);
+typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func);
+typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func);
+typedef void (*iommu_invalidate_tlb_t)(void *dom);
+
+struct iommu_ops {
+ iommu_init_func_t init; /* module wide */
+ iommu_cleanup_func_t cleanup;
+ iommu_enable_func_t enable;
+ iommu_disable_func_t disable;
+
+ iommu_create_domain_t create_domain; /* domain-specific */
+ iommu_destroy_domain_t destroy_domain;
+ iommu_create_mapping_t create_mapping;
+ iommu_remove_mapping_t remove_mapping;
+ iommu_add_device_t add_device;
+ iommu_remove_device_t remove_device;
+ iommu_invalidate_tlb_t invalidate_tlb;
+};
+
+extern struct iommu_ops iommu_ops_intel;
+extern struct iommu_ops iommu_ops_amd;
+
+void iommu_init(void);
+void iommu_cleanup(void);
+void *iommu_host_domain(void);
+void *iommu_create_domain(vm_paddr_t maxaddr);
+void iommu_destroy_domain(void *dom);
+void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa,
+ size_t len);
+void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len);
+void iommu_add_device(void *dom, int bus, int slot, int func);
+void iommu_remove_device(void *dom, int bus, int slot, int func);
+void iommu_invalidate_tlb(void *domain);
+#endif
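
Putting the pieces together, a consumer of this interface (the vmm code) would go through roughly the following sequence to hand a PCI function to a guest. This is a hedged sketch; the 4GB bound, the 3/0/0 bus/slot/function triple, and hpa_base/len are illustrative values, not taken from the committed callers:

    void *dom;

    /* One set of translation tables per guest, sized to its address space. */
    dom = iommu_create_domain(4UL * 1024 * 1024 * 1024);

    /* Back guest-physical [0, len) with host-physical [hpa_base, hpa_base + len). */
    iommu_create_mapping(dom, 0, hpa_base, len);

    /* DMA from PCI device 3/0/0 now translates through this domain. */
    iommu_add_device(dom, 3, 0, 0);

    /* ... guest runs ... */

    iommu_remove_device(dom, 3, 0, 0);
    iommu_remove_mapping(dom, 0, len);
    iommu_destroy_domain(dom);

Note that iommu_create_mapping() loops internally, so a caller can pass an arbitrarily large region and let the backend pick the largest page size on each iteration.
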
diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c
new file mode 100644
index 0000000..fdf136b
--- /dev/null
+++ b/sys/amd64/vmm/io/ppt.c
@@ -0,0 +1,610 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/bus.h>
+#include <sys/pciio.h>
+#include <sys/rman.h>
+#include <sys/smp.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/resource.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+
+#include "iommu.h"
+#include "ppt.h"
+
+/* XXX locking */
+
+#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0]))
+#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1)
+#define MAX_MSIMSGS 32
+
+MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
+
+struct pptintr_arg { /* pptintr(pptintr_arg) */
+ struct pptdev *pptdev;
+ int vec;
+ int vcpu;
+};
+
+static struct pptdev {
+ device_t dev;
+ struct vm *vm; /* owner of this device */
+ struct vm_memory_segment mmio[MAX_MMIOSEGS];
+ struct {
+ int num_msgs; /* guest state */
+
+ int startrid; /* host state */
+ struct resource *res[MAX_MSIMSGS];
+ void *cookie[MAX_MSIMSGS];
+ struct pptintr_arg arg[MAX_MSIMSGS];
+ } msi;
+
+ struct {
+ int num_msgs;
+ int startrid;
+ int msix_table_rid;
+ struct resource *msix_table_res;
+ struct resource **res;
+ void **cookie;
+ struct pptintr_arg *arg;
+ } msix;
+} pptdevs[32];
+
+static int num_pptdevs;
+
+static int
+ppt_probe(device_t dev)
+{
+ int bus, slot, func;
+ struct pci_devinfo *dinfo;
+
+ dinfo = (struct pci_devinfo *)device_get_ivars(dev);
+
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+
+ /*
+ * To qualify as a pci passthrough device, a device must:
+ * - be allowed by administrator to be used in this role
+ * - be an endpoint device
+ */
+ if (vmm_is_pptdev(bus, slot, func) &&
+ (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL)
+ return (0);
+ else
+ return (ENXIO);
+}
+
+static int
+ppt_attach(device_t dev)
+{
+ int n;
+
+ if (num_pptdevs >= MAX_PPTDEVS) {
+ printf("ppt_attach: maximum number of pci passthrough devices "
+ "exceeded\n");
+ return (ENXIO);
+ }
+
+ n = num_pptdevs++;
+ pptdevs[n].dev = dev;
+
+ if (bootverbose)
+ device_printf(dev, "attached\n");
+
+ return (0);
+}
+
+static int
+ppt_detach(device_t dev)
+{
+ /*
+ * XXX check whether there are any pci passthrough devices assigned
+ * to guests before we allow this driver to detach.
+ */
+
+ return (0);
+}
+
+static device_method_t ppt_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, ppt_probe),
+ DEVMETHOD(device_attach, ppt_attach),
+ DEVMETHOD(device_detach, ppt_detach),
+ {0, 0}
+};
+
+static devclass_t ppt_devclass;
+DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0);
+DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
+
+static struct pptdev *
+ppt_find(int bus, int slot, int func)
+{
+ device_t dev;
+ int i, b, s, f;
+
+ for (i = 0; i < num_pptdevs; i++) {
+ dev = pptdevs[i].dev;
+ b = pci_get_bus(dev);
+ s = pci_get_slot(dev);
+ f = pci_get_function(dev);
+ if (bus == b && slot == s && func == f)
+ return (&pptdevs[i]);
+ }
+ return (NULL);
+}
+
+static void
+ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
+{
+ int i;
+ struct vm_memory_segment *seg;
+
+ for (i = 0; i < MAX_MMIOSEGS; i++) {
+ seg = &ppt->mmio[i];
+ if (seg->len == 0)
+ continue;
+ (void)vm_unmap_mmio(vm, seg->gpa, seg->len);
+ bzero(seg, sizeof(struct vm_memory_segment));
+ }
+}
+
+static void
+ppt_teardown_msi(struct pptdev *ppt)
+{
+ int i, rid;
+ void *cookie;
+ struct resource *res;
+
+ if (ppt->msi.num_msgs == 0)
+ return;
+
+ for (i = 0; i < ppt->msi.num_msgs; i++) {
+ rid = ppt->msi.startrid + i;
+ res = ppt->msi.res[i];
+ cookie = ppt->msi.cookie[i];
+
+ if (cookie != NULL)
+ bus_teardown_intr(ppt->dev, res, cookie);
+
+ if (res != NULL)
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
+
+ ppt->msi.res[i] = NULL;
+ ppt->msi.cookie[i] = NULL;
+ }
+
+ if (ppt->msi.startrid == 1)
+ pci_release_msi(ppt->dev);
+
+ ppt->msi.num_msgs = 0;
+}
+
+static void
+ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
+{
+ int rid;
+ struct resource *res;
+ void *cookie;
+
+ rid = ppt->msix.startrid + idx;
+ res = ppt->msix.res[idx];
+ cookie = ppt->msix.cookie[idx];
+
+ if (cookie != NULL)
+ bus_teardown_intr(ppt->dev, res, cookie);
+
+ if (res != NULL)
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
+
+ ppt->msix.res[idx] = NULL;
+ ppt->msix.cookie[idx] = NULL;
+}
+
+static void
+ppt_teardown_msix(struct pptdev *ppt)
+{
+ int i;
+
+ if (ppt->msix.num_msgs == 0)
+ return;
+
+ for (i = 0; i < ppt->msix.num_msgs; i++)
+ ppt_teardown_msix_intr(ppt, i);
+
+ if (ppt->msix.msix_table_res) {
+ bus_release_resource(ppt->dev, SYS_RES_MEMORY,
+ ppt->msix.msix_table_rid,
+ ppt->msix.msix_table_res);
+ ppt->msix.msix_table_res = NULL;
+ ppt->msix.msix_table_rid = 0;
+ }
+
+ free(ppt->msix.res, M_PPTMSIX);
+ free(ppt->msix.cookie, M_PPTMSIX);
+ free(ppt->msix.arg, M_PPTMSIX);
+
+ pci_release_msi(ppt->dev);
+
+ ppt->msix.num_msgs = 0;
+}
+
+int
+ppt_assign_device(struct vm *vm, int bus, int slot, int func)
+{
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ /*
+ * If this device is owned by a different VM then we
+ * cannot change its owner.
+ */
+ if (ppt->vm != NULL && ppt->vm != vm)
+ return (EBUSY);
+
+ ppt->vm = vm;
+ iommu_add_device(vm_iommu_domain(vm), bus, slot, func);
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
+{
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ /*
+ * If this device is not owned by this 'vm' then bail out.
+ */
+ if (ppt->vm != vm)
+ return (EBUSY);
+ ppt_unmap_mmio(vm, ppt);
+ ppt_teardown_msi(ppt);
+ ppt_teardown_msix(ppt);
+ iommu_remove_device(vm_iommu_domain(vm), bus, slot, func);
+ ppt->vm = NULL;
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+ppt_unassign_all(struct vm *vm)
+{
+ int i, bus, slot, func;
+ device_t dev;
+
+ for (i = 0; i < num_pptdevs; i++) {
+ if (pptdevs[i].vm == vm) {
+ dev = pptdevs[i].dev;
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+ ppt_unassign_device(vm, bus, slot, func);
+ }
+ }
+
+ return (0);
+}
+
+int
+ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ int i, error;
+ struct vm_memory_segment *seg;
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ if (ppt->vm != vm)
+ return (EBUSY);
+
+ for (i = 0; i < MAX_MMIOSEGS; i++) {
+ seg = &ppt->mmio[i];
+ if (seg->len == 0) {
+ error = vm_map_mmio(vm, gpa, len, hpa);
+ if (error == 0) {
+ seg->gpa = gpa;
+ seg->len = len;
+ }
+ return (error);
+ }
+ }
+ return (ENOSPC);
+ }
+ return (ENOENT);
+}
+
+static int
+pptintr(void *arg)
+{
+ int vec;
+ struct pptdev *ppt;
+ struct pptintr_arg *pptarg;
+
+ pptarg = arg;
+ ppt = pptarg->pptdev;
+ vec = pptarg->vec;
+
+ if (ppt->vm != NULL)
+ (void) lapic_set_intr(ppt->vm, pptarg->vcpu, vec);
+ else {
+ /*
+ * XXX
+ * This is not expected to happen - panic?
+ */
+ }
+
+ /*
+ * For legacy interrupts give other filters a chance in case
+ * the interrupt was not generated by the passthrough device.
+ */
+ if (ppt->msi.startrid == 0)
+ return (FILTER_STRAY);
+ else
+ return (FILTER_HANDLED);
+}
+
+/*
+ * XXX
+ * When we try to free the MSI resource the kernel will bind the thread to
+ * the host cpu that was originally handling the MSI. The function freeing the
+ * MSI vector (apic_free_vector()) will panic the kernel if the thread
+ * is already bound to a cpu.
+ *
+ * So, we temporarily unbind the vcpu thread before freeing the MSI resource.
+ */
+static void
+PPT_TEARDOWN_MSI(struct vm *vm, int vcpu, struct pptdev *ppt)
+{
+ int pincpu = -1;
+
+ vm_get_pinning(vm, vcpu, &pincpu);
+
+ if (pincpu >= 0)
+ vm_set_pinning(vm, vcpu, -1);
+
+ ppt_teardown_msi(ppt);
+
+ if (pincpu >= 0)
+ vm_set_pinning(vm, vcpu, pincpu);
+}
+
+int
+ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec)
+{
+ int i, rid, flags;
+ int msi_count, startrid, error, tmp;
+ struct pptdev *ppt;
+
+ if ((destcpu >= VM_MAXCPU || destcpu < 0) ||
+ (vector < 0 || vector > 255) ||
+ (numvec < 0 || numvec > MAX_MSIMSGS))
+ return (EINVAL);
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt == NULL)
+ return (ENOENT);
+ if (ppt->vm != vm) /* Make sure we own this device */
+ return (EBUSY);
+
+ /* Free any allocated resources */
+ PPT_TEARDOWN_MSI(vm, vcpu, ppt);
+
+ if (numvec == 0) /* nothing more to do */
+ return (0);
+
+ flags = RF_ACTIVE;
+ msi_count = pci_msi_count(ppt->dev);
+ if (msi_count == 0) {
+ startrid = 0; /* legacy interrupt */
+ msi_count = 1;
+ flags |= RF_SHAREABLE;
+ } else
+ startrid = 1; /* MSI */
+
+ /*
+ * The device must be capable of supporting the number of vectors
+ * the guest wants to allocate.
+ */
+ if (numvec > msi_count)
+ return (EINVAL);
+
+ /*
+ * Make sure that we can allocate all the MSI vectors that are needed
+ * by the guest.
+ */
+ if (startrid == 1) {
+ tmp = numvec;
+ error = pci_alloc_msi(ppt->dev, &tmp);
+ if (error)
+ return (error);
+ else if (tmp != numvec) {
+ pci_release_msi(ppt->dev);
+ return (ENOSPC);
+ } else {
+ /* success */
+ }
+ }
+
+ ppt->msi.startrid = startrid;
+
+ /*
+ * Allocate the irq resource and attach it to the interrupt handler.
+ */
+ for (i = 0; i < numvec; i++) {
+ ppt->msi.num_msgs = i + 1;
+ ppt->msi.cookie[i] = NULL;
+
+ rid = startrid + i;
+ ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
+ &rid, flags);
+ if (ppt->msi.res[i] == NULL)
+ break;
+
+ ppt->msi.arg[i].pptdev = ppt;
+ ppt->msi.arg[i].vec = vector + i;
+ ppt->msi.arg[i].vcpu = destcpu;
+
+ error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
+ INTR_TYPE_NET | INTR_MPSAFE,
+ pptintr, NULL, &ppt->msi.arg[i],
+ &ppt->msi.cookie[i]);
+ if (error != 0)
+ break;
+ }
+
+ if (i < numvec) {
+ PPT_TEARDOWN_MSI(vm, vcpu, ppt);
+ return (ENXIO);
+ }
+
+ return (0);
+}
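
The startrid values above follow FreeBSD's resource-ID convention for SYS_RES_IRQ: rid 0 is the device's legacy INTx line, while pci_alloc_msi() places MSI messages at rids 1 and up. That convention is also why pptintr() returns FILTER_STRAY when startrid is 0 - a shared INTx line may have been raised by some other device - but FILTER_HANDLED for MSI, which is never shared.
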
+
+int
+ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
+{
+ struct pptdev *ppt;
+ struct pci_devinfo *dinfo;
+ int numvec, alloced, rid, error;
+ size_t res_size, cookie_size, arg_size;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt == NULL)
+ return (ENOENT);
+ if (ppt->vm != vm) /* Make sure we own this device */
+ return (EBUSY);
+
+ dinfo = device_get_ivars(ppt->dev);
+ if (!dinfo)
+ return (ENXIO);
+
+ /*
+ * First-time configuration:
+ * Allocate the MSI-X table
+ * Allocate the IRQ resources
+ * Set up some variables in ppt->msix
+ */
+ if (ppt->msix.num_msgs == 0) {
+ numvec = pci_msix_count(ppt->dev);
+ if (numvec <= 0)
+ return (EINVAL);
+
+ ppt->msix.startrid = 1;
+ ppt->msix.num_msgs = numvec;
+
+ res_size = numvec * sizeof(ppt->msix.res[0]);
+ cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
+ arg_size = numvec * sizeof(ppt->msix.arg[0]);
+
+ ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
+ ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
+ M_WAITOK | M_ZERO);
+ ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
+
+ rid = dinfo->cfg.msix.msix_table_bar;
+ ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
+ SYS_RES_MEMORY, &rid, RF_ACTIVE);
+
+ if (ppt->msix.msix_table_res == NULL) {
+ ppt_teardown_msix(ppt);
+ return (ENOSPC);
+ }
+ ppt->msix.msix_table_rid = rid;
+
+ alloced = numvec;
+ error = pci_alloc_msix(ppt->dev, &alloced);
+ if (error || alloced != numvec) {
+ ppt_teardown_msix(ppt);
+ return (error == 0 ? ENOSPC : error);
+ }
+ }
+
+ if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+ /* Tear down the IRQ if it's already set up */
+ ppt_teardown_msix_intr(ppt, idx);
+
+ /* Allocate the IRQ resource */
+ ppt->msix.cookie[idx] = NULL;
+ rid = ppt->msix.startrid + idx;
+ ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
+ &rid, RF_ACTIVE);
+ if (ppt->msix.res[idx] == NULL)
+ return (ENXIO);
+
+ ppt->msix.arg[idx].pptdev = ppt;
+ ppt->msix.arg[idx].vec = msg;
+ ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF;
+
+ /* Setup the MSI-X interrupt */
+ error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
+ INTR_TYPE_NET | INTR_MPSAFE,
+ pptintr, NULL, &ppt->msix.arg[idx],
+ &ppt->msix.cookie[idx]);
+
+ if (error != 0) {
+ bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]);
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
+ ppt->msix.cookie[idx] = NULL;
+ ppt->msix.res[idx] = NULL;
+ return (ENXIO);
+ }
+ } else {
+ /* Masked, tear it down if it's already been set up */
+ ppt_teardown_msix_intr(ppt, idx);
+ }
+
+ return (0);
+}
+
diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h
new file mode 100644
index 0000000..63c8228
--- /dev/null
+++ b/sys/amd64/vmm/io/ppt.h
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IO_PPT_H_
+#define _IO_PPT_H_
+
+int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
+int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
+int ppt_unassign_all(struct vm *vm);
+int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec);
+int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
+#endif
diff --git a/sys/amd64/vmm/io/vdev.c b/sys/amd64/vmm/io/vdev.c
new file mode 100644
index 0000000..cd6c5d1
--- /dev/null
+++ b/sys/amd64/vmm/io/vdev.c
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include "vdev.h"
+
+struct vdev {
+ SLIST_ENTRY(vdev) entry;
+ struct vdev_ops *ops;
+ void *dev;
+};
+static SLIST_HEAD(, vdev) vdev_head;
+static int vdev_count;
+
+struct vdev_region {
+ SLIST_ENTRY(vdev_region) entry;
+ struct vdev_ops *ops;
+ void *dev;
+ struct io_region *io;
+};
+static SLIST_HEAD(, vdev_region) region_head;
+static int region_count;
+
+static MALLOC_DEFINE(M_VDEV, "vdev", "vdev");
+
+#define VDEV_INIT (0)
+#define VDEV_RESET (1)
+#define VDEV_HALT (2)
+
+// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"};
+
+static int
+vdev_system_event(int event)
+{
+ struct vdev *vd;
+ int rc = 0;
+
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name);
+ switch (event) {
+ case VDEV_INIT:
+ rc = vd->ops->init(vd->dev);
+ break;
+ case VDEV_RESET:
+ rc = vd->ops->reset(vd->dev);
+ break;
+ case VDEV_HALT:
+ rc = vd->ops->halt(vd->dev);
+ break;
+ default:
+ break;
+ }
+ if (rc) {
+ printf("vdev %s init failed rc=%d\n",
+ vd->ops->name, rc);
+ return rc;
+ }
+ }
+ return 0;
+}
+
+int
+vdev_init(void)
+{
+ return vdev_system_event(VDEV_INIT);
+}
+
+int
+vdev_reset(void)
+{
+ return vdev_system_event(VDEV_RESET);
+}
+
+int
+vdev_halt(void)
+{
+ return vdev_system_event(VDEV_HALT);
+}
+
+void
+vdev_vm_init(void)
+{
+ SLIST_INIT(&vdev_head);
+ vdev_count = 0;
+
+ SLIST_INIT(&region_head);
+ region_count = 0;
+}
+void
+vdev_vm_cleanup(void)
+{
+ struct vdev *vd;
+
+ // TODO: locking
+ while (!SLIST_EMPTY(&vdev_head)) {
+ vd = SLIST_FIRST(&vdev_head);
+ SLIST_REMOVE_HEAD(&vdev_head, entry);
+ free(vd, M_VDEV);
+ vdev_count--;
+ }
+}
+
+int
+vdev_register(struct vdev_ops *ops, void *dev)
+{
+ struct vdev *vd;
+ vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO);
+ vd->ops = ops;
+ vd->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&vdev_head, vd, entry);
+ vdev_count++;
+ return 0;
+}
+
+void
+vdev_unregister(void *dev)
+{
+ struct vdev *vd, *found;
+
+ found = NULL;
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ if (vd->dev == dev) {
+ found = vd;
+ }
+ }
+
+ if (found) {
+ SLIST_REMOVE(&vdev_head, found, vdev, entry);
+ free(found, M_VDEV);
+ }
+}
+
+#define IN_RANGE(val, start, end) \
+ (((val) >= (start)) && ((val) < (end)))
+
+static struct vdev_region*
+vdev_find_region(struct io_region *io, void *dev)
+{
+ struct vdev_region *region, *found;
+ uint64_t region_base;
+ uint64_t region_end;
+
+ found = NULL;
+
+ // TODO: locking
+ // FIXME: we should verify we are in the context of the current
+ // vcpu here as well.
+ SLIST_FOREACH(region, &region_head, entry) {
+ region_base = region->io->base;
+ region_end = region_base + region->io->len;
+ if (IN_RANGE(io->base, region_base, region_end) &&
+ IN_RANGE(io->base+io->len, region_base, region_end+1) &&
+ (dev && dev == region->dev)) {
+ found = region;
+ break;
+ }
+ }
+ return found;
+}
+
+int
+vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+ if (region) {
+ return -EEXIST;
+ }
+
+ region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO);
+ region->io = io;
+ region->ops = ops;
+ region->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&region_head, region, entry);
+ region_count++;
+
+ return 0;
+}
+
+void
+vdev_unregister_region(void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+
+ if (region) {
+ SLIST_REMOVE(&region_head, region, vdev_region, entry);
+ free(region, M_VDEV);
+ region_count--;
+ }
+}
+
+static int
+vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read)
+{
+ struct vdev_region *region;
+ struct io_region io;
+ region_attr_t attr;
+ int rc;
+
+ io.base = gpa;
+ io.len = size;
+
+ region = vdev_find_region(&io, NULL);
+ if (!region)
+ return -EINVAL;
+
+ attr = (read) ? MMIO_READ : MMIO_WRITE;
+ if (!(region->io->attr & attr))
+ return -EPERM;
+
+ if (read)
+ rc = region->ops->memread(region->dev, gpa, size, data);
+ else
+ rc = region->ops->memwrite(region->dev, gpa, size, *data);
+
+ return rc;
+}
+
+int
+vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data)
+{
+ return vdev_memrw(gpa, size, data, 1);
+}
+
+int
+vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data)
+{
+ return vdev_memrw(gpa, size, &data, 0);
+}
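
To make the registration flow concrete, here is a hedged sketch of a trivial device built on this interface; the name, guest-physical address, and behavior are invented for illustration:

    /* A read-only page of zeroes at an arbitrary guest-physical address. */
    static char nulldev_softc;              /* stand-in device state */

    static int
    nulldev_nop(void *dev)
    {
            return (0);                     /* nothing to init/reset/halt */
    }

    static int
    nulldev_memread(void *dev, uint64_t gpa, opsize_t size, uint64_t *data)
    {
            *data = 0;                      /* reads always see zero */
            return (0);
    }

    static struct vdev_ops nulldev_ops = {
            .name     = "nulldev",
            .init     = nulldev_nop,
            .reset    = nulldev_nop,
            .halt     = nulldev_nop,
            .memread  = nulldev_memread,
            .memwrite = NULL,               /* never called: region lacks MMIO_WRITE */
    };

    static struct io_region nulldev_region = {
            .base = 0xD0000000,             /* example GPA */
            .len  = PAGE_SIZE,
            .attr = MMIO_READ,
    };

    vdev_register(&nulldev_ops, &nulldev_softc);
    vdev_register_region(&nulldev_ops, &nulldev_softc, &nulldev_region);

A guest load that hits the region ends up in vdev_memread(); a store is rejected with -EPERM by the attr check in vdev_memrw() before the NULL memwrite hook could ever be reached.
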
diff --git a/sys/amd64/vmm/io/vdev.h b/sys/amd64/vmm/io/vdev.h
new file mode 100644
index 0000000..6feeba8
--- /dev/null
+++ b/sys/amd64/vmm/io/vdev.h
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VDEV_H_
+#define _VDEV_H_
+
+typedef enum {
+ BYTE = 1,
+ WORD = 2,
+ DWORD = 4,
+ QWORD = 8,
+} opsize_t;
+
+typedef enum {
+ MMIO_READ = 1,
+ MMIO_WRITE = 2,
+} region_attr_t;
+
+struct io_region {
+ uint64_t base;
+ uint64_t len;
+ region_attr_t attr;
+ int vcpu;
+};
+
+typedef int (*vdev_init_t)(void* dev);
+typedef int (*vdev_reset_t)(void* dev);
+typedef int (*vdev_halt_t)(void* dev);
+typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data);
+typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data);
+
+
+struct vdev_ops {
+ const char *name;
+ vdev_init_t init;
+ vdev_reset_t reset;
+ vdev_halt_t halt;
+ vdev_memread_t memread;
+ vdev_memwrite_t memwrite;
+};
+
+
+void vdev_vm_init(void);
+void vdev_vm_cleanup(void);
+
+int vdev_register(struct vdev_ops *ops, void *dev);
+void vdev_unregister(void *dev);
+
+int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io);
+void vdev_unregister_region(void *dev, struct io_region *io);
+
+int vdev_init(void);
+int vdev_reset(void);
+int vdev_halt(void);
+int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data);
+int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data);
+
+#endif /* _VDEV_H_ */
+
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
new file mode 100644
index 0000000..15fc6c2
--- /dev/null
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -0,0 +1,901 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <machine/clock.h>
+#include <x86/specialreg.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+#include "vdev.h"
+#include "vlapic.h"
+
+#define VLAPIC_CTR0(vlapic, format) \
+ VMM_CTR0((vlapic)->vm, (vlapic)->vcpuid, format)
+
+#define VLAPIC_CTR1(vlapic, format, p1) \
+ VMM_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1)
+
+#define VLAPIC_CTR_IRR(vlapic, msg) \
+do { \
+ uint32_t *irrptr = &(vlapic)->apic.irr0; \
+ irrptr[0] = irrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \
+} while (0)
+
+#define VLAPIC_CTR_ISR(vlapic, msg) \
+do { \
+ uint32_t *isrptr = &(vlapic)->apic.isr0; \
+ isrptr[0] = isrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \
+} while (0)
+
+static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
+
+#define PRIO(x) ((x) >> 4)
+
+#define VLAPIC_VERSION (16)
+#define VLAPIC_MAXLVT_ENTRIES (5)
+
+#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)
+
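+/*
+ * The LAPIC register file places each 32-bit IRR/ISR/TMR word on a 16-byte
+ * boundary, so when the registers are walked as an array of uint32_t the
+ * word for vector 'v' sits at index (v / 32) * 4 and the bit within it is
+ * (v % 32). For example, vector 0x30 (48) lives in word index 4
+ * (irr1/isr1), bit 16. This is why the indexing throughout this file
+ * multiplies by 4.
+ */
+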
+enum boot_state {
+ BS_INIT,
+ BS_SIPI,
+ BS_RUNNING
+};
+
+struct vlapic {
+ struct vm *vm;
+ int vcpuid;
+
+ struct io_region *mmio;
+ struct vdev_ops *ops;
+ struct LAPIC apic;
+
+ int esr_update;
+
+ int divisor;
+ int ccr_ticks;
+
+ /*
+ * The 'isrvec_stk' is a stack of vectors injected by the local apic.
+ * A vector is popped from the stack when the processor does an EOI.
+ * The vector on the top of the stack is used to compute the
+ * Processor Priority in conjunction with the TPR.
+ */
+ uint8_t isrvec_stk[ISRVEC_STK_SIZE];
+ int isrvec_stk_top;
+
+ uint64_t msr_apicbase;
+ enum boot_state boot_state;
+};
+
+#define VLAPIC_BUS_FREQ tsc_freq
+
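+/*
+ * The virtual APIC timer is clocked at the host TSC frequency divided by
+ * the value programmed into the DCR. The divide value is encoded in DCR
+ * bits 0, 1 and 3 (hence the 0xB mask below); for example, a DCR of 0x0
+ * selects divide-by-2 and 0xA selects divide-by-128.
+ */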
+static int
+vlapic_timer_divisor(uint32_t dcr)
+{
+ switch (dcr & 0xB) {
+ case APIC_TDCR_2:
+ return (2);
+ case APIC_TDCR_4:
+ return (4);
+ case APIC_TDCR_8:
+ return (8);
+ case APIC_TDCR_16:
+ return (16);
+ case APIC_TDCR_32:
+ return (32);
+ case APIC_TDCR_64:
+ return (64);
+ case APIC_TDCR_128:
+ return (128);
+	case APIC_TDCR_1:
+		return (1);
+	default:
+ panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
+ }
+}
+
+static void
+vlapic_mask_lvts(uint32_t *lvts, int num_lvt)
+{
+ int i;
+ for (i = 0; i < num_lvt; i++) {
+ *lvts |= APIC_LVT_M;
+ lvts += 4;
+ }
+}
+
+#if 0
+static inline void
+vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
+{
+ printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
+ *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
+ *lvt & APIC_LVTT_M);
+}
+#endif
+
+static uint64_t
+vlapic_get_ccr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ return lapic->ccr_timer;
+}
+
+static void
+vlapic_update_errors(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ lapic->esr = 0; // XXX
+}
+
+static void
+vlapic_init_ipi(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ lapic->version = VLAPIC_VERSION;
+	lapic->version |= (VLAPIC_MAXLVT_ENTRIES << MAXLVTSHIFT);
+	lapic->dfr = 0xffffffff;
+	lapic->svr = APIC_SVR_VECTOR;
+	vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES + 1);
+}
+
+static int
+vlapic_op_reset(void *dev)
+{
+	struct vlapic *vlapic = (struct vlapic *)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+
+ memset(lapic, 0, sizeof(*lapic));
+ lapic->apr = vlapic->vcpuid;
+ vlapic_init_ipi(vlapic);
+ vlapic->divisor = vlapic_timer_divisor(lapic->dcr_timer);
+
+ if (vlapic->vcpuid == 0)
+ vlapic->boot_state = BS_RUNNING; /* BSP */
+ else
+ vlapic->boot_state = BS_INIT; /* AP */
+
+	return (0);
+}
+
+static int
+vlapic_op_init(void *dev)
+{
+	struct vlapic *vlapic = (struct vlapic *)dev;
+ vdev_register_region(vlapic->ops, vlapic, vlapic->mmio);
+ return vlapic_op_reset(dev);
+}
+
+static int
+vlapic_op_halt(void *dev)
+{
+	struct vlapic *vlapic = (struct vlapic *)dev;
+	vdev_unregister_region(vlapic, vlapic->mmio);
+	return (0);
+}
+
+void
+vlapic_set_intr_ready(struct vlapic *vlapic, int vector)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *irrptr;
+ int idx;
+
+ if (vector < 0 || vector >= 256)
+		panic("vlapic_set_intr_ready: invalid vector %d", vector);
+
+ idx = (vector / 32) * 4;
+ irrptr = &lapic->irr0;
+ atomic_set_int(&irrptr[idx], 1 << (vector % 32));
+ VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
+}
+
+static void
+vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed)
+{
+ uint32_t icr_timer;
+
+ icr_timer = vlapic->apic.icr_timer;
+
+ vlapic->ccr_ticks = ticks;
+ if (elapsed < icr_timer)
+ vlapic->apic.ccr_timer = icr_timer - elapsed;
+ else {
+ /*
+ * This can happen when the guest is trying to run its local
+		 * apic timer faster than the 'hz' setting on the host.
+ *
+ * We deal with this by running the guest local apic timer
+ * at the rate of the host's 'hz' setting.
+ */
+ vlapic->apic.ccr_timer = 0;
+ }
+}
+
+static __inline uint32_t *
+vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ int i;
+
+ if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) {
+		panic("vlapic_get_lvt: invalid LVT");
+ }
+ i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
+	return ((&lapic->lvt_timer) + i);
+}
+
+#if 1
+static void
+dump_isrvec_stk(struct vlapic *vlapic)
+{
+ int i;
+ uint32_t *isrptr;
+
+ isrptr = &vlapic->apic.isr0;
+ for (i = 0; i < 8; i++)
+ printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);
+
+ for (i = 0; i <= vlapic->isrvec_stk_top; i++)
+ printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
+}
+#endif
+
+/*
+ * Algorithm adapted from the section "Interrupt, Task and Processor Priority"
+ * in Intel Architecture Manual Vol 3a.
+ */
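+/*
+ * Illustrative example: with TPR = 0x45 and highest in-service vector 0x51,
+ * PRIO(0x45) = 4 < PRIO(0x51) = 5, so PPR becomes 0x51 & 0xf0 = 0x50.
+ * Raising TPR to 0x55 would instead yield PPR = TPR = 0x55.
+ */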
+static void
+vlapic_update_ppr(struct vlapic *vlapic)
+{
+ int isrvec, tpr, ppr;
+
+ /*
+ * Note that the value on the stack at index 0 is always 0.
+ *
+ * This is a placeholder for the value of ISRV when none of the
+ * bits is set in the ISRx registers.
+ */
+ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
+ tpr = vlapic->apic.tpr;
+
+#if 1
+ {
+ int i, lastprio, curprio, vector, idx;
+ uint32_t *isrptr;
+
+ if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
+ panic("isrvec_stk is corrupted: %d", isrvec);
+
+ /*
+ * Make sure that the priority of the nested interrupts is
+ * always increasing.
+ */
+ lastprio = -1;
+ for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
+ curprio = PRIO(vlapic->isrvec_stk[i]);
+ if (curprio <= lastprio) {
+ dump_isrvec_stk(vlapic);
+ panic("isrvec_stk does not satisfy invariant");
+ }
+ lastprio = curprio;
+ }
+
+ /*
+ * Make sure that each bit set in the ISRx registers has a
+ * corresponding entry on the isrvec stack.
+ */
+ i = 1;
+ isrptr = &vlapic->apic.isr0;
+ for (vector = 0; vector < 256; vector++) {
+ idx = (vector / 32) * 4;
+ if (isrptr[idx] & (1 << (vector % 32))) {
+ if (i > vlapic->isrvec_stk_top ||
+ vlapic->isrvec_stk[i] != vector) {
+ dump_isrvec_stk(vlapic);
+ panic("ISR and isrvec_stk out of sync");
+ }
+ i++;
+ }
+ }
+ }
+#endif
+
+ if (PRIO(tpr) >= PRIO(isrvec))
+ ppr = tpr;
+ else
+ ppr = isrvec & 0xf0;
+
+ vlapic->apic.ppr = ppr;
+ VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
+}
+
+static void
+vlapic_process_eoi(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *isrptr;
+ int i, idx, bitpos;
+
+ isrptr = &lapic->isr0;
+
+ /*
+	 * The x86 architecture reserves the first 32 vectors for use
+ * by the processor.
+ */
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ bitpos = fls(isrptr[idx]);
+ if (bitpos != 0) {
+ if (vlapic->isrvec_stk_top <= 0) {
+ panic("invalid vlapic isrvec_stk_top %d",
+ vlapic->isrvec_stk_top);
+ }
+ isrptr[idx] &= ~(1 << (bitpos - 1));
+ VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
+ vlapic->isrvec_stk_top--;
+ vlapic_update_ppr(vlapic);
+ return;
+ }
+ }
+}
+
+static __inline int
+vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask)
+{
+ return (*lvt & mask);
+}
+
+static __inline int
+vlapic_periodic_timer(struct vlapic *vlapic)
+{
+ uint32_t *lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
+}
+
+static void
+vlapic_fire_timer(struct vlapic *vlapic)
+{
+ int vector;
+ uint32_t *lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) {
+		vector = vlapic_get_lvt_field(lvt, APIC_LVTT_VECTOR);
+ vlapic_set_intr_ready(vlapic, vector);
+ }
+}
+
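+/*
+ * ICR layout refresher: the vector is in bits 7:0, the delivery mode in
+ * bits 10:8, the destination shorthand in bits 19:18 and the destination
+ * field in bits 63:56 (xAPIC) or 63:32 (x2APIC), which is why 'dest' is
+ * extracted below with a shift of 32 + 24 or 32. For example, the xAPIC
+ * value 0x0300000000000061 requests fixed delivery of vector 0x61 to
+ * physical destination 3.
+ */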
+static int
+lapic_process_icr(struct vlapic *vlapic, uint64_t icrval)
+{
+ int i;
+ cpuset_t dmask;
+ uint32_t dest, vec, mode;
+ struct vlapic *vlapic2;
+ struct vm_exit *vmexit;
+
+ if (x2apic(vlapic))
+ dest = icrval >> 32;
+ else
+ dest = icrval >> (32 + 24);
+ vec = icrval & APIC_VECTOR_MASK;
+ mode = icrval & APIC_DELMODE_MASK;
+
+ if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
+ switch (icrval & APIC_DEST_MASK) {
+ case APIC_DEST_DESTFLD:
+ CPU_SETOF(dest, &dmask);
+ break;
+ case APIC_DEST_SELF:
+ CPU_SETOF(vlapic->vcpuid, &dmask);
+ break;
+ case APIC_DEST_ALLISELF:
+ dmask = vm_active_cpus(vlapic->vm);
+ break;
+ case APIC_DEST_ALLESELF:
+ dmask = vm_active_cpus(vlapic->vm);
+ CPU_CLR(vlapic->vcpuid, &dmask);
+ break;
+ }
+
+ while ((i = cpusetobj_ffs(&dmask)) != 0) {
+ i--;
+ CPU_CLR(i, &dmask);
+ if (mode == APIC_DELMODE_FIXED)
+ lapic_set_intr(vlapic->vm, i, vec);
+ else
+ vm_inject_nmi(vlapic->vm, i);
+ }
+
+ return (0); /* handled completely in the kernel */
+ }
+
+ if (mode == APIC_DELMODE_INIT) {
+ if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT)
+ return (0);
+
+ if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+ vlapic2 = vm_lapic(vlapic->vm, dest);
+
+ /* move from INIT to waiting-for-SIPI state */
+ if (vlapic2->boot_state == BS_INIT) {
+ vlapic2->boot_state = BS_SIPI;
+ }
+
+ return (0);
+ }
+ }
+
+ if (mode == APIC_DELMODE_STARTUP) {
+ if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+ vlapic2 = vm_lapic(vlapic->vm, dest);
+
+ /*
+ * Ignore SIPIs in any state other than wait-for-SIPI
+ */
+ if (vlapic2->boot_state != BS_SIPI)
+ return (0);
+
+ vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
+ vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
+ vmexit->u.spinup_ap.vcpu = dest;
+ vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
+
+ /*
+ * XXX this assumes that the startup IPI always succeeds
+ */
+ vlapic2->boot_state = BS_RUNNING;
+ vm_activate_cpu(vlapic2->vm, dest);
+
+ return (0);
+ }
+ }
+
+ /*
+ * This will cause a return to userland.
+ */
+ return (1);
+}
+
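+/*
+ * Scan the IRR from the highest priority class downwards and report the
+ * first pending vector whose class beats the current PPR. For example,
+ * with PPR = 0x50 a pending vector 0x61 is deliverable (class 6 > 5)
+ * while vector 0x52 is not (class 5 is not > 5).
+ */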
+int
+vlapic_pending_intr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ int idx, i, bitpos, vector;
+ uint32_t *irrptr, val;
+
+ irrptr = &lapic->irr0;
+
+ /*
+	 * The x86 architecture reserves the first 32 vectors for use
+ * by the processor.
+ */
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ val = atomic_load_acq_int(&irrptr[idx]);
+ bitpos = fls(val);
+ if (bitpos != 0) {
+ vector = i * 32 + (bitpos - 1);
+ if (PRIO(vector) > PRIO(lapic->ppr)) {
+ VLAPIC_CTR1(vlapic, "pending intr %d", vector);
+ return (vector);
+ } else
+ break;
+ }
+ }
+ VLAPIC_CTR0(vlapic, "no pending intr");
+ return (-1);
+}
+
+void
+vlapic_intr_accepted(struct vlapic *vlapic, int vector)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *irrptr, *isrptr;
+ int idx, stk_top;
+
+ /*
+ * clear the ready bit for vector being accepted in irr
+ * and set the vector as in service in isr.
+ */
+ idx = (vector / 32) * 4;
+
+ irrptr = &lapic->irr0;
+ atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
+ VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");
+
+ isrptr = &lapic->isr0;
+ isrptr[idx] |= 1 << (vector % 32);
+ VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");
+
+ /*
+ * Update the PPR
+ */
+ vlapic->isrvec_stk_top++;
+
+ stk_top = vlapic->isrvec_stk_top;
+ if (stk_top >= ISRVEC_STK_SIZE)
+ panic("isrvec_stk_top overflow %d", stk_top);
+
+ vlapic->isrvec_stk[stk_top] = vector;
+ vlapic_update_ppr(vlapic);
+}
+
+int
+vlapic_op_mem_read(void *dev, uint64_t gpa, opsize_t size, uint64_t *data)
+{
+	struct vlapic *vlapic = (struct vlapic *)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+ uint64_t offset = gpa & ~(PAGE_SIZE);
+ uint32_t *reg;
+ int i;
+
+ if (offset > sizeof(*lapic)) {
+ *data = 0;
+ return 0;
+ }
+
+ offset &= ~3;
+	switch (offset) {
+ case APIC_OFFSET_ID:
+ if (x2apic(vlapic))
+ *data = vlapic->vcpuid;
+ else
+ *data = vlapic->vcpuid << 24;
+ break;
+ case APIC_OFFSET_VER:
+ *data = lapic->version;
+ break;
+ case APIC_OFFSET_TPR:
+ *data = lapic->tpr;
+ break;
+ case APIC_OFFSET_APR:
+ *data = lapic->apr;
+ break;
+ case APIC_OFFSET_PPR:
+ *data = lapic->ppr;
+ break;
+ case APIC_OFFSET_EOI:
+ *data = lapic->eoi;
+ break;
+ case APIC_OFFSET_LDR:
+ *data = lapic->ldr;
+ break;
+ case APIC_OFFSET_DFR:
+ *data = lapic->dfr;
+ break;
+ case APIC_OFFSET_SVR:
+ *data = lapic->svr;
+ break;
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ i = (offset - APIC_OFFSET_ISR0) >> 2;
+ reg = &lapic->isr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ i = (offset - APIC_OFFSET_TMR0) >> 2;
+ reg = &lapic->tmr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ i = (offset - APIC_OFFSET_IRR0) >> 2;
+ reg = &lapic->irr0;
+ *data = atomic_load_acq_int(reg + i);
+ break;
+ case APIC_OFFSET_ESR:
+ *data = lapic->esr;
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ *data = lapic->icr_lo;
+ break;
+ case APIC_OFFSET_ICR_HI:
+ *data = lapic->icr_hi;
+ break;
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ reg = vlapic_get_lvt(vlapic, offset);
+ *data = *(reg);
+ break;
+ case APIC_OFFSET_ICR:
+ *data = lapic->icr_timer;
+ break;
+ case APIC_OFFSET_CCR:
+ *data = vlapic_get_ccr(vlapic);
+ break;
+ case APIC_OFFSET_DCR:
+ *data = lapic->dcr_timer;
+ break;
+ case APIC_OFFSET_RRR:
+ default:
+ *data = 0;
+ break;
+ }
+ return 0;
+}
+
+int
+vlapic_op_mem_write(void *dev, uint64_t gpa, opsize_t size, uint64_t data)
+{
+	struct vlapic *vlapic = (struct vlapic *)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+ uint64_t offset = gpa & ~(PAGE_SIZE);
+ uint32_t *reg;
+ int retval;
+
+ if (offset > sizeof(*lapic)) {
+ return 0;
+ }
+
+ retval = 0;
+ offset &= ~3;
+	switch (offset) {
+ case APIC_OFFSET_ID:
+ break;
+ case APIC_OFFSET_TPR:
+ lapic->tpr = data & 0xff;
+ vlapic_update_ppr(vlapic);
+ break;
+ case APIC_OFFSET_EOI:
+ vlapic_process_eoi(vlapic);
+ break;
+ case APIC_OFFSET_LDR:
+ break;
+ case APIC_OFFSET_DFR:
+ break;
+ case APIC_OFFSET_SVR:
+ lapic->svr = data;
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ if (!x2apic(vlapic)) {
+ data &= 0xffffffff;
+ data |= (uint64_t)lapic->icr_hi << 32;
+ }
+ retval = lapic_process_icr(vlapic, data);
+ break;
+ case APIC_OFFSET_ICR_HI:
+ if (!x2apic(vlapic)) {
+ retval = 0;
+ lapic->icr_hi = data;
+ }
+ break;
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ reg = vlapic_get_lvt(vlapic, offset);
+ if (!(lapic->svr & APIC_SVR_ENABLE)) {
+ data |= APIC_LVT_M;
+ }
+ *reg = data;
+ // vlapic_dump_lvt(offset, reg);
+ break;
+ case APIC_OFFSET_ICR:
+ lapic->icr_timer = data;
+ vlapic_start_timer(vlapic, 0);
+ break;
+
+ case APIC_OFFSET_DCR:
+ lapic->dcr_timer = data;
+ vlapic->divisor = vlapic_timer_divisor(data);
+ break;
+
+ case APIC_OFFSET_ESR:
+ vlapic_update_errors(vlapic);
+ break;
+ case APIC_OFFSET_VER:
+ case APIC_OFFSET_APR:
+ case APIC_OFFSET_PPR:
+ case APIC_OFFSET_RRR:
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ case APIC_OFFSET_CCR:
+ default:
+ // Read only.
+ break;
+ }
+
+ return (retval);
+}
+
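+/*
+ * Emulate the CCR countdown at host-tick granularity. Each elapsed host
+ * tick consumes (VLAPIC_BUS_FREQ / divisor) / hz timer counts; purely for
+ * illustration, a 2 GHz TSC with divisor 4 and hz = 100 burns 5,000,000
+ * counts per tick. Returns the number of host ticks until the timer next
+ * fires, 0 if it fired during this call, or -1 if the timer is disabled
+ * or a one-shot timer has already expired.
+ */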
+int
+vlapic_timer_tick(struct vlapic *vlapic)
+{
+ int curticks, delta, periodic, fired;
+ uint32_t ccr;
+ uint32_t decrement, leftover;
+
+restart:
+ curticks = ticks;
+ delta = curticks - vlapic->ccr_ticks;
+
+ /* Local APIC timer is disabled */
+ if (vlapic->apic.icr_timer == 0)
+ return (-1);
+
+ /* One-shot mode and timer has already counted down to zero */
+ periodic = vlapic_periodic_timer(vlapic);
+ if (!periodic && vlapic->apic.ccr_timer == 0)
+ return (-1);
+ /*
+	 * If 'curticks' and 'ccr_ticks' are out of sync by more than
+	 * 2^31 ticks then deal with it by restarting the timer.
+ */
+ if (delta < 0) {
+ vlapic_start_timer(vlapic, 0);
+ goto restart;
+ }
+
+ fired = 0;
+ decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz;
+
+ vlapic->ccr_ticks = curticks;
+ ccr = vlapic->apic.ccr_timer;
+
+ while (delta-- > 0) {
+ if (ccr > decrement) {
+ ccr -= decrement;
+ continue;
+ }
+
+ /* Trigger the local apic timer interrupt */
+ vlapic_fire_timer(vlapic);
+ if (periodic) {
+ leftover = decrement - ccr;
+ vlapic_start_timer(vlapic, leftover);
+ ccr = vlapic->apic.ccr_timer;
+ } else {
+ /*
+ * One-shot timer has counted down to zero.
+ */
+ ccr = 0;
+ }
+ fired = 1;
+ break;
+ }
+
+ vlapic->apic.ccr_timer = ccr;
+
+ if (!fired)
+ return ((ccr / decrement) + 1);
+ else
+ return (0);
+}
+
+struct vdev_ops vlapic_dev_ops = {
+ .name = "vlapic",
+ .init = vlapic_op_init,
+ .reset = vlapic_op_reset,
+ .halt = vlapic_op_halt,
+ .memread = vlapic_op_mem_read,
+ .memwrite = vlapic_op_mem_write,
+};
+
+static struct io_region vlapic_mmio[VM_MAXCPU];
+
+struct vlapic *
+vlapic_init(struct vm *vm, int vcpuid)
+{
+ struct vlapic *vlapic;
+
+ vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO);
+ vlapic->vm = vm;
+ vlapic->vcpuid = vcpuid;
+
+ vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
+
+ if (vcpuid == 0)
+ vlapic->msr_apicbase |= APICBASE_BSP;
+
+ vlapic->ops = &vlapic_dev_ops;
+
+ vlapic->mmio = vlapic_mmio + vcpuid;
+ vlapic->mmio->base = DEFAULT_APIC_BASE;
+ vlapic->mmio->len = PAGE_SIZE;
+ vlapic->mmio->attr = MMIO_READ|MMIO_WRITE;
+ vlapic->mmio->vcpu = vcpuid;
+
+ vdev_register(&vlapic_dev_ops, vlapic);
+
+ vlapic_op_init(vlapic);
+
+ return (vlapic);
+}
+
+void
+vlapic_cleanup(struct vlapic *vlapic)
+{
+ vlapic_op_halt(vlapic);
+ vdev_unregister(vlapic);
+ free(vlapic, M_VLAPIC);
+}
+
+uint64_t
+vlapic_get_apicbase(struct vlapic *vlapic)
+{
+
+ return (vlapic->msr_apicbase);
+}
+
+void
+vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val)
+{
+ int err;
+ enum x2apic_state state;
+
+ err = vm_get_x2apic_state(vlapic->vm, vlapic->vcpuid, &state);
+ if (err)
+ panic("vlapic_set_apicbase: err %d fetching x2apic state", err);
+
+ if (state == X2APIC_DISABLED)
+ val &= ~APICBASE_X2APIC;
+
+ vlapic->msr_apicbase = val;
+}
+
+void
+vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, vcpuid);
+
+ if (state == X2APIC_DISABLED)
+ vlapic->msr_apicbase &= ~APICBASE_X2APIC;
+}
diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h
new file mode 100644
index 0000000..00de019
--- /dev/null
+++ b/sys/amd64/vmm/io/vlapic.h
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VLAPIC_H_
+#define _VLAPIC_H_
+
+#include "vdev.h"
+
+struct vm;
+
+/*
+ * Map of APIC Registers: Offset Description Access
+ */
+#define APIC_OFFSET_ID 0x20 // Local APIC ID R/W
+#define APIC_OFFSET_VER 0x30 // Local APIC Version R
+#define APIC_OFFSET_TPR 0x80 // Task Priority Register R/W
+#define APIC_OFFSET_APR 0x90 // Arbitration Priority Register R
+#define APIC_OFFSET_PPR 0xA0 // Processor Priority Register R
+#define APIC_OFFSET_EOI 0xB0 // EOI Register W
+#define APIC_OFFSET_RRR 0xC0 // Remote read R
+#define APIC_OFFSET_LDR 0xD0 // Logical Destination R/W
+#define APIC_OFFSET_DFR 0xE0 // Destination Format Register 0..27 R; 28..31 R/W
+#define APIC_OFFSET_SVR 0xF0 // Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W
+#define APIC_OFFSET_ISR0 0x100 // ISR 000-031 R
+#define APIC_OFFSET_ISR1 0x110 // ISR 032-063 R
+#define APIC_OFFSET_ISR2 0x120 // ISR 064-095 R
+#define APIC_OFFSET_ISR3 0x130 // ISR 096-127 R
+#define APIC_OFFSET_ISR4 0x140 // ISR 128-159 R
+#define APIC_OFFSET_ISR5 0x150 // ISR 160-191 R
+#define APIC_OFFSET_ISR6 0x160 // ISR 192-223 R
+#define APIC_OFFSET_ISR7 0x170 // ISR 224-255 R
+#define APIC_OFFSET_TMR0 0x180 // TMR 000-031 R
+#define APIC_OFFSET_TMR1 0x190 // TMR 032-063 R
+#define APIC_OFFSET_TMR2 0x1A0 // TMR 064-095 R
+#define APIC_OFFSET_TMR3 0x1B0 // TMR 096-127 R
+#define APIC_OFFSET_TMR4 0x1C0 // TMR 128-159 R
+#define APIC_OFFSET_TMR5 0x1D0 // TMR 160-191 R
+#define APIC_OFFSET_TMR6 0x1E0 // TMR 192-223 R
+#define APIC_OFFSET_TMR7 0x1F0 // TMR 224-255 R
+#define APIC_OFFSET_IRR0 0x200 // IRR 000-031 R
+#define APIC_OFFSET_IRR1 0x210 // IRR 032-063 R
+#define APIC_OFFSET_IRR2 0x220 // IRR 064-095 R
+#define APIC_OFFSET_IRR3 0x230 // IRR 096-127 R
+#define APIC_OFFSET_IRR4 0x240 // IRR 128-159 R
+#define APIC_OFFSET_IRR5 0x250 // IRR 160-191 R
+#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R
+#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R
+#define APIC_OFFSET_ESR 0x280 // Error Status Register R
+#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W
+#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. (32-63) R/W
+#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W
+#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+)
+#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+)
+#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W
+#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W
+#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W
+#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W
+#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R
+#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W
+
+/*
+ * 16 priority levels with at most one vector injected per level.
+ */
+#define ISRVEC_STK_SIZE (16 + 1)
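+
+/*
+ * Illustrative example: with vectors 0x2f and 0x51 in service the stack
+ * holds { 0, 0x2f, 0x51 }; the sentinel 0 at the bottom is the ISRV value
+ * used when no ISR bit is set. Priorities must be strictly increasing up
+ * the stack (class 2, then class 5 here).
+ */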
+
+enum x2apic_state;
+
+struct vlapic *vlapic_init(struct vm *vm, int vcpuid);
+void vlapic_cleanup(struct vlapic *vlapic);
+
+int vlapic_op_mem_write(void *dev, uint64_t gpa,
+	opsize_t size, uint64_t data);
+
+int vlapic_op_mem_read(void *dev, uint64_t gpa,
+	opsize_t size, uint64_t *data);
+
+int vlapic_pending_intr(struct vlapic *vlapic);
+void vlapic_intr_accepted(struct vlapic *vlapic, int vector);
+void vlapic_set_intr_ready(struct vlapic *vlapic, int vector);
+int vlapic_timer_tick(struct vlapic *vlapic);
+
+uint64_t vlapic_get_apicbase(struct vlapic *vlapic);
+void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val);
+void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s);
+
+#endif /* _VLAPIC_H_ */
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
new file mode 100644
index 0000000..a4dea79
--- /dev/null
+++ b/sys/amd64/vmm/vmm.c
@@ -0,0 +1,1022 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+
+#include <machine/vm.h>
+#include <machine/pcb.h>
+#include <machine/smp.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmm_mem.h"
+#include "vmm_util.h"
+#include <machine/vmm_dev.h>
+#include "vlapic.h"
+#include "vmm_msr.h"
+#include "vmm_ipi.h"
+#include "vmm_stat.h"
+#include "vmm_lapic.h"
+
+#include "io/ppt.h"
+#include "io/iommu.h"
+
+struct vlapic;
+
+struct vcpu {
+ int flags;
+ enum vcpu_state state;
+ struct mtx mtx;
+ int pincpu; /* host cpuid this vcpu is bound to */
+ int hostcpu; /* host cpuid this vcpu last ran on */
+ uint64_t guest_msrs[VMM_MSR_NUM];
+ struct vlapic *vlapic;
+ int vcpuid;
+ struct savefpu *guestfpu; /* guest fpu state */
+ void *stats;
+ struct vm_exit exitinfo;
+ enum x2apic_state x2apic_state;
+ int nmi_pending;
+};
+#define VCPU_F_PINNED 0x0001
+
+#define VCPU_PINCPU(vm, vcpuid) \
+ ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1)
+
+#define VCPU_UNPIN(vm, vcpuid) (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED)
+
+#define VCPU_PIN(vm, vcpuid, host_cpuid) \
+do { \
+ vm->vcpu[vcpuid].flags |= VCPU_F_PINNED; \
+ vm->vcpu[vcpuid].pincpu = host_cpuid; \
+} while(0)
+
+#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
+#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
+#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
+
+#define VM_MAX_MEMORY_SEGMENTS 2
+
+struct vm {
+ void *cookie; /* processor-specific data */
+ void *iommu; /* iommu-specific data */
+ struct vcpu vcpu[VM_MAXCPU];
+ int num_mem_segs;
+ struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
+ char name[VM_MAX_NAMELEN];
+
+ /*
+ * Set of active vcpus.
+ * An active vcpu is one that has been started implicitly (BSP) or
+ * explicitly (AP) by sending it a startup ipi.
+ */
+ cpuset_t active_cpus;
+};
+
+static struct vmm_ops *ops;
+#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
+#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
+
+#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
+#define VMRUN(vmi, vcpu, rip) \
+ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
+#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
+#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
+ (ops != NULL ? \
+ (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
+ ENXIO)
+#define VMMMAP_GET(vmi, gpa) \
+ (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
+#define VMGETREG(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETREG(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
+#define VMGETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMSETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
+ (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
+#define VMGETCAP(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETCAP(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
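+
+/*
+ * 'ops' points at the hardware-specific backend (vmm_ops_intel or
+ * vmm_ops_amd, selected in vmm_init()). The wrapper macros above degrade
+ * gracefully, returning ENXIO (or NULL) if no backend has registered.
+ */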
+
+#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
+#define fpu_stop_emulating() clts()
+
+static MALLOC_DEFINE(M_VM, "vm", "vm");
+CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */
+
+/* statistics */
+static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
+
+static void
+vcpu_cleanup(struct vcpu *vcpu)
+{
+ vlapic_cleanup(vcpu->vlapic);
+ vmm_stat_free(vcpu->stats);
+ fpu_save_area_free(vcpu->guestfpu);
+}
+
+static void
+vcpu_init(struct vm *vm, uint32_t vcpu_id)
+{
+ struct vcpu *vcpu;
+
+ vcpu = &vm->vcpu[vcpu_id];
+
+ vcpu_lock_init(vcpu);
+ vcpu->hostcpu = NOCPU;
+ vcpu->vcpuid = vcpu_id;
+ vcpu->vlapic = vlapic_init(vm, vcpu_id);
+ vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
+ vcpu->guestfpu = fpu_save_area_alloc();
+ fpu_save_area_reset(vcpu->guestfpu);
+ vcpu->stats = vmm_stat_alloc();
+}
+
+struct vm_exit *
+vm_exitinfo(struct vm *vm, int cpuid)
+{
+ struct vcpu *vcpu;
+
+ if (cpuid < 0 || cpuid >= VM_MAXCPU)
+ panic("vm_exitinfo: invalid cpuid %d", cpuid);
+
+ vcpu = &vm->vcpu[cpuid];
+
+ return (&vcpu->exitinfo);
+}
+
+static int
+vmm_init(void)
+{
+ int error;
+
+ vmm_host_state_init();
+ vmm_ipi_init();
+
+ error = vmm_mem_init();
+ if (error)
+ return (error);
+
+ if (vmm_is_intel())
+ ops = &vmm_ops_intel;
+ else if (vmm_is_amd())
+ ops = &vmm_ops_amd;
+ else
+ return (ENXIO);
+
+ vmm_msr_init();
+
+ return (VMM_INIT());
+}
+
+static int
+vmm_handler(module_t mod, int what, void *arg)
+{
+ int error;
+
+ switch (what) {
+ case MOD_LOAD:
+ vmmdev_init();
+ iommu_init();
+ error = vmm_init();
+ break;
+ case MOD_UNLOAD:
+ error = vmmdev_cleanup();
+ if (error == 0) {
+ iommu_cleanup();
+ vmm_ipi_cleanup();
+ error = VMM_CLEANUP();
+ }
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t vmm_kmod = {
+ "vmm",
+ vmm_handler,
+ NULL
+};
+
+/*
+ * Execute the module load handler after the pci passthru driver has had
+ * a chance to claim devices. We need this information at the time we do
+ * iommu initialization.
+ */
+DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY);
+MODULE_VERSION(vmm, 1);
+
+SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
+
+struct vm *
+vm_create(const char *name)
+{
+ int i;
+ struct vm *vm;
+ vm_paddr_t maxaddr;
+
+ const int BSP = 0;
+
+ if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
+ return (NULL);
+
+ vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
+ strcpy(vm->name, name);
+ vm->cookie = VMINIT(vm);
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vcpu_init(vm, i);
+ guest_msrs_init(vm, i);
+ }
+
+ maxaddr = vmm_mem_maxaddr();
+ vm->iommu = iommu_create_domain(maxaddr);
+ vm_activate_cpu(vm, BSP);
+
+ return (vm);
+}
+
+static void
+vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
+{
+ size_t len;
+ vm_paddr_t hpa;
+ void *host_domain;
+
+ host_domain = iommu_host_domain();
+
+ len = 0;
+ while (len < seg->len) {
+ hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
+ if (hpa == (vm_paddr_t)-1) {
+ panic("vm_free_mem_segs: cannot free hpa "
+ "associated with gpa 0x%016lx", seg->gpa + len);
+ }
+
+		 * Remove the 'gpa' to 'hpa' mapping in the VM's domain and
+		 * resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
+ * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
+ */
+ iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
+ iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
+
+ vmm_mem_free(hpa, PAGE_SIZE);
+
+ len += PAGE_SIZE;
+ }
+
+ /*
+ * Invalidate cached translations associated with 'vm->iommu' since
+ * we have now moved some pages from it.
+ */
+ iommu_invalidate_tlb(vm->iommu);
+
+ bzero(seg, sizeof(struct vm_memory_segment));
+}
+
+void
+vm_destroy(struct vm *vm)
+{
+ int i;
+
+ ppt_unassign_all(vm);
+
+ for (i = 0; i < vm->num_mem_segs; i++)
+ vm_free_mem_seg(vm, &vm->mem_segs[i]);
+
+ vm->num_mem_segs = 0;
+
+ for (i = 0; i < VM_MAXCPU; i++)
+ vcpu_cleanup(&vm->vcpu[i]);
+
+ iommu_destroy_domain(vm->iommu);
+
+ VMCLEANUP(vm->cookie);
+
+ free(vm, M_VM);
+}
+
+const char *
+vm_name(struct vm *vm)
+{
+ return (vm->name);
+}
+
+int
+vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
+ VM_PROT_RW, spok));
+}
+
+int
+vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
+ VM_PROT_NONE, spok));
+}
+
+/*
+ * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
+ */
+static boolean_t
+vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
+{
+ int i;
+ vm_paddr_t gpabase, gpalimit;
+
+ if (gpa & PAGE_MASK)
+ panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ gpabase = vm->mem_segs[i].gpa;
+ gpalimit = gpabase + vm->mem_segs[i].len;
+ if (gpa >= gpabase && gpa < gpalimit)
+ return (FALSE);
+ }
+
+ return (TRUE);
+}
+
+int
+vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ int error, available, allocated;
+ struct vm_memory_segment *seg;
+ vm_paddr_t g, hpa;
+ void *host_domain;
+
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
+ return (EINVAL);
+
+ available = allocated = 0;
+ g = gpa;
+ while (g < gpa + len) {
+ if (vm_gpa_available(vm, g))
+ available++;
+ else
+ allocated++;
+
+ g += PAGE_SIZE;
+ }
+
+ /*
+ * If there are some allocated and some available pages in the address
+ * range then it is an error.
+ */
+ if (allocated && available)
+ return (EINVAL);
+
+ /*
+ * If the entire address range being requested has already been
+ * allocated then there isn't anything more to do.
+ */
+ if (allocated && available == 0)
+ return (0);
+
+ if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
+ return (E2BIG);
+
+ host_domain = iommu_host_domain();
+
+ seg = &vm->mem_segs[vm->num_mem_segs];
+
+ error = 0;
+ seg->gpa = gpa;
+ seg->len = 0;
+ while (seg->len < len) {
+ hpa = vmm_mem_alloc(PAGE_SIZE);
+ if (hpa == 0) {
+ error = ENOMEM;
+ break;
+ }
+
+ error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
+ VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
+ if (error)
+ break;
+
+ /*
+ * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
+		 * Add a mapping from 'gpa + seg->len' to 'hpa' in the VM's domain.
+ */
+ iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
+ iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
+
+ seg->len += PAGE_SIZE;
+ }
+
+ if (error) {
+ vm_free_mem_seg(vm, seg);
+ return (error);
+ }
+
+ /*
+ * Invalidate cached translations associated with 'host_domain' since
+ * we have now moved some pages from it.
+ */
+ iommu_invalidate_tlb(host_domain);
+
+ vm->num_mem_segs++;
+
+ return (0);
+}
+
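+/*
+ * Translate a guest physical address to a host physical address. The range
+ * [gpa, gpa + len) must not cross a page boundary; for example, gpa 0x1f80
+ * with len 0x100 would span two pages and trips the panic below.
+ */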
+vm_paddr_t
+vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ vm_paddr_t nextpage;
+
+ nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
+ if (len > nextpage - gpa)
+ panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
+
+ return (VMMMAP_GET(vm->cookie, gpa));
+}
+
+int
+vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg)
+{
+ int i;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ if (gpabase == vm->mem_segs[i].gpa) {
+ *seg = vm->mem_segs[i];
+ return (0);
+ }
+ }
+ return (-1);
+}
+
+int
+vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ return (VMGETREG(vm->cookie, vcpu, reg, retval));
+}
+
+int
+vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ return (VMSETREG(vm->cookie, vcpu, reg, val));
+}
+
+static boolean_t
+is_descriptor_table(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_IDTR:
+ case VM_REG_GUEST_GDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+static boolean_t
+is_segment_register(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_ES:
+ case VM_REG_GUEST_CS:
+ case VM_REG_GUEST_SS:
+ case VM_REG_GUEST_DS:
+ case VM_REG_GUEST_FS:
+ case VM_REG_GUEST_GS:
+ case VM_REG_GUEST_TR:
+ case VM_REG_GUEST_LDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+int
+vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMGETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+int
+vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMSETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+int
+vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid)
+{
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ *cpuid = VCPU_PINCPU(vm, vcpuid);
+
+ return (0);
+}
+
+int
+vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid)
+{
+ struct thread *td;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ td = curthread; /* XXXSMP only safe when muxing vcpus */
+
+ /* unpin */
+ if (host_cpuid < 0) {
+ VCPU_UNPIN(vm, vcpuid);
+ thread_lock(td);
+ sched_unbind(td);
+ thread_unlock(td);
+ return (0);
+ }
+
+ if (CPU_ABSENT(host_cpuid))
+ return (EINVAL);
+
+ /*
+ * XXX we should check that 'host_cpuid' has not already been pinned
+ * by another vm.
+ */
+ thread_lock(td);
+ sched_bind(td, host_cpuid);
+ thread_unlock(td);
+ VCPU_PIN(vm, vcpuid, host_cpuid);
+
+ return (0);
+}
+
+static void
+restore_guest_fpustate(struct vcpu *vcpu)
+{
+
+ /* flush host state to the pcb */
+ fpuexit(curthread);
+
+ /* restore guest FPU state */
+ fpu_stop_emulating();
+ fpurestore(vcpu->guestfpu);
+
+ /*
+ * The FPU is now "dirty" with the guest's state so turn on emulation
+ * to trap any access to the FPU by the host.
+ */
+ fpu_start_emulating();
+}
+
+static void
+save_guest_fpustate(struct vcpu *vcpu)
+{
+
+ if ((rcr0() & CR0_TS) == 0)
+ panic("fpu emulation not enabled in host!");
+
+ /* save guest FPU state */
+ fpu_stop_emulating();
+ fpusave(vcpu->guestfpu);
+ fpu_start_emulating();
+}
+
+static VMM_STAT_DEFINE(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
+
+int
+vm_run(struct vm *vm, struct vm_run *vmrun)
+{
+ int error, vcpuid, sleepticks, t;
+ struct vcpu *vcpu;
+ struct pcb *pcb;
+ uint64_t tscval, rip;
+ struct vm_exit *vme;
+
+ vcpuid = vmrun->cpuid;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ vme = &vmrun->vm_exit;
+ rip = vmrun->rip;
+restart:
+ critical_enter();
+
+ tscval = rdtsc();
+
+ pcb = PCPU_GET(curpcb);
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+
+ restore_guest_msrs(vm, vcpuid);
+ restore_guest_fpustate(vcpu);
+
+ vcpu->hostcpu = curcpu;
+ error = VMRUN(vm->cookie, vcpuid, rip);
+ vcpu->hostcpu = NOCPU;
+
+ save_guest_fpustate(vcpu);
+ restore_host_msrs(vm, vcpuid);
+
+ vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
+
+ /* copy the exit information */
+ bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
+
+ critical_exit();
+
+ /*
+ * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
+ * is ready to run.
+ */
+ if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
+ vcpu_lock(vcpu);
+
+ /*
+ * Figure out the number of host ticks until the next apic
+ * timer interrupt in the guest.
+ */
+ sleepticks = lapic_timer_tick(vm, vcpuid);
+
+ /*
+ * If the guest local apic timer is disabled then sleep for
+ * a long time but not forever.
+ */
+ if (sleepticks < 0)
+ sleepticks = hz;
+
+ /*
+ * Do a final check for pending NMI or interrupts before
+ * really putting this thread to sleep.
+ *
+ * These interrupts could have happened any time after we
+ * returned from VMRUN() and before we grabbed the vcpu lock.
+ */
+ if (!vm_nmi_pending(vm, vcpuid) &&
+ lapic_pending_intr(vm, vcpuid) < 0) {
+ if (sleepticks <= 0)
+ panic("invalid sleepticks %d", sleepticks);
+ t = ticks;
+ msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
+ vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+ }
+
+ vcpu_unlock(vcpu);
+
+ rip = vme->rip + vme->inst_length;
+ goto restart;
+ }
+
+ return (error);
+}
+
+int
+vm_inject_event(struct vm *vm, int vcpuid, int type,
+ int vector, uint32_t code, int code_valid)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
+ return (EINVAL);
+
+ if (vector < 0 || vector > 255)
+ return (EINVAL);
+
+ return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
+}
+
+static VMM_STAT_DEFINE(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
+
+int
+vm_inject_nmi(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu->nmi_pending = 1;
+ vm_interrupt_hostcpu(vm, vcpuid);
+ return (0);
+}
+
+int
+vm_nmi_pending(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ return (vcpu->nmi_pending);
+}
+
+void
+vm_nmi_clear(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (vcpu->nmi_pending == 0)
+ panic("vm_nmi_clear: inconsistent nmi_pending state");
+
+ vcpu->nmi_pending = 0;
+ vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
+}
+
+int
+vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMGETCAP(vm->cookie, vcpu, type, retval));
+}
+
+int
+vm_set_capability(struct vm *vm, int vcpu, int type, int val)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMSETCAP(vm->cookie, vcpu, type, val));
+}
+
+uint64_t *
+vm_guest_msrs(struct vm *vm, int cpu)
+{
+ return (vm->vcpu[cpu].guest_msrs);
+}
+
+struct vlapic *
+vm_lapic(struct vm *vm, int cpu)
+{
+ return (vm->vcpu[cpu].vlapic);
+}
+
+boolean_t
+vmm_is_pptdev(int bus, int slot, int func)
+{
+ int found, b, s, f, n;
+ char *val, *cp, *cp2;
+
+ /*
+ * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12"
+ */
+ found = 0;
+ cp = val = getenv("pptdevs");
+ while (cp != NULL && *cp != '\0') {
+ if ((cp2 = strchr(cp, ' ')) != NULL)
+ *cp2 = '\0';
+
+ n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
+ if (n == 3 && bus == b && slot == s && func == f) {
+ found = 1;
+ break;
+ }
+
+ if (cp2 != NULL)
+ *cp2++ = ' ';
+
+ cp = cp2;
+ }
+ freeenv(val);
+ return (found);
+}
+
+void *
+vm_iommu_domain(struct vm *vm)
+{
+
+ return (vm->iommu);
+}
+
+int
+vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
+{
+ int error;
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+
+ /*
+ * The following state transitions are allowed:
+ * IDLE -> RUNNING -> IDLE
+ * IDLE -> CANNOT_RUN -> IDLE
+ */
+ if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
+ (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
+ error = 0;
+ vcpu->state = state;
+ } else {
+ error = EBUSY;
+ }
+
+ vcpu_unlock(vcpu);
+
+ return (error);
+}
+
+enum vcpu_state
+vcpu_get_state(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+ enum vcpu_state state;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ state = vcpu->state;
+ vcpu_unlock(vcpu);
+
+ return (state);
+}
+
+void
+vm_activate_cpu(struct vm *vm, int vcpuid)
+{
+
+ if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
+ CPU_SET(vcpuid, &vm->active_cpus);
+}
+
+cpuset_t
+vm_active_cpus(struct vm *vm)
+{
+
+ return (vm->active_cpus);
+}
+
+void *
+vcpu_stats(struct vm *vm, int vcpuid)
+{
+
+ return (vm->vcpu[vcpuid].stats);
+}
+
+int
+vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ *state = vm->vcpu[vcpuid].x2apic_state;
+
+ return (0);
+}
+
+int
+vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (state < 0 || state >= X2APIC_STATE_LAST)
+ return (EINVAL);
+
+ vm->vcpu[vcpuid].x2apic_state = state;
+
+ vlapic_set_x2apic_state(vm, vcpuid, state);
+
+ return (0);
+}
+
+void
+vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
+{
+ int hostcpu;
+ struct vcpu *vcpu;
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ hostcpu = vcpu->hostcpu;
+ if (hostcpu == NOCPU) {
+ /*
+ * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
+ * the host thread must be sleeping waiting for an event to
+ * kick the vcpu out of 'hlt'.
+ *
+ * XXX this is racy because the condition exists right before
+ * and after calling VMRUN() in vm_run(). The wakeup() is
+ * benign in this case.
+ */
+ if (vcpu->state == VCPU_RUNNING)
+ wakeup_one(vcpu);
+ } else {
+ if (vcpu->state != VCPU_RUNNING)
+ panic("invalid vcpu state %d", vcpu->state);
+ if (hostcpu != curcpu)
+ ipi_cpu(hostcpu, vmm_ipinum);
+ }
+ vcpu_unlock(vcpu);
+}
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
new file mode 100644
index 0000000..0150ebd
--- /dev/null
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -0,0 +1,538 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/sysctl.h>
+#include <sys/libkern.h>
+#include <sys/ioccom.h>
+#include <sys/mman.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmm_lapic.h"
+#include "vmm_stat.h"
+#include "vmm_mem.h"
+#include "io/ppt.h"
+#include <machine/vmm_dev.h>
+
+struct vmmdev_softc {
+ struct vm *vm; /* vm instance cookie */
+ struct cdev *cdev;
+ SLIST_ENTRY(vmmdev_softc) link;
+};
+static SLIST_HEAD(, vmmdev_softc) head;
+
+static struct mtx vmmdev_mtx;
+
+static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
+
+SYSCTL_DECL(_hw_vmm);
+
+static struct vmmdev_softc *
+vmmdev_lookup(const char *name)
+{
+ struct vmmdev_softc *sc;
+
+#ifdef notyet /* XXX kernel is not compiled with invariants */
+ mtx_assert(&vmmdev_mtx, MA_OWNED);
+#endif
+
+ SLIST_FOREACH(sc, &head, link) {
+ if (strcmp(name, vm_name(sc->vm)) == 0)
+ break;
+ }
+
+ return (sc);
+}
+
+static struct vmmdev_softc *
+vmmdev_lookup2(struct cdev *cdev)
+{
+
+ return (cdev->si_drv1);
+}
+
+static int
+vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
+{
+ int error, off, c;
+ vm_paddr_t hpa, gpa;
+ struct vmmdev_softc *sc;
+
+ static char zerobuf[PAGE_SIZE];
+
+ error = 0;
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup2(cdev);
+ if (sc == NULL)
+ error = ENXIO;
+
+ while (uio->uio_resid > 0 && error == 0) {
+ gpa = uio->uio_offset;
+ off = gpa & PAGE_MASK;
+ c = min(uio->uio_resid, PAGE_SIZE - off);
+
+ /*
+ * The VM has a hole in its physical memory map. If we want to
+ * use 'dd' to inspect memory beyond the hole we need to
+ * provide bogus data for memory that lies in the hole.
+ *
+ * Since this device does not support lseek(2), dd(1) will
+ * read(2) blocks of data to simulate the lseek(2).
+ */
+ hpa = vm_gpa2hpa(sc->vm, gpa, c);
+ if (hpa == (vm_paddr_t)-1) {
+ if (uio->uio_rw == UIO_READ)
+ error = uiomove(zerobuf, c, uio);
+ else
+ error = EFAULT;
+ } else
+ error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio);
+ }
+
+ mtx_unlock(&vmmdev_mtx);
+ return (error);
+}
+
+static int
+vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
+ struct thread *td)
+{
+ int error, vcpu, state_changed;
+ enum vcpu_state new_state;
+ struct vmmdev_softc *sc;
+ struct vm_memory_segment *seg;
+ struct vm_register *vmreg;
+	struct vm_seg_desc *vmsegdesc;
+ struct vm_pin *vmpin;
+ struct vm_run *vmrun;
+ struct vm_event *vmevent;
+ struct vm_lapic_irq *vmirq;
+ struct vm_capability *vmcap;
+ struct vm_pptdev *pptdev;
+ struct vm_pptdev_mmio *pptmmio;
+ struct vm_pptdev_msi *pptmsi;
+ struct vm_pptdev_msix *pptmsix;
+ struct vm_nmi *vmnmi;
+ struct vm_stats *vmstats;
+ struct vm_stat_desc *statdesc;
+ struct vm_x2apic *x2apic;
+
+ sc = vmmdev_lookup2(cdev);
+ if (sc == NULL)
+ return (ENXIO);
+
+ vcpu = -1;
+ state_changed = 0;
+
+ /*
+ * Some VMM ioctls can operate only on vcpus that are not running.
+ */
+ switch (cmd) {
+ case VM_RUN:
+ case VM_SET_PINNING:
+ case VM_GET_REGISTER:
+ case VM_SET_REGISTER:
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ case VM_INJECT_EVENT:
+ case VM_GET_CAPABILITY:
+ case VM_SET_CAPABILITY:
+ case VM_PPTDEV_MSI:
+ case VM_PPTDEV_MSIX:
+ case VM_SET_X2APIC_STATE:
+ /*
+ * XXX fragile, handle with care
+ * Assumes that the first field of the ioctl data is the vcpu.
+ */
+ vcpu = *(int *)data;
+ if (vcpu < 0 || vcpu >= VM_MAXCPU) {
+ error = EINVAL;
+ goto done;
+ }
+
+ if (cmd == VM_RUN)
+ new_state = VCPU_RUNNING;
+ else
+ new_state = VCPU_CANNOT_RUN;
+
+ error = vcpu_set_state(sc->vm, vcpu, new_state);
+ if (error)
+ goto done;
+
+ state_changed = 1;
+ break;
+
+ case VM_MAP_PPTDEV_MMIO:
+ case VM_BIND_PPTDEV:
+ case VM_UNBIND_PPTDEV:
+ case VM_MAP_MEMORY:
+ /*
+ * ioctls that operate on the entire virtual machine must
+ * prevent all vcpus from running.
+ */
+ error = 0;
+ for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
+ error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN);
+ if (error)
+ break;
+ }
+
+ if (error) {
+ while (--vcpu >= 0)
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ goto done;
+ }
+
+ state_changed = 2;
+ break;
+
+ default:
+ break;
+ }
+
+	switch (cmd) {
+ case VM_RUN:
+ vmrun = (struct vm_run *)data;
+ error = vm_run(sc->vm, vmrun);
+ break;
+ case VM_STAT_DESC: {
+ const char *desc;
+ statdesc = (struct vm_stat_desc *)data;
+ desc = vmm_stat_desc(statdesc->index);
+ if (desc != NULL) {
+ error = 0;
+ strlcpy(statdesc->desc, desc, sizeof(statdesc->desc));
+ } else
+ error = EINVAL;
+ break;
+ }
+ case VM_STATS: {
+ CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES);
+ vmstats = (struct vm_stats *)data;
+ getmicrotime(&vmstats->tv);
+ error = vmm_stat_copy(sc->vm, vmstats->cpuid,
+ &vmstats->num_entries, vmstats->statbuf);
+ break;
+ }
+ case VM_PPTDEV_MSI:
+ pptmsi = (struct vm_pptdev_msi *)data;
+ error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
+ pptmsi->bus, pptmsi->slot, pptmsi->func,
+ pptmsi->destcpu, pptmsi->vector,
+ pptmsi->numvec);
+ break;
+ case VM_PPTDEV_MSIX:
+ pptmsix = (struct vm_pptdev_msix *)data;
+ error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
+ pptmsix->bus, pptmsix->slot,
+ pptmsix->func, pptmsix->idx,
+ pptmsix->msg, pptmsix->vector_control,
+ pptmsix->addr);
+ break;
+ case VM_MAP_PPTDEV_MMIO:
+ pptmmio = (struct vm_pptdev_mmio *)data;
+ error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
+ pptmmio->func, pptmmio->gpa, pptmmio->len,
+ pptmmio->hpa);
+ break;
+ case VM_BIND_PPTDEV:
+ pptdev = (struct vm_pptdev *)data;
+ error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
+ break;
+ case VM_UNBIND_PPTDEV:
+ pptdev = (struct vm_pptdev *)data;
+ error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
+ break;
+ case VM_INJECT_EVENT:
+ vmevent = (struct vm_event *)data;
+ error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
+ vmevent->vector,
+ vmevent->error_code,
+ vmevent->error_code_valid);
+ break;
+ case VM_INJECT_NMI:
+ vmnmi = (struct vm_nmi *)data;
+ error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
+ break;
+ case VM_LAPIC_IRQ:
+ vmirq = (struct vm_lapic_irq *)data;
+ error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector);
+ break;
+ case VM_SET_PINNING:
+ vmpin = (struct vm_pin *)data;
+ error = vm_set_pinning(sc->vm, vmpin->vm_cpuid,
+ vmpin->host_cpuid);
+ break;
+ case VM_GET_PINNING:
+ vmpin = (struct vm_pin *)data;
+ error = vm_get_pinning(sc->vm, vmpin->vm_cpuid,
+ &vmpin->host_cpuid);
+ break;
+ case VM_MAP_MEMORY:
+ seg = (struct vm_memory_segment *)data;
+ error = vm_malloc(sc->vm, seg->gpa, seg->len);
+ break;
+ case VM_GET_MEMORY_SEG:
+ seg = (struct vm_memory_segment *)data;
+ seg->len = 0;
+ (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
+ error = 0;
+ break;
+ case VM_GET_REGISTER:
+ vmreg = (struct vm_register *)data;
+ error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+ &vmreg->regval);
+ break;
+ case VM_SET_REGISTER:
+ vmreg = (struct vm_register *)data;
+ error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+ vmreg->regval);
+ break;
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ vmsegdesc = (struct vm_seg_desc *)data;
+ error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
+ vmsegdesc->regnum,
+ &vmsegdesc->desc);
+ break;
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ vmsegdesc = (struct vm_seg_desc *)data;
+ error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
+ vmsegdesc->regnum,
+ &vmsegdesc->desc);
+ break;
+ case VM_GET_CAPABILITY:
+ vmcap = (struct vm_capability *)data;
+ error = vm_get_capability(sc->vm, vmcap->cpuid,
+ vmcap->captype,
+ &vmcap->capval);
+ break;
+ case VM_SET_CAPABILITY:
+ vmcap = (struct vm_capability *)data;
+ error = vm_set_capability(sc->vm, vmcap->cpuid,
+ vmcap->captype,
+ vmcap->capval);
+ break;
+ case VM_SET_X2APIC_STATE:
+ x2apic = (struct vm_x2apic *)data;
+ error = vm_set_x2apic_state(sc->vm,
+ x2apic->cpuid, x2apic->state);
+ break;
+ case VM_GET_X2APIC_STATE:
+ x2apic = (struct vm_x2apic *)data;
+ error = vm_get_x2apic_state(sc->vm,
+ x2apic->cpuid, &x2apic->state);
+ break;
+ default:
+ error = ENOTTY;
+ break;
+ }
+
+ if (state_changed == 1) {
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ } else if (state_changed == 2) {
+ for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ }
+
+done:
+ return (error);
+}
+
+static int
+vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, vm_memattr_t *memattr)
+{
+ int error;
+ struct vmmdev_softc *sc;
+
+ error = -1;
+ mtx_lock(&vmmdev_mtx);
+
+ sc = vmmdev_lookup2(cdev);
+ if (sc != NULL && (nprot & PROT_EXEC) == 0) {
+ *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE);
+ if (*paddr != (vm_paddr_t)-1)
+ error = 0;
+ }
+
+ mtx_unlock(&vmmdev_mtx);
+
+ return (error);
+}
+
+static void
+vmmdev_destroy(struct vmmdev_softc *sc, boolean_t unlink)
+{
+
+ /*
+ * XXX must stop virtual machine instances that may be still
+ * running and cleanup their state.
+ */
+ if (sc->cdev)
+ destroy_dev(sc->cdev);
+
+ if (sc->vm)
+ vm_destroy(sc->vm);
+
+ if (unlink) {
+ mtx_lock(&vmmdev_mtx);
+ SLIST_REMOVE(&head, sc, vmmdev_softc, link);
+ mtx_unlock(&vmmdev_mtx);
+ }
+
+ free(sc, M_VMMDEV);
+}
+
+static int
+sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ char buf[VM_MAX_NAMELEN];
+ struct vmmdev_softc *sc;
+
+ strlcpy(buf, "beavis", sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ /*
+ * XXX TODO if any process has this device open then fail
+ */
+
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup(buf);
+ if (sc == NULL) {
+ mtx_unlock(&vmmdev_mtx);
+ return (EINVAL);
+ }
+
+ sc->cdev->si_drv1 = NULL;
+ mtx_unlock(&vmmdev_mtx);
+
+ vmmdev_destroy(sc, TRUE);
+
+ return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, sysctl_vmm_destroy, "A", NULL);
+
+static struct cdevsw vmmdevsw = {
+ .d_name = "vmmdev",
+ .d_version = D_VERSION,
+ .d_ioctl = vmmdev_ioctl,
+ .d_mmap = vmmdev_mmap,
+ .d_read = vmmdev_rw,
+ .d_write = vmmdev_rw,
+};
+
+static int
+sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct vm *vm;
+ struct vmmdev_softc *sc, *sc2;
+ char buf[VM_MAX_NAMELEN];
+
+ strlcpy(buf, "beavis", sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup(buf);
+ mtx_unlock(&vmmdev_mtx);
+ if (sc != NULL)
+ return (EEXIST);
+
+ vm = vm_create(buf);
+ if (vm == NULL)
+ return (EINVAL);
+
+ sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
+ sc->vm = vm;
+
+ /*
+	 * Look up the name again in case somebody sneaked in while we
+	 * dropped the lock.
+ */
+ mtx_lock(&vmmdev_mtx);
+ sc2 = vmmdev_lookup(buf);
+ if (sc2 == NULL)
+ SLIST_INSERT_HEAD(&head, sc, link);
+ mtx_unlock(&vmmdev_mtx);
+
+ if (sc2 != NULL) {
+ vmmdev_destroy(sc, FALSE);
+ return (EEXIST);
+ }
+
+ sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+ "vmm/%s", buf);
+ sc->cdev->si_drv1 = sc;
+
+ return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, sysctl_vmm_create, "A", NULL);
+
+void
+vmmdev_init(void)
+{
+ mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
+}
+
+int
+vmmdev_cleanup(void)
+{
+ int error;
+
+ if (SLIST_EMPTY(&head))
+ error = 0;
+ else
+ error = EBUSY;
+
+ return (error);
+}
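
The ioctl dispatch above is the entire userland surface of the vmm device; libvmmapi is a thin wrapper around it. A minimal sketch of driving the device node directly, assuming the struct vm_register layout (cpuid/regnum/regval) used in vmmdev_ioctl() above, the VM_GET_REGISTER request from vmm_dev.h, and VM_REG_GUEST_RIP from the vm_reg_name enum; the VM name is made up for illustration:

	#include <sys/ioctl.h>

	#include <machine/vmm.h>
	#include <machine/vmm_dev.h>

	#include <fcntl.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct vm_register vmreg;
		int fd;

		/* Node created beforehand with: sysctl hw.vmm.create=testvm */
		fd = open("/dev/vmm/testvm", O_RDWR);
		if (fd < 0)
			return (1);

		vmreg.cpuid = 0;
		vmreg.regnum = VM_REG_GUEST_RIP;
		if (ioctl(fd, VM_GET_REGISTER, &vmreg) == 0)
			printf("vcpu0 rip 0x%016lx\n", vmreg.regval);
		return (0);
	}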
diff --git a/sys/amd64/vmm/vmm_host.c b/sys/amd64/vmm/vmm_host.c
new file mode 100644
index 0000000..8dfef73
--- /dev/null
+++ b/sys/amd64/vmm/vmm_host.c
@@ -0,0 +1,124 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/pcpu.h>
+
+#include <machine/cpufunc.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+
+#include "vmm_host.h"
+
+static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4;
+
+void
+vmm_host_state_init(void)
+{
+
+ vmm_host_efer = rdmsr(MSR_EFER);
+ vmm_host_pat = rdmsr(MSR_PAT);
+
+ /*
+ * We always want CR0.TS to be set when the processor does a VM exit.
+ *
+ * With emulation turned on unconditionally after a VM exit, we are
+ * able to trap inadvertent use of the FPU until the guest FPU state
+ * has been safely squirreled away.
+ */
+ vmm_host_cr0 = rcr0() | CR0_TS;
+
+ vmm_host_cr4 = rcr4();
+}
+
+uint64_t
+vmm_get_host_pat(void)
+{
+
+ return (vmm_host_pat);
+}
+
+uint64_t
+vmm_get_host_efer(void)
+{
+
+ return (vmm_host_efer);
+}
+
+uint64_t
+vmm_get_host_cr0(void)
+{
+
+ return (vmm_host_cr0);
+}
+
+uint64_t
+vmm_get_host_cr4(void)
+{
+
+ return (vmm_host_cr4);
+}
+
+uint64_t
+vmm_get_host_datasel(void)
+{
+
+ return (GSEL(GDATA_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_codesel(void)
+{
+
+ return (GSEL(GCODE_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_tsssel(void)
+{
+
+ return (GSEL(GPROC0_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_fsbase(void)
+{
+
+ return (0);
+}
+
+uint64_t
+vmm_get_host_idtrbase(void)
+{
+
+ return (r_idt.rd_base);
+}
diff --git a/sys/amd64/vmm/vmm_host.h b/sys/amd64/vmm/vmm_host.h
new file mode 100644
index 0000000..839f54a
--- /dev/null
+++ b/sys/amd64/vmm/vmm_host.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_HOST_H_
+#define _VMM_HOST_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+void vmm_host_state_init(void);
+
+uint64_t vmm_get_host_pat(void);
+uint64_t vmm_get_host_efer(void);
+uint64_t vmm_get_host_cr0(void);
+uint64_t vmm_get_host_cr4(void);
+uint64_t vmm_get_host_datasel(void);
+uint64_t vmm_get_host_codesel(void);
+uint64_t vmm_get_host_tsssel(void);
+uint64_t vmm_get_host_fsbase(void);
+uint64_t vmm_get_host_idtrbase(void);
+
+/*
+ * Inline access to host state that is used on every VM entry
+ */
+static __inline uint64_t
+vmm_get_host_trbase(void)
+{
+
+ return ((uint64_t)PCPU_GET(tssp));
+}
+
+static __inline uint64_t
+vmm_get_host_gdtrbase(void)
+{
+
+ return ((uint64_t)&gdt[NGDT * curcpu]);
+}
+
+struct pcpu;
+extern struct pcpu __pcpu[];
+
+static __inline uint64_t
+vmm_get_host_gsbase(void)
+{
+
+ return ((uint64_t)&__pcpu[curcpu]);
+}
+
+#endif
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
new file mode 100644
index 0000000..e73f6bb
--- /dev/null
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -0,0 +1,810 @@
+/*-
+ * Copyright (c) 2012 Sandvine, Inc.
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/pcpu.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+#include <machine/vmm.h>
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <sys/errno.h>
+
+#include <machine/vmm.h>
+
+#include <vmmapi.h>
+#endif /* _KERNEL */
+
+
+
+/* struct vie_op.op_type */
+enum {
+ VIE_OP_TYPE_NONE = 0,
+ VIE_OP_TYPE_MOV,
+ VIE_OP_TYPE_AND,
+ VIE_OP_TYPE_LAST
+};
+
+/* struct vie_op.op_flags */
+#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */
+#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+
+static const struct vie_op one_byte_opcodes[256] = {
+ [0x89] = {
+ .op_byte = 0x89,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0x8B] = {
+ .op_byte = 0x8B,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0xC7] = {
+ .op_byte = 0xC7,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM,
+ },
+ [0x23] = {
+ .op_byte = 0x23,
+ .op_type = VIE_OP_TYPE_AND,
+ },
+ [0x81] = {
+ /* XXX Group 1 extended opcode - not just AND */
+ .op_byte = 0x81,
+ .op_type = VIE_OP_TYPE_AND,
+ .op_flags = VIE_OP_F_IMM,
+ }
+};
+
+/* struct vie.mod */
+#define VIE_MOD_INDIRECT 0
+#define VIE_MOD_INDIRECT_DISP8 1
+#define VIE_MOD_INDIRECT_DISP32 2
+#define VIE_MOD_DIRECT 3
+
+/* struct vie.rm */
+#define VIE_RM_SIB 4
+#define VIE_RM_DISP32 5
+
+#define GB (1024 * 1024 * 1024)
+
+static enum vm_reg_name gpr_map[16] = {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15
+};
+
+static uint64_t size2mask[] = {
+ [1] = 0xff,
+ [2] = 0xffff,
+ [4] = 0xffffffff,
+ [8] = 0xffffffffffffffff,
+};
+
+static int
+vie_valid_register(enum vm_reg_name reg)
+{
+#ifdef _KERNEL
+ /*
+ * XXX
+ * The operand register in which we store the result of the
+ * read must be a GPR that we can modify even if the vcpu
+ * is "running". All the GPRs qualify except for %rsp.
+ *
+ * This is a limitation of the vm_set_register() API
+ * and can be fixed if necessary.
+ */
+ if (reg == VM_REG_GUEST_RSP)
+ return (0);
+#endif
+ return (1);
+}
+
+static int
+vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
+{
+ int error;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ error = vm_get_register(vm, vcpuid, reg, rval);
+
+ return (error);
+}
+
+static int
+vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+ uint64_t val, int size)
+{
+ int error;
+ uint64_t origval;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ switch (size) {
+ case 1:
+ case 2:
+ error = vie_read_register(vm, vcpuid, reg, &origval);
+ if (error)
+ return (error);
+ val &= size2mask[size];
+ val |= origval & ~size2mask[size];
+ break;
+ case 4:
+ val &= 0xffffffffUL;
+ break;
+ case 8:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ error = vm_set_register(vm, vcpuid, reg, val);
+ return (error);
+}
+
+/*
+ * The following simplifying assumptions are made during emulation:
+ *
+ * - guest is in 64-bit mode
+ * - default address size is 64-bits
+ * - default operand size is 32-bits
+ *
+ * - operand size override is not supported
+ *
+ * - address size override is not supported
+ */
+static int
+emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x89:
+ /*
+ * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 89/r: mov r/m32, r32
+ * REX.W + 89/r mov r/m64, r64
+ */
+ if (vie->rex_w)
+ size = 8;
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0x8B:
+ /*
+ * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8B/r: mov r32, r/m32
+ * REX.W 8B/r: mov r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = gpr_map[vie->reg];
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xC7:
+ /*
+ * MOV from imm32 to mem (ModRM:r/m)
+ * C7/0 mov r/m32, imm32
+ * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
+ */
+ val = vie->immediate; /* already sign-extended */
+
+ if (vie->rex_w)
+ size = 8;
+
+ if (size != 8)
+ val &= size2mask[size];
+
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ break;
+ default:
+ break;
+ }
+
+ return (error);
+}
+
+static int
+emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val1, val2;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x23:
+ /*
+ * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
+ * result in reg.
+ *
+ * 23/r and r32, r/m32
+ * REX.W + 23/r and r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+
+ /* get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val1);
+ if (error)
+ break;
+
+ /* get the second operand */
+ error = memread(vm, vcpuid, gpa, &val2, size, arg);
+ if (error)
+ break;
+
+ /* perform the operation and write the result */
+ val1 &= val2;
+ error = vie_update_register(vm, vcpuid, reg, val1, size);
+ break;
+ case 0x81:
+ /*
+ * AND reg (ModRM:reg) with immediate and store the
+ * result in reg
+ *
+	 * 81 /4		and r/m32, imm32
+	 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
+ *
+ * Currently, only the AND operation of the 0x81 opcode
+ * is implemented (ModRM:reg = b100).
+ */
+ if ((vie->reg & 7) != 4)
+ break;
+
+ if (vie->rex_w)
+ size = 8;
+
+ /* get the first operand */
+ error = memread(vm, vcpuid, gpa, &val1, size, arg);
+ if (error)
+ break;
+
+ /*
+ * perform the operation with the pre-fetched immediate
+ * operand and write the result
+ */
+ val1 &= vie->immediate;
+ error = memwrite(vm, vcpuid, gpa, val1, size, arg);
+ break;
+ default:
+ break;
+ }
+ return (error);
+}
+
+int
+vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite,
+ void *memarg)
+{
+ int error;
+
+ if (!vie->decoded)
+ return (EINVAL);
+
+ switch (vie->op.op_type) {
+ case VIE_OP_TYPE_MOV:
+ error = emulate_mov(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_AND:
+ error = emulate_and(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
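
vmm_emulate_instruction() is deliberately agnostic about what backs the faulting address: the caller passes in the accessors. A toy callback pair matching the signatures implied by the memread/memwrite calls above (the actual mem_region_read_t and mem_region_write_t typedefs live in vmm_instruction_emul.h), backing a single pretend 4-byte register:

	#include <stdint.h>
	#include <errno.h>

	static uint32_t toy_reg;	/* pretend MMIO register */

	static int
	toy_memread(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval,
	    int size, void *arg)
	{
		if (size != 4)
			return (EINVAL);
		*rval = toy_reg;
		return (0);
	}

	static int
	toy_memwrite(void *vm, int vcpuid, uint64_t gpa, uint64_t wval,
	    int size, void *arg)
	{
		if (size != 4)
			return (EINVAL);
		toy_reg = (uint32_t)wval;
		return (0);
	}

A decoded 'vie' would then be emulated against this region with vmm_emulate_instruction(vm, vcpuid, gpa, vie, toy_memread, toy_memwrite, NULL).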
+
+#ifdef _KERNEL
+static void
+vie_init(struct vie *vie)
+{
+
+ bzero(vie, sizeof(struct vie));
+
+ vie->base_register = VM_REG_LAST;
+ vie->index_register = VM_REG_LAST;
+}
+
+static int
+gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
+ uint64_t *gpa, uint64_t *gpaend)
+{
+ vm_paddr_t hpa;
+ int nlevels, ptpshift, ptpindex;
+ uint64_t *ptpbase, pte, pgsize;
+
+ /*
+ * XXX assumes 64-bit guest with 4 page walk levels
+ */
+ nlevels = 4;
+ while (--nlevels >= 0) {
+ /* Zero out the lower 12 bits and the upper 12 bits */
+ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
+
+ hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE);
+ if (hpa == -1)
+ goto error;
+
+ ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa);
+
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gla >> ptpshift) & 0x1FF;
+ pgsize = 1UL << ptpshift;
+
+ pte = ptpbase[ptpindex];
+
+ if ((pte & PG_V) == 0)
+ goto error;
+
+ if (pte & PG_PS) {
+ if (pgsize > 1 * GB)
+ goto error;
+ else
+ break;
+ }
+
+ ptpphys = pte;
+ }
+
+ /* Zero out the lower 'ptpshift' bits and the upper 12 bits */
+ pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
+ *gpa = pte | (gla & (pgsize - 1));
+ *gpaend = pte + pgsize;
+ return (0);
+
+error:
+ return (-1);
+}
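
The triple shift in gla2gpa() is a branch-free mask that keeps bits 12..51 of a page-table entry: the first right shift drops the low 12 flag bits, the left shift by 24 pushes the top 12 bits out of the 64-bit register, and the final right shift restores the remaining bits to their original positions. A standalone check of that identity:

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t x = 0xfff0000deadbefffUL;	/* junk in top/bottom 12 bits */
		uint64_t m = x;

		m >>= 12; m <<= 24; m >>= 12;		/* as in gla2gpa() */
		assert(m == (x & 0x000ffffffffff000UL));
		return (0);
	}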
+
+int
+vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
+ uint64_t cr3, struct vie *vie)
+{
+ int n, err;
+ uint64_t hpa, gpa, gpaend, off;
+
+ /*
+ * XXX cache previously fetched instructions using 'rip' as the tag
+ */
+
+ if (inst_length > VIE_INST_SIZE)
+ panic("vmm_fetch_instruction: invalid length %d", inst_length);
+
+ vie_init(vie);
+
+ /* Copy the instruction into 'vie' */
+ while (vie->num_valid < inst_length) {
+ err = gla2gpa(vm, rip, cr3, &gpa, &gpaend);
+ if (err)
+ break;
+
+ off = gpa & PAGE_MASK;
+ n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
+
+ hpa = vm_gpa2hpa(vm, gpa, n);
+ if (hpa == -1)
+ break;
+
+ bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n);
+
+ rip += n;
+ vie->num_valid += n;
+ }
+
+ if (vie->num_valid == inst_length)
+ return (0);
+ else
+ return (-1);
+}
+
+static int
+vie_peek(struct vie *vie, uint8_t *x)
+{
+
+ if (vie->num_processed < vie->num_valid) {
+ *x = vie->inst[vie->num_processed];
+ return (0);
+ } else
+ return (-1);
+}
+
+static void
+vie_advance(struct vie *vie)
+{
+
+ vie->num_processed++;
+}
+
+static int
+decode_rex(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ if (x >= 0x40 && x <= 0x4F) {
+ vie->rex_w = x & 0x8 ? 1 : 0;
+ vie->rex_r = x & 0x4 ? 1 : 0;
+ vie->rex_x = x & 0x2 ? 1 : 0;
+ vie->rex_b = x & 0x1 ? 1 : 0;
+
+ vie_advance(vie);
+ }
+
+ return (0);
+}
+
+static int
+decode_opcode(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ vie->op = one_byte_opcodes[x];
+
+ if (vie->op.op_type == VIE_OP_TYPE_NONE)
+ return (-1);
+
+ vie_advance(vie);
+ return (0);
+}
+
+/*
+ * XXX assuming 32-bit or 64-bit guest
+ */
+static int
+decode_modrm(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ vie->mod = (x >> 6) & 0x3;
+ vie->rm = (x >> 0) & 0x7;
+ vie->reg = (x >> 3) & 0x7;
+
+ /*
+ * A direct addressing mode makes no sense in the context of an EPT
+ * fault. There has to be a memory access involved to cause the
+ * EPT fault.
+ */
+ if (vie->mod == VIE_MOD_DIRECT)
+ return (-1);
+
+ if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
+ (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
+ /*
+ * Table 2-5: Special Cases of REX Encodings
+ *
+ * mod=0, r/m=5 is used in the compatibility mode to
+ * indicate a disp32 without a base register.
+ *
+ * mod!=3, r/m=4 is used in the compatibility mode to
+ * indicate that the SIB byte is present.
+ *
+ * The 'b' bit in the REX prefix is don't care in
+ * this case.
+ */
+ } else {
+ vie->rm |= (vie->rex_b << 3);
+ }
+
+ vie->reg |= (vie->rex_r << 3);
+
+ /* SIB */
+ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
+ goto done;
+
+ vie->base_register = gpr_map[vie->rm];
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ case VIE_MOD_INDIRECT:
+ if (vie->rm == VIE_RM_DISP32) {
+ vie->disp_bytes = 4;
+ vie->base_register = VM_REG_LAST; /* no base */
+ }
+ break;
+ }
+
+ /* Figure out immediate operand size (if any) */
+ if (vie->op.op_flags & VIE_OP_F_IMM)
+ vie->imm_bytes = 4;
+ else if (vie->op.op_flags & VIE_OP_F_IMM8)
+ vie->imm_bytes = 1;
+
+done:
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_sib(struct vie *vie)
+{
+ uint8_t x;
+
+ /* Proceed only if SIB byte is present */
+ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
+ return (0);
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ /* De-construct the SIB byte */
+ vie->ss = (x >> 6) & 0x3;
+ vie->index = (x >> 3) & 0x7;
+ vie->base = (x >> 0) & 0x7;
+
+ /* Apply the REX prefix modifiers */
+ vie->index |= vie->rex_x << 3;
+ vie->base |= vie->rex_b << 3;
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ }
+
+ if (vie->mod == VIE_MOD_INDIRECT &&
+ (vie->base == 5 || vie->base == 13)) {
+ /*
+		 * Special case: the base register is unused when mod = 0
+		 * and base = %rbp or %r13.
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ vie->disp_bytes = 4;
+ } else {
+ vie->base_register = gpr_map[vie->base];
+ }
+
+ /*
+ * All encodings of 'index' are valid except for %rsp (4).
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ if (vie->index != 4)
+ vie->index_register = gpr_map[vie->index];
+
+ /* 'scale' makes sense only in the context of an index register */
+ if (vie->index_register < VM_REG_LAST)
+ vie->scale = 1 << vie->ss;
+
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_displacement(struct vie *vie)
+{
+ int n, i;
+ uint8_t x;
+
+ union {
+ char buf[4];
+ int8_t signed8;
+ int32_t signed32;
+ } u;
+
+ if ((n = vie->disp_bytes) == 0)
+ return (0);
+
+ if (n != 1 && n != 4)
+ panic("decode_displacement: invalid disp_bytes %d", n);
+
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+
+ if (n == 1)
+ vie->displacement = u.signed8; /* sign-extended */
+ else
+ vie->displacement = u.signed32; /* sign-extended */
+
+ return (0);
+}
+
+static int
+decode_immediate(struct vie *vie)
+{
+ int i, n;
+ uint8_t x;
+ union {
+ char buf[4];
+ int8_t signed8;
+ int32_t signed32;
+ } u;
+
+ if ((n = vie->imm_bytes) == 0)
+ return (0);
+
+ if (n != 1 && n != 4)
+ panic("decode_immediate: invalid imm_bytes %d", n);
+
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+
+ if (n == 1)
+ vie->immediate = u.signed8; /* sign-extended */
+ else
+ vie->immediate = u.signed32; /* sign-extended */
+
+ return (0);
+}
+
+#define VERIFY_GLA
+/*
+ * Verify that the 'guest linear address' provided as collateral of the nested
+ * page table fault matches with our instruction decoding.
+ */
+#ifdef VERIFY_GLA
+static int
+verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+ int error;
+ uint64_t base, idx;
+
+ base = 0;
+ if (vie->base_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->base_register, &base);
+ if (error) {
+ printf("verify_gla: error %d getting base reg %d\n",
+ error, vie->base_register);
+ return (-1);
+ }
+ }
+
+ idx = 0;
+ if (vie->index_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->index_register, &idx);
+ if (error) {
+ printf("verify_gla: error %d getting index reg %d\n",
+ error, vie->index_register);
+ return (-1);
+ }
+ }
+
+ if (base + vie->scale * idx + vie->displacement != gla) {
+ printf("verify_gla mismatch: "
+ "base(0x%0lx), scale(%d), index(0x%0lx), "
+ "disp(0x%0lx), gla(0x%0lx)\n",
+ base, vie->scale, idx, vie->displacement, gla);
+ return (-1);
+ }
+
+ return (0);
+}
+#endif /* VERIFY_GLA */
+
+int
+vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+
+ if (decode_rex(vie))
+ return (-1);
+
+ if (decode_opcode(vie))
+ return (-1);
+
+ if (decode_modrm(vie))
+ return (-1);
+
+ if (decode_sib(vie))
+ return (-1);
+
+ if (decode_displacement(vie))
+ return (-1);
+
+ if (decode_immediate(vie))
+ return (-1);
+
+#ifdef VERIFY_GLA
+ if (verify_gla(vm, cpuid, gla, vie))
+ return (-1);
+#endif
+
+ vie->decoded = 1; /* success */
+
+ return (0);
+}
+#endif /* _KERNEL */
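
As a concrete instance of the ModRM arithmetic in decode_modrm(), take the two-byte instruction 89 08 (mov %ecx,(%rax)), which is exactly the kind of store to a memory-mapped region this decoder targets. A standalone illustration using the same shifts and masks:

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint8_t modrm = 0x08;			/* second byte of "89 08" */
		uint8_t mod = (modrm >> 6) & 0x3;	/* 0: indirect, no disp */
		uint8_t reg = (modrm >> 3) & 0x7;	/* 1: %rcx (gpr_map[1]) */
		uint8_t rm  = (modrm >> 0) & 0x7;	/* 0: %rax (gpr_map[0]) */

		printf("mod=%u reg=%u rm=%u\n", mod, reg, rm);
		return (0);
	}

With no REX prefix, opcode 0x89 and these fields make emulate_mov() read %rcx and write its low 4 bytes to the guest physical address.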
diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c
new file mode 100644
index 0000000..643d326
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ipi.c
@@ -0,0 +1,93 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/bus.h>
+
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
+#include <machine/segments.h>
+#include <machine/md_var.h>
+
+#include <machine/vmm.h>
+#include "vmm_ipi.h"
+
+extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn);
+
+/*
+ * The default is to use the IPI_AST to interrupt a vcpu.
+ */
+int vmm_ipinum = IPI_AST;
+
+CTASSERT(APIC_SPURIOUS_INT == 255);
+
+void
+vmm_ipi_init(void)
+{
+ int idx;
+ uintptr_t func;
+ struct gate_descriptor *ip;
+
+ /*
+ * Search backwards from the highest IDT vector available for use
+ * as our IPI vector. We install the 'justreturn' handler at that
+ * vector and use it to interrupt the vcpus.
+ *
+ * We do this because the IPI_AST is heavyweight and saves all
+	 * registers in the trapframe. This is overkill for our use case,
+	 * which is simply to EOI the interrupt and return.
+ */
+ idx = APIC_SPURIOUS_INT;
+ while (--idx >= APIC_IPI_INTS) {
+ ip = &idt[idx];
+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
+ if (func == (uintptr_t)&IDTVEC(rsvd)) {
+ vmm_ipinum = idx;
+ setidt(vmm_ipinum, IDTVEC(justreturn), SDT_SYSIGT,
+ SEL_KPL, 0);
+ break;
+ }
+ }
+
+ if (vmm_ipinum != IPI_AST && bootverbose) {
+ printf("vmm_ipi_init: installing ipi handler to interrupt "
+ "vcpus at vector %d\n", vmm_ipinum);
+ }
+}
+
+void
+vmm_ipi_cleanup(void)
+{
+ if (vmm_ipinum != IPI_AST)
+ setidt(vmm_ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+}
diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h
new file mode 100644
index 0000000..91552e3
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ipi.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_IPI_H_
+#define _VMM_IPI_H_
+
+struct vm;
+
+extern int vmm_ipinum;
+
+void vmm_ipi_init(void);
+void vmm_ipi_cleanup(void);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_ktr.h b/sys/amd64/vmm/vmm_ktr.h
new file mode 100644
index 0000000..e691c61
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ktr.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_KTR_H_
+#define _VMM_KTR_H_
+
+#include <sys/ktr.h>
+#include <sys/pcpu.h>
+
+#define KTR_VMM KTR_GEN
+
+#define VMM_CTR0(vm, vcpuid, format) \
+CTR3(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu)
+
+#define VMM_CTR1(vm, vcpuid, format, p1) \
+CTR4(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1))
+
+#define VMM_CTR2(vm, vcpuid, format, p1, p2) \
+CTR5(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1), (p2))
+
+#define VMM_CTR3(vm, vcpuid, format, p1, p2, p3) \
+CTR6(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1), (p2), (p3))
+#endif
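
Each VMM_CTRn macro expands to the corresponding ktr(4) CTRn macro with a "vm name-vcpu(hostcpu)" prefix, so the tag comes for free at every call site. A hypothetical call site (the event text is invented for illustration):

	VMM_CTR1(vm, vcpuid, "injecting interrupt at vector %d", vector);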
diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c
new file mode 100644
index 0000000..d024b71
--- /dev/null
+++ b/sys/amd64/vmm/vmm_lapic.c
@@ -0,0 +1,201 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <x86/specialreg.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_ipi.h"
+#include "vmm_lapic.h"
+#include "vlapic.h"
+
+int
+lapic_pending_intr(struct vm *vm, int cpu)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ return (vlapic_pending_intr(vlapic));
+}
+
+void
+lapic_intr_accepted(struct vm *vm, int cpu, int vector)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ vlapic_intr_accepted(vlapic, vector);
+}
+
+int
+lapic_set_intr(struct vm *vm, int cpu, int vector)
+{
+ struct vlapic *vlapic;
+
+ if (cpu < 0 || cpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (vector < 32 || vector > 255)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ vlapic_set_intr_ready(vlapic, vector);
+
+ vm_interrupt_hostcpu(vm, cpu);
+
+ return (0);
+}
+
+int
+lapic_timer_tick(struct vm *vm, int cpu)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ return (vlapic_timer_tick(vlapic));
+}
+
+static boolean_t
+x2apic_msr(u_int msr)
+{
+ if (msr >= 0x800 && msr <= 0xBFF)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static u_int
+x2apic_msr_to_regoff(u_int msr)
+{
+
+ return ((msr - 0x800) << 4);
+}
+
+boolean_t
+lapic_msr(u_int msr)
+{
+
+ if (x2apic_msr(msr) || (msr == MSR_APICBASE))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+int
+lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval)
+{
+ int error;
+ u_int offset;
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (msr == MSR_APICBASE) {
+ *rval = vlapic_get_apicbase(vlapic);
+ error = 0;
+ } else {
+ offset = x2apic_msr_to_regoff(msr);
+ error = vlapic_op_mem_read(vlapic, offset, DWORD, rval);
+ }
+
+ return (error);
+}
+
+int
+lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val)
+{
+ int error;
+ u_int offset;
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (msr == MSR_APICBASE) {
+ vlapic_set_apicbase(vlapic, val);
+ error = 0;
+ } else {
+ offset = x2apic_msr_to_regoff(msr);
+ error = vlapic_op_mem_write(vlapic, offset, DWORD, val);
+ }
+
+ return (error);
+}
+
+int
+lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
+
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
+ */
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_write(vlapic, off, DWORD, wval);
+ return (error);
+}
+
+int
+lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
+
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
+ */
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_read(vlapic, off, DWORD, rval);
+ return (error);
+}
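
The x2APIC MSR range maps linearly onto the legacy MMIO register layout: x2apic_msr_to_regoff() shifts by 4 because consecutive x2APIC MSRs correspond to 16-byte-spaced memory-mapped registers, which is also why the MMIO handlers above insist on 16-byte alignment. A quick check of the mapping (MSR 0x808 is the x2APIC TPR, whose MMIO twin sits at offset 0x80):

	#include <assert.h>

	static unsigned
	x2apic_msr_to_regoff(unsigned msr)	/* mirrors the static above */
	{
		return ((msr - 0x800) << 4);
	}

	int
	main(void)
	{
		assert(x2apic_msr_to_regoff(0x808) == 0x80);
		/* every offset is 16-byte aligned, matching the (off & 0xf) test */
		assert((x2apic_msr_to_regoff(0x8ff) & 0xf) == 0);
		return (0);
	}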
diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h
new file mode 100644
index 0000000..a79912e
--- /dev/null
+++ b/sys/amd64/vmm/vmm_lapic.h
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_LAPIC_H_
+#define _VMM_LAPIC_H_
+
+struct vm;
+
+boolean_t lapic_msr(u_int num);
+int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval);
+int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval);
+
+int lapic_mmio_read(void *vm, int cpu, uint64_t gpa,
+ uint64_t *rval, int size, void *arg);
+int lapic_mmio_write(void *vm, int cpu, uint64_t gpa,
+ uint64_t wval, int size, void *arg);
+
+int lapic_timer_tick(struct vm *vm, int cpu);
+
+/*
+ * Returns a vector between 32 and 255 if an interrupt is pending in the
+ * IRR that can be delivered based on the current state of ISR and TPR.
+ *
+ * Note that the vector does not automatically transition to the ISR as a
+ * result of calling this function.
+ *
+ * Returns -1 if there is no eligible vector that can be delivered to the
+ * guest at this time.
+ */
+int lapic_pending_intr(struct vm *vm, int cpu);
+
+/*
+ * Transition 'vector' from IRR to ISR. This function is called with the
+ * vector returned by 'lapic_pending_intr()' when the guest is able to
+ * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that
+ * block interrupt delivery).
+ */
+void lapic_intr_accepted(struct vm *vm, int cpu, int vector);
+
+/*
+ * Signals to the LAPIC that an interrupt at 'vector' needs to be generated
+ * for the 'cpu'; the pending state is recorded in the IRR.
+ */
+int lapic_set_intr(struct vm *vm, int cpu, int vector);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c
new file mode 100644
index 0000000..04f99b1
--- /dev/null
+++ b/sys/amd64/vmm/vmm_mem.c
@@ -0,0 +1,135 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/linker.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+
+#include <machine/md_var.h>
+#include <machine/metadata.h>
+#include <machine/pc/bios.h>
+#include <machine/vmparam.h>
+#include <machine/pmap.h>
+
+#include "vmm_util.h"
+#include "vmm_mem.h"
+
+SYSCTL_DECL(_hw_vmm);
+
+static u_long pages_allocated;
+SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD,
+ &pages_allocated, 0, "4KB pages allocated");
+
+static void
+update_pages_allocated(int howmany)
+{
+ pages_allocated += howmany; /* XXX locking? */
+}
+
+int
+vmm_mem_init(void)
+{
+
+ return (0);
+}
+
+vm_paddr_t
+vmm_mem_alloc(size_t size)
+{
+ int flags;
+ vm_page_t m;
+ vm_paddr_t pa;
+
+ if (size != PAGE_SIZE)
+ panic("vmm_mem_alloc: invalid allocation size %lu", size);
+
+ flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
+ VM_ALLOC_ZERO;
+
+ while (1) {
+ /*
+ * XXX need policy to determine when to back off the allocation
+ */
+ m = vm_page_alloc(NULL, 0, flags);
+ if (m == NULL)
+ VM_WAIT;
+ else
+ break;
+ }
+
+ pa = VM_PAGE_TO_PHYS(m);
+
+ if ((m->flags & PG_ZERO) == 0)
+ pagezero((void *)PHYS_TO_DMAP(pa));
+ m->valid = VM_PAGE_BITS_ALL;
+
+ update_pages_allocated(1);
+
+ return (pa);
+}
+
+void
+vmm_mem_free(vm_paddr_t base, size_t length)
+{
+ vm_page_t m;
+
+ if (base & PAGE_MASK) {
+ panic("vmm_mem_free: base 0x%0lx must be aligned on a "
+ "0x%0x boundary\n", base, PAGE_SIZE);
+ }
+
+ if (length != PAGE_SIZE)
+ panic("vmm_mem_free: invalid length %lu", length);
+
+ m = PHYS_TO_VM_PAGE(base);
+ m->wire_count--;
+ vm_page_free(m);
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+
+ update_pages_allocated(-1);
+}
+
+vm_paddr_t
+vmm_mem_maxaddr(void)
+{
+
+ return (ptoa(Maxmem));
+}
diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h
new file mode 100644
index 0000000..7d45c74
--- /dev/null
+++ b/sys/amd64/vmm/vmm_mem.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_MEM_H_
+#define _VMM_MEM_H_
+
+int vmm_mem_init(void);
+vm_paddr_t vmm_mem_alloc(size_t size);
+void vmm_mem_free(vm_paddr_t start, size_t size);
+vm_paddr_t vmm_mem_maxaddr(void);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c
new file mode 100644
index 0000000..d97c819
--- /dev/null
+++ b/sys/amd64/vmm/vmm_msr.c
@@ -0,0 +1,254 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <machine/specialreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_lapic.h"
+#include "vmm_msr.h"
+
+#define VMM_MSR_F_EMULATE 0x01
+#define VMM_MSR_F_READONLY 0x02
+#define VMM_MSR_F_INVALID 0x04 /* guest_msr_valid() can override this */
+
+struct vmm_msr {
+ int num;
+ int flags;
+ uint64_t hostval;
+};
+
+static struct vmm_msr vmm_msr[] = {
+ { MSR_LSTAR, 0 },
+ { MSR_CSTAR, 0 },
+ { MSR_STAR, 0 },
+ { MSR_SF_MASK, 0 },
+ { MSR_PAT, VMM_MSR_F_EMULATE | VMM_MSR_F_INVALID },
+ { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE },
+ { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
+};
+
+#define vmm_msr_num (sizeof(vmm_msr) / sizeof(vmm_msr[0]))
+CTASSERT(VMM_MSR_NUM >= vmm_msr_num);
+
+#define readonly_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0)
+
+#define emulated_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0)
+
+#define invalid_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_INVALID) != 0)
+
+void
+vmm_msr_init(void)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ /*
+ * XXX this assumes that the value of the host msr does not
+ * change after we have cached it.
+ */
+ vmm_msr[i].hostval = rdmsr(vmm_msr[i].num);
+ }
+}
+
+void
+guest_msrs_init(struct vm *vm, int cpu)
+{
+ int i;
+ uint64_t *guest_msrs;
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ switch (vmm_msr[i].num) {
+ case MSR_LSTAR:
+ case MSR_CSTAR:
+ case MSR_STAR:
+ case MSR_SF_MASK:
+ case MSR_BIOS_SIGN:
+ case MSR_MCG_CAP:
+ guest_msrs[i] = 0;
+ break;
+ case MSR_PAT:
+ guest_msrs[i] = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+ break;
+ default:
+ panic("guest_msrs_init: missing initialization for msr "
+ "0x%0x", vmm_msr[i].num);
+ }
+ }
+}
+
+static int
+msr_num_to_idx(u_int num)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++)
+ if (vmm_msr[i].num == num)
+ return (i);
+
+ return (-1);
+}
+
+int
+emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val)
+{
+ int idx;
+ uint64_t *guest_msrs;
+
+ if (lapic_msr(num))
+ return (lapic_wrmsr(vm, cpu, num, val));
+
+ idx = msr_num_to_idx(num);
+ if (idx < 0 || invalid_msr(idx))
+ return (EINVAL);
+
+ if (!readonly_msr(idx)) {
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ /* Stash the value */
+ guest_msrs[idx] = val;
+
+ /* Update processor state for non-emulated MSRs */
+ if (!emulated_msr(idx))
+ wrmsr(vmm_msr[idx].num, val);
+ }
+
+ return (0);
+}
+
+int
+emulate_rdmsr(struct vm *vm, int cpu, u_int num)
+{
+ int error, idx;
+ uint32_t eax, edx;
+ uint64_t result, *guest_msrs;
+
+ if (lapic_msr(num)) {
+ error = lapic_rdmsr(vm, cpu, num, &result);
+ goto done;
+ }
+
+ idx = msr_num_to_idx(num);
+ if (idx < 0 || invalid_msr(idx)) {
+ error = EINVAL;
+ goto done;
+ }
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+ result = guest_msrs[idx];
+
+ /*
+	 * If this is not an emulated MSR, make sure that the processor
+ * state matches our cached state.
+ */
+ if (!emulated_msr(idx) && (rdmsr(num) != result)) {
+ panic("emulate_rdmsr: msr 0x%0x has inconsistent cached "
+ "(0x%016lx) and actual (0x%016lx) values", num,
+ result, rdmsr(num));
+ }
+
+ error = 0;
+
+done:
+ if (error == 0) {
+ eax = result;
+ edx = result >> 32;
+ error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax);
+ if (error)
+ panic("vm_set_register(rax) error %d", error);
+ error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx);
+ if (error)
+ panic("vm_set_register(rdx) error %d", error);
+ }
+ return (error);
+}
+
+void
+restore_guest_msrs(struct vm *vm, int cpu)
+{
+ int i;
+ uint64_t *guest_msrs;
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ else
+ wrmsr(vmm_msr[i].num, guest_msrs[i]);
+ }
+}
+
+void
+restore_host_msrs(struct vm *vm, int cpu)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ else
+ wrmsr(vmm_msr[i].num, vmm_msr[i].hostval);
+ }
+}
+
+/*
+ * Must be called by the CPU-specific code before any guests are
+ * created
+ */
+void
+guest_msr_valid(int msr)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (vmm_msr[i].num == msr && invalid_msr(i)) {
+ vmm_msr[i].flags &= ~VMM_MSR_F_INVALID;
+ }
+ }
+}
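
emulate_rdmsr() hands the 64-bit result back the way the hardware RDMSR instruction does, split across %eax (low half) and %edx (high half). A quick illustration of that split:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t result = 0x1122334455667788UL;
		uint32_t eax = result;		/* low 32 bits, as in emulate_rdmsr() */
		uint32_t edx = result >> 32;	/* high 32 bits */

		printf("eax=0x%08x edx=0x%08x\n", eax, edx);
		return (0);
	}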
diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h
new file mode 100644
index 0000000..8a1fda3
--- /dev/null
+++ b/sys/amd64/vmm/vmm_msr.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_MSR_H_
+#define _VMM_MSR_H_
+
+#define VMM_MSR_NUM 16
+struct vm;
+
+void vmm_msr_init(void);
+int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val);
+int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr);
+void guest_msrs_init(struct vm *vm, int cpu);
+void guest_msr_valid(int msr);
+void restore_host_msrs(struct vm *vm, int cpu);
+void restore_guest_msrs(struct vm *vm, int cpu);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c
new file mode 100644
index 0000000..ae60979
--- /dev/null
+++ b/sys/amd64/vmm/vmm_stat.c
@@ -0,0 +1,104 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/smp.h>
+
+#include <machine/vmm.h>
+#include "vmm_stat.h"
+
+static int vstnum;
+static struct vmm_stat_type *vsttab[MAX_VMM_STAT_TYPES];
+
+static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat");
+
+void
+vmm_stat_init(void *arg)
+{
+ struct vmm_stat_type *vst = arg;
+
+ /* We require all stats to identify themselves with a description */
+ if (vst->desc == NULL)
+ return;
+
+ if (vstnum >= MAX_VMM_STAT_TYPES) {
+ printf("Cannot accomodate vmm stat type \"%s\"!\n", vst->desc);
+ return;
+ }
+
+ vst->index = vstnum;
+ vsttab[vstnum++] = vst;
+}
+
+int
+vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf)
+{
+ int i;
+ uint64_t *stats;
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ stats = vcpu_stats(vm, vcpu);
+ for (i = 0; i < vstnum; i++)
+ buf[i] = stats[i];
+ *num_stats = vstnum;
+ return (0);
+}
+
+void *
+vmm_stat_alloc(void)
+{
+ u_long size;
+
+ size = vstnum * sizeof(uint64_t);
+
+ return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK));
+}
+
+void
+vmm_stat_free(void *vp)
+{
+ free(vp, M_VMM_STAT);
+}
+
+const char *
+vmm_stat_desc(int index)
+{
+
+ if (index >= 0 && index < vstnum)
+ return (vsttab[index]->desc);
+ else
+ return (NULL);
+}
diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h
new file mode 100644
index 0000000..7c075a6
--- /dev/null
+++ b/sys/amd64/vmm/vmm_stat.h
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_STAT_H_
+#define _VMM_STAT_H_
+
+struct vm;
+
+#define MAX_VMM_STAT_TYPES 64 /* arbitrary */
+
+struct vmm_stat_type {
+ const char *desc; /* description of statistic */
+ int index; /* position in the stats buffer */
+};
+
+void vmm_stat_init(void *arg);
+
+#define VMM_STAT_DEFINE(type, desc) \
+ struct vmm_stat_type type[1] = { \
+ { desc, -1 } \
+ }; \
+ SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type)
+
+void *vmm_stat_alloc(void);
+void vmm_stat_free(void *vp);
+
+/*
+ * 'buf' should be large enough to fit 'MAX_VMM_STAT_TYPES' entries
+ */
+int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf);
+const char *vmm_stat_desc(int index);
+
+static __inline void
+vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x)
+{
+#ifdef VMM_KEEP_STATS
+ uint64_t *stats = vcpu_stats(vm, vcpu);
+ if (vst->index >= 0)
+ stats[vst->index] += x;
+#endif
+}
+
+#endif
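
For reference, a consumer of this interface defines a stat type with
VMM_STAT_DEFINE() and bumps it from a hot path. A minimal sketch in kernel
context (the stat name and call site here are hypothetical, not part of
this change):

    #include <sys/param.h>
    #include <sys/kernel.h>

    #include "vmm_stat.h"

    VMM_STAT_DEFINE(VMEXIT_TOTAL, "total number of vm exits");

    static void
    example_exit_handler(struct vm *vm, int vcpu)
    {
            /* Compiles to a no-op unless built with -DVMM_KEEP_STATS. */
            vmm_stat_incr(vm, vcpu, VMEXIT_TOTAL, 1);
    }

The SYSINIT emitted by the macro runs vmm_stat_init() at module load, which
assigns the slot in the per-vcpu stats buffer that vmm_stat_incr() updates.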
diff --git a/sys/amd64/vmm/vmm_support.S b/sys/amd64/vmm/vmm_support.S
new file mode 100644
index 0000000..2afc608
--- /dev/null
+++ b/sys/amd64/vmm/vmm_support.S
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#define LOCORE
+
+#include <machine/asmacros.h>
+
+#define LA_EOI 0xB0
+
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(justreturn)
+ pushq %rax
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax)
+ popq %rax
+ iretq
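
The justreturn vector does nothing beyond acknowledging the interrupt:
LA_EOI (0xB0) is the byte offset of the local APIC EOI register, and the
IPI itself is presumably all that is needed to kick the target CPU out of
whatever it was doing (e.g. guest context). A rough C equivalent of the
handler body, as a sketch:

    #include <stdint.h>

    /* 'lapic' stands for the kernel's local APIC mapping, the same
     * pointer loaded by 'movq lapic, %rax' above. */
    static void
    justreturn_in_c(volatile uint32_t *lapic)
    {
            lapic[0xB0 / sizeof(uint32_t)] = 0;     /* signal EOI */
    }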
diff --git a/sys/amd64/vmm/vmm_util.c b/sys/amd64/vmm/vmm_util.c
new file mode 100644
index 0000000..f245f92
--- /dev/null
+++ b/sys/amd64/vmm/vmm_util.c
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/libkern.h>
+
+#include <machine/md_var.h>
+
+#include "vmm_util.h"
+
+boolean_t
+vmm_is_intel(void)
+{
+
+ if (strcmp(cpu_vendor, "GenuineIntel") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_is_amd(void)
+{
+ if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_supports_1G_pages(void)
+{
+ unsigned int regs[4];
+
+ /*
+ * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages
+ *
+ * Both Intel and AMD support this bit.
+ */
+ if (cpu_exthigh >= 0x80000001) {
+ do_cpuid(0x80000001, regs);
+ if (regs[3] & (1 << 26))
+ return (TRUE);
+ }
+ return (FALSE);
+}
+
+#include <sys/proc.h>
+#include <machine/frame.h>
+#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x))
+#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x))
+void
+dump_trapframe(struct trapframe *tf)
+{
+ DUMP_REG(rdi);
+ DUMP_REG(rsi);
+ DUMP_REG(rdx);
+ DUMP_REG(rcx);
+ DUMP_REG(r8);
+ DUMP_REG(r9);
+ DUMP_REG(rax);
+ DUMP_REG(rbx);
+ DUMP_REG(rbp);
+ DUMP_REG(r10);
+ DUMP_REG(r11);
+ DUMP_REG(r12);
+ DUMP_REG(r13);
+ DUMP_REG(r14);
+ DUMP_REG(r15);
+ DUMP_REG(trapno);
+ DUMP_REG(addr);
+ DUMP_REG(flags);
+ DUMP_REG(err);
+ DUMP_REG(rip);
+ DUMP_REG(rflags);
+ DUMP_REG(rsp);
+ DUMP_SEG(cs);
+ DUMP_SEG(ss);
+ DUMP_SEG(fs);
+ DUMP_SEG(gs);
+ DUMP_SEG(es);
+ DUMP_SEG(ds);
+}
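
DUMP_REG and DUMP_SEG lean on two preprocessor features: #x stringizes the
argument (adjacent string literals then concatenate), and tf->tf_ ## x
pastes it into the member name. For example, DUMP_REG(rip) expands to:

    printf("rip" "\t\t0x%016lx\n", (long)(tf->tf_rip));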
diff --git a/sys/amd64/vmm/vmm_util.h b/sys/amd64/vmm/vmm_util.h
new file mode 100644
index 0000000..7f82332
--- /dev/null
+++ b/sys/amd64/vmm/vmm_util.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_UTIL_H_
+#define _VMM_UTIL_H_
+
+struct trapframe;
+
+boolean_t vmm_is_intel(void);
+boolean_t vmm_is_amd(void);
+boolean_t vmm_supports_1G_pages(void);
+
+void dump_trapframe(struct trapframe *tf);
+
+#endif
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
new file mode 100644
index 0000000..94abe09
--- /dev/null
+++ b/sys/amd64/vmm/x86.c
@@ -0,0 +1,202 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+#include <machine/vmm.h>
+
+#include "x86.h"
+
+#define CPUID_VM_HIGH 0x40000000
+
+static const char bhyve_id[12] = "bhyve bhyve ";
+
+int
+x86_emulate_cpuid(struct vm *vm, int vcpu_id,
+ uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+ int error;
+ unsigned int func, regs[4];
+ enum x2apic_state x2apic_state;
+
+ func = *eax;
+
+ /*
+ * Requests for invalid CPUID levels should map to the highest
+ * available level instead.
+ */
+ if (cpu_exthigh != 0 && *eax >= 0x80000000) {
+ if (*eax > cpu_exthigh)
+ *eax = cpu_exthigh;
+ } else if (*eax >= 0x40000000) {
+ if (*eax > CPUID_VM_HIGH)
+ *eax = CPUID_VM_HIGH;
+ } else if (*eax > cpu_high) {
+ *eax = cpu_high;
+ }
+
+ /*
+ * In general the approach used for CPU topology is to
+ * advertise a flat topology in which each vCPU is a separate
+ * package, with no multi-core or SMT.
+ */
+ switch (func) {
+ case CPUID_0000_0000:
+ case CPUID_0000_0002:
+ case CPUID_0000_0003:
+ case CPUID_0000_000A:
+ cpuid_count(*eax, *ecx, regs);
+ break;
+
+ case CPUID_8000_0000:
+ case CPUID_8000_0001:
+ case CPUID_8000_0002:
+ case CPUID_8000_0003:
+ case CPUID_8000_0004:
+ case CPUID_8000_0006:
+ case CPUID_8000_0007:
+ case CPUID_8000_0008:
+ cpuid_count(*eax, *ecx, regs);
+ break;
+
+ case CPUID_0000_0001:
+ do_cpuid(1, regs);
+
+ error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
+ if (error) {
+ panic("x86_emulate_cpuid: error %d "
+ "fetching x2apic state", error);
+ }
+
+ /*
+ * Override the APIC ID only in ebx
+ */
+ regs[1] &= ~(CPUID_LOCAL_APIC_ID);
+ regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
+
+ /*
+ * Don't expose VMX, SpeedStep or TM2 capability.
+ * Advertise x2APIC capability and Hypervisor guest.
+ */
+ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
+
+ regs[2] |= CPUID2_HV;
+
+ if (x2apic_state != X2APIC_DISABLED)
+ regs[2] |= CPUID2_X2APIC;
+
+ /*
+ * Hide xsave/osxsave/avx until the FPU save/restore
+ * issues are resolved
+ */
+ regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE |
+ CPUID2_AVX);
+
+ /*
+ * Hide monitor/mwait until we know how to deal with
+ * these instructions.
+ */
+ regs[2] &= ~CPUID2_MON;
+
+ /*
+ * Hide thermal monitoring
+ */
+ regs[3] &= ~(CPUID_ACPI | CPUID_TM);
+
+ /*
+ * Machine check handling is done in the host.
+ * Hide MTRR capability.
+ */
+ regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
+
+ /*
+ * Disable multi-core.
+ */
+ regs[1] &= ~CPUID_HTT_CORES;
+ regs[3] &= ~CPUID_HTT;
+ break;
+
+ case CPUID_0000_0004:
+ do_cpuid(4, regs);
+
+ /*
+ * Do not expose topology.
+ */
+ regs[0] &= 0xffff8000;
+ regs[0] |= 0x04008000;
+ break;
+
+ case CPUID_0000_0006:
+ case CPUID_0000_0007:
+ /*
+ * Handle the access, but report 0 for
+ * all options
+ */
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = 0;
+ regs[3] = 0;
+ break;
+
+ case CPUID_0000_000B:
+ /*
+ * Processor topology enumeration
+ */
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = *ecx & 0xff;
+ regs[3] = vcpu_id;
+ break;
+
+ case 0x40000000:
+ regs[0] = CPUID_VM_HIGH;
+ bcopy(bhyve_id, &regs[1], 4);
+ bcopy(bhyve_id + 4, &regs[2], 4);
+ bcopy(bhyve_id + 8, &regs[3], 4);
+ break;
+ default:
+ /* XXX: Leaf 5? */
+ return (0);
+ }
+
+ *eax = regs[0];
+ *ebx = regs[1];
+ *ecx = regs[2];
+ *edx = regs[3];
+ return (1);
+}
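
A guest can detect this hypervisor by checking the CPUID2_HV bit advertised
in leaf 1 and then querying leaf 0x40000000, where the 12-byte bhyve_id
signature comes back in EBX/ECX/EDX. A user-level sketch (the cpuid wrapper
is illustrative, not part of this change):

    #include <stdio.h>
    #include <string.h>

    static void
    cpuid(unsigned leaf, unsigned regs[4])
    {
            __asm __volatile("cpuid"
                : "=a" (regs[0]), "=b" (regs[1]),
                  "=c" (regs[2]), "=d" (regs[3])
                : "0" (leaf), "2" (0));
    }

    int
    main(void)
    {
            unsigned regs[4];
            char id[13];

            cpuid(0x40000000, regs);
            memcpy(id, &regs[1], 4);
            memcpy(id + 4, &regs[2], 4);
            memcpy(id + 8, &regs[3], 4);
            id[12] = '\0';
            printf("hypervisor: \"%s\"\n", id);     /* "bhyve bhyve " */
            return (0);
    }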
diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h
new file mode 100644
index 0000000..368e967
--- /dev/null
+++ b/sys/amd64/vmm/x86.h
@@ -0,0 +1,64 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _X86_H_
+#define _X86_H_
+
+#define CPUID_0000_0000 (0x0)
+#define CPUID_0000_0001 (0x1)
+#define CPUID_0000_0002 (0x2)
+#define CPUID_0000_0003 (0x3)
+#define CPUID_0000_0004 (0x4)
+#define CPUID_0000_0006 (0x6)
+#define CPUID_0000_0007 (0x7)
+#define CPUID_0000_000A (0xA)
+#define CPUID_0000_000B (0xB)
+#define CPUID_8000_0000 (0x80000000)
+#define CPUID_8000_0001 (0x80000001)
+#define CPUID_8000_0002 (0x80000002)
+#define CPUID_8000_0003 (0x80000003)
+#define CPUID_8000_0004 (0x80000004)
+#define CPUID_8000_0006 (0x80000006)
+#define CPUID_8000_0007 (0x80000007)
+#define CPUID_8000_0008 (0x80000008)
+
+/*
+ * CPUID instruction Fn0000_0001:
+ */
+#define CPUID_0000_0001_APICID_MASK (0xff<<24)
+#define CPUID_0000_0001_APICID_SHIFT 24
+
+/*
+ * CPUID instruction Fn0000_0001 ECX
+ */
+#define CPUID_0000_0001_FEAT0_VMX (1<<5)
+
+int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx,
+ uint32_t *ecx, uint32_t *edx);
+
+#endif
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index 56c7437..f381c71 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -464,6 +464,11 @@ libkern/memset.c standard
compat/x86bios/x86bios.c optional x86bios | atkbd | dpms | vesa
contrib/x86emu/x86emu.c optional x86bios | atkbd | dpms | vesa
#
+# bvm console
+#
+dev/bvm/bvm_console.c optional bvmconsole
+dev/bvm/bvm_dbg.c optional bvmdebug
+#
# x86 shared code between IA32, AMD64 and PC98 architectures
#
x86/acpica/OsdEnvironment.c optional acpi
diff --git a/sys/dev/blackhole/blackhole.c b/sys/dev/blackhole/blackhole.c
new file mode 100644
index 0000000..9d02e50
--- /dev/null
+++ b/sys/dev/blackhole/blackhole.c
@@ -0,0 +1,129 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/bus.h>
+#include <sys/linker.h>
+#include <sys/libkern.h>
+
+#include <dev/pci/pcivar.h>
+
+static int
+linker_file_iterator(linker_file_t lf, void *arg)
+{
+ const char *file = arg;
+
+ if (strcmp(lf->filename, file) == 0)
+ return (1);
+ else
+ return (0);
+}
+
+static boolean_t
+pptdev(int bus, int slot, int func)
+{
+ int found, b, s, f, n;
+ char *val, *cp, *cp2;
+
+ /*
+ * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12"
+ */
+ found = 0;
+ cp = val = getenv("pptdevs");
+ while (cp != NULL && *cp != '\0') {
+ if ((cp2 = strchr(cp, ' ')) != NULL)
+ *cp2 = '\0';
+
+ n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
+ if (n == 3 && bus == b && slot == s && func == f) {
+ found = 1;
+ break;
+ }
+
+ if (cp2 != NULL)
+ *cp2++ = ' ';
+
+ cp = cp2;
+ }
+ freeenv(val);
+ return (found);
+}
+
+static int
+pci_blackhole_probe(device_t dev)
+{
+ int bus, slot, func;
+
+ /*
+ * If 'vmm.ko' has also been loaded then don't try to claim
+ * any pci devices.
+ */
+ if (linker_file_foreach(linker_file_iterator, "vmm.ko"))
+ return (ENXIO);
+
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+ if (pptdev(bus, slot, func))
+ return (0);
+ else
+ return (ENXIO);
+}
+
+static int
+pci_blackhole_attach(device_t dev)
+{
+ /*
+ * We never really want to claim the devices but just want to prevent
+ * other drivers from getting to them.
+ */
+ return (ENXIO);
+}
+
+static device_method_t pci_blackhole_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, pci_blackhole_probe),
+ DEVMETHOD(device_attach, pci_blackhole_attach),
+
+ { 0, 0 }
+};
+
+static driver_t pci_blackhole_driver = {
+ "blackhole",
+ pci_blackhole_methods,
+};
+
+devclass_t blackhole_devclass;
+
+DRIVER_MODULE(blackhole, pci, pci_blackhole_driver, blackhole_devclass, 0, 0);
+MODULE_DEPEND(blackhole, pci, 1, 1, 1);
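
For reference, the pptdevs kernel environment variable parsed by pptdev()
is a space-separated list of bus/slot/function triplets, set from the
loader as in the 'setenv pptdevs' example above. A stand-alone sketch of
the same parsing:

    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
            char buf[] = "1/2/3 4/5/6";     /* same format as the tunable */
            char *cp, *cp2;
            int b, s, f;

            for (cp = buf; cp != NULL && *cp != '\0'; cp = cp2) {
                    if ((cp2 = strchr(cp, ' ')) != NULL)
                            *cp2++ = '\0';
                    if (sscanf(cp, "%d/%d/%d", &b, &s, &f) == 3)
                            printf("claim %d/%d/%d\n", b, s, f);
            }
            return (0);
    }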
diff --git a/sys/dev/bvm/bvm_console.c b/sys/dev/bvm/bvm_console.c
new file mode 100644
index 0000000..a0e70e5
--- /dev/null
+++ b/sys/dev/bvm/bvm_console.c
@@ -0,0 +1,240 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/cons.h>
+#include <sys/tty.h>
+#include <sys/reboot.h>
+#include <sys/bus.h>
+
+#include <sys/kdb.h>
+#include <ddb/ddb.h>
+
+#ifndef BVMCONS_POLL_HZ
+#define BVMCONS_POLL_HZ 4
+#endif
+#define BVMBURSTLEN 16 /* max number of bytes to write in one chunk */
+
+static tsw_open_t bvm_tty_open;
+static tsw_close_t bvm_tty_close;
+static tsw_outwakeup_t bvm_tty_outwakeup;
+
+static struct ttydevsw bvm_ttydevsw = {
+ .tsw_flags = TF_NOPREFIX,
+ .tsw_open = bvm_tty_open,
+ .tsw_close = bvm_tty_close,
+ .tsw_outwakeup = bvm_tty_outwakeup,
+};
+
+static int polltime;
+static struct callout_handle bvm_timeouthandle
+ = CALLOUT_HANDLE_INITIALIZER(&bvm_timeouthandle);
+
+#if defined(KDB)
+static int alt_break_state;
+#endif
+
+#define BVM_CONS_PORT 0x220
+static int bvm_cons_port = BVM_CONS_PORT;
+
+#define BVM_CONS_SIG ('b' << 8 | 'v')
+
+static void bvm_timeout(void *);
+
+static cn_probe_t bvm_cnprobe;
+static cn_init_t bvm_cninit;
+static cn_term_t bvm_cnterm;
+static cn_getc_t bvm_cngetc;
+static cn_putc_t bvm_cnputc;
+static cn_grab_t bvm_cngrab;
+static cn_ungrab_t bvm_cnungrab;
+
+CONSOLE_DRIVER(bvm);
+
+static int
+bvm_rcons(u_char *ch)
+{
+ int c;
+
+ c = inl(bvm_cons_port);
+ if (c != -1) {
+ *ch = (u_char)c;
+ return (0);
+ } else
+ return (-1);
+}
+
+static void
+bvm_wcons(u_char ch)
+{
+
+ outl(bvm_cons_port, ch);
+}
+
+static void
+cn_drvinit(void *unused)
+{
+ struct tty *tp;
+
+ if (bvm_consdev.cn_pri != CN_DEAD &&
+ bvm_consdev.cn_name[0] != '\0') {
+ tp = tty_alloc(&bvm_ttydevsw, NULL);
+ tty_makedev(tp, NULL, "bvmcons");
+ }
+}
+
+static int
+bvm_tty_open(struct tty *tp)
+{
+ polltime = hz / BVMCONS_POLL_HZ;
+ if (polltime < 1)
+ polltime = 1;
+ bvm_timeouthandle = timeout(bvm_timeout, tp, polltime);
+
+ return (0);
+}
+
+static void
+bvm_tty_close(struct tty *tp)
+{
+
+ /* XXX Should be replaced with callout_stop(9) */
+ untimeout(bvm_timeout, tp, bvm_timeouthandle);
+}
+
+static void
+bvm_tty_outwakeup(struct tty *tp)
+{
+ int len, written;
+ u_char buf[BVMBURSTLEN];
+
+ for (;;) {
+ len = ttydisc_getc(tp, buf, sizeof(buf));
+ if (len == 0)
+ break;
+
+ written = 0;
+ while (written < len)
+ bvm_wcons(buf[written++]);
+ }
+}
+
+static void
+bvm_timeout(void *v)
+{
+ struct tty *tp;
+ int c;
+
+ tp = (struct tty *)v;
+
+ tty_lock(tp);
+ while ((c = bvm_cngetc(NULL)) != -1)
+ ttydisc_rint(tp, c, 0);
+ ttydisc_rint_done(tp);
+ tty_unlock(tp);
+
+ bvm_timeouthandle = timeout(bvm_timeout, tp, polltime);
+}
+
+static void
+bvm_cnprobe(struct consdev *cp)
+{
+ int disabled, port;
+
+ disabled = 0;
+ cp->cn_pri = CN_DEAD;
+
+ resource_int_value("bvmconsole", 0, "disabled", &disabled);
+ if (!disabled) {
+ if (resource_int_value("bvmconsole", 0, "port", &port) == 0)
+ bvm_cons_port = port;
+
+ if (inw(bvm_cons_port) == BVM_CONS_SIG)
+ cp->cn_pri = CN_REMOTE;
+ }
+}
+
+static void
+bvm_cninit(struct consdev *cp)
+{
+ int i;
+ const char *bootmsg = "Using bvm console.\n";
+
+ if (boothowto & RB_VERBOSE) {
+ for (i = 0; i < strlen(bootmsg); i++)
+ bvm_cnputc(cp, bootmsg[i]);
+ }
+
+ strcpy(cp->cn_name, "bvmcons");
+}
+
+static void
+bvm_cnterm(struct consdev *cp)
+{
+
+}
+
+static int
+bvm_cngetc(struct consdev *cp)
+{
+ unsigned char ch;
+
+ if (bvm_rcons(&ch) == 0) {
+#if defined(KDB)
+ kdb_alt_break(ch, &alt_break_state);
+#endif
+ return (ch);
+ }
+
+ return (-1);
+}
+
+static void
+bvm_cnputc(struct consdev *cp, int c)
+{
+
+ bvm_wcons(c);
+}
+
+static void
+bvm_cngrab(struct consdev *cp)
+{
+}
+
+static void
+bvm_cnungrab(struct consdev *cp)
+{
+}
+
+SYSINIT(cndev, SI_SUB_CONFIGURE, SI_ORDER_MIDDLE, cn_drvinit, NULL);
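
The guest-visible protocol is a single I/O port: inw() on it must return
the BVM_CONS_SIG signature ('b' << 8 | 'v', i.e. 0x6276) for the probe to
succeed, inl() yields the next input byte or -1 when none is pending, and
outl() emits one output byte. A hypothetical host-side read handler
consistent with this (bhyve's actual device model is consport.c, later in
this patch):

    #define BVM_CONS_SIG    ('b' << 8 | 'v')

    /* bytes == 2 models a guest inw(), 4 models a guest inl(). */
    static int
    console_port_read(int bytes, int (*getc_nonblock)(void))
    {
            if (bytes == 2)
                    return (BVM_CONS_SIG);  /* probe signature */
            return (getc_nonblock());       /* next byte, or -1 */
    }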
diff --git a/sys/dev/bvm/bvm_dbg.c b/sys/dev/bvm/bvm_dbg.c
new file mode 100644
index 0000000..1ba7ce0
--- /dev/null
+++ b/sys/dev/bvm/bvm_dbg.c
@@ -0,0 +1,100 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+
+#include <gdb/gdb.h>
+
+#include <machine/cpufunc.h>
+
+static gdb_probe_f bvm_dbg_probe;
+static gdb_init_f bvm_dbg_init;
+static gdb_term_f bvm_dbg_term;
+static gdb_getc_f bvm_dbg_getc;
+static gdb_putc_f bvm_dbg_putc;
+
+GDB_DBGPORT(bvm, bvm_dbg_probe, bvm_dbg_init, bvm_dbg_term,
+ bvm_dbg_getc, bvm_dbg_putc);
+
+#define BVM_DBG_PORT 0x224
+static int bvm_dbg_port = BVM_DBG_PORT;
+
+#define BVM_DBG_SIG ('B' << 8 | 'V')
+
+static int
+bvm_dbg_probe(void)
+{
+ int disabled, port;
+
+ disabled = 0;
+ resource_int_value("bvmdbg", 0, "disabled", &disabled);
+
+ if (!disabled) {
+ if (resource_int_value("bvmdbg", 0, "port", &port) == 0)
+ bvm_dbg_port = port;
+
+ if (inw(bvm_dbg_port) == BVM_DBG_SIG) {
+ /*
+ * Return a higher priority than 0 to override other
+ * gdb dbgport providers that may be present (e.g. uart)
+ */
+ return (1);
+ }
+ }
+
+ return (-1);
+}
+
+static void
+bvm_dbg_init(void)
+{
+}
+
+static void
+bvm_dbg_term(void)
+{
+}
+
+static void
+bvm_dbg_putc(int c)
+{
+
+ outl(bvm_dbg_port, c);
+}
+
+static int
+bvm_dbg_getc(void)
+{
+
+ return (inl(bvm_dbg_port));
+}
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index 1344297..a53f640 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -48,6 +48,7 @@ SUBDIR= \
${_bxe} \
${_bios} \
${_bktr} \
+ ${_blackhole} \
${_bm} \
bridgestp \
bwi \
@@ -335,6 +336,7 @@ SUBDIR= \
vge \
${_viawd} \
vkbd \
+ ${_vmm} \
${_vpo} \
vr \
vte \
@@ -624,6 +626,7 @@ _amdtemp= amdtemp
_arcmsr= arcmsr
_asmc= asmc
_bktr= bktr
+_blackhole= blackhole
_bxe= bxe
_cardbus= cardbus
_cbb= cbb
@@ -720,6 +723,7 @@ _twa= twa
_vesa= vesa
_viawd= viawd
_virtio= virtio
+_vmm= vmm
_vxge= vxge
_x86bios= x86bios
_wbwd= wbwd
diff --git a/sys/modules/blackhole/Makefile b/sys/modules/blackhole/Makefile
new file mode 100644
index 0000000..a73cf44
--- /dev/null
+++ b/sys/modules/blackhole/Makefile
@@ -0,0 +1,9 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../dev/blackhole
+
+KMOD= blackhole
+SRCS= blackhole.c
+SRCS+= bus_if.h device_if.h pci_if.h
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile
new file mode 100644
index 0000000..8b565da
--- /dev/null
+++ b/sys/modules/vmm/Makefile
@@ -0,0 +1,62 @@
+# $FreeBSD$
+
+KMOD= vmm
+
+SRCS= opt_ddb.h device_if.h bus_if.h pci_if.h
+
+CFLAGS+= -DVMM_KEEP_STATS -DSMP
+CFLAGS+= -I${.CURDIR}/../../amd64/vmm
+CFLAGS+= -I${.CURDIR}/../../amd64/vmm/io
+CFLAGS+= -I${.CURDIR}/../../amd64/vmm/intel
+
+# generic vmm support
+.PATH: ${.CURDIR}/../../amd64/vmm
+SRCS+= vmm.c \
+ vmm_dev.c \
+ vmm_host.c \
+ vmm_instruction_emul.c \
+ vmm_ipi.c \
+ vmm_lapic.c \
+ vmm_mem.c \
+ vmm_msr.c \
+ vmm_stat.c \
+ vmm_util.c \
+ x86.c \
+ vmm_support.S
+
+.PATH: ${.CURDIR}/../../amd64/vmm/io
+SRCS+= iommu.c \
+ ppt.c \
+ vdev.c \
+ vlapic.c
+
+# intel-specific files
+.PATH: ${.CURDIR}/../../amd64/vmm/intel
+SRCS+= ept.c \
+ vmcs.c \
+ vmx_msr.c \
+ vmx.c \
+ vtd.c
+
+# amd-specific files
+.PATH: ${.CURDIR}/../../amd64/vmm/amd
+SRCS+= amdv.c
+
+OBJS= vmx_support.o
+
+CLEANFILES= vmx_assym.s vmx_genassym.o
+
+vmx_assym.s: vmx_genassym.o
+.if exists(@)
+vmx_assym.s: @/kern/genassym.sh
+.endif
+ sh @/kern/genassym.sh vmx_genassym.o > ${.TARGET}
+
+vmx_support.o: vmx_support.S vmx_assym.s
+ ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \
+ ${.IMPSRC} -o ${.TARGET}
+
+vmx_genassym.o: vmx_genassym.c @ machine x86
+ ${CC} -c ${CFLAGS:N-fno-common} ${.IMPSRC}
+
+.include <bsd.kmod.mk>
diff --git a/usr.sbin/Makefile.amd64 b/usr.sbin/Makefile.amd64
index 1a1bffe..5ee2165 100644
--- a/usr.sbin/Makefile.amd64
+++ b/usr.sbin/Makefile.amd64
@@ -10,6 +10,9 @@ SUBDIR+= acpi
SUBDIR+= apm
.endif
SUBDIR+= asf
+SUBDIR+= bhyve
+SUBDIR+= bhyvectl
+SUBDIR+= bhyveload
SUBDIR+= boot0cfg
.if ${MK_TOOLCHAIN} != "no"
SUBDIR+= btxld
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
new file mode 100644
index 0000000..078ef9a
--- /dev/null
+++ b/usr.sbin/bhyve/Makefile
@@ -0,0 +1,27 @@
+#
+# $FreeBSD$
+#
+
+PROG= bhyve
+
+DEBUG_FLAGS= -g -O0
+
+SRCS= acpi.c atpic.c bhyverun.c consport.c dbgport.c elcr.c inout.c
+SRCS+= ioapic.c mem.c mevent.c mptbl.c
+SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
+SRCS+= pci_virtio_net.c pci_uart.c pit_8254.c pmtmr.c post.c rtc.c uart.c
+SRCS+= xmsr.c spinup_ap.c
+
+.PATH: ${.CURDIR}/../../sys/amd64/vmm
+SRCS+= vmm_instruction_emul.c
+
+NO_MAN=
+
+DPADD= ${LIBVMMAPI} ${LIBMD} ${LIBPTHREAD}
+LDADD= -lvmmapi -lmd -lpthread
+
+WARNS?= 2
+
+CFLAGS+= -I${.CURDIR}/../../sys
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c
new file mode 100644
index 0000000..32effdc
--- /dev/null
+++ b/usr.sbin/bhyve/acpi.c
@@ -0,0 +1,844 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * bhyve ACPI table generator.
+ *
+ * Create the minimal set of ACPI tables required to boot FreeBSD (and
+ * hopefully other o/s's) by writing out ASL template files for each of
+ * the tables and then compiling them to AML with the Intel iasl compiler.
+ * The AML files are then read into guest memory.
+ *
+ * The tables are placed in the guest's ROM area just below 1MB physical,
+ * above the MPTable.
+ *
+ * Layout
+ * ------
+ * RSDP -> 0xf0400 (36 bytes fixed)
+ * RSDT -> 0xf0440 (36 bytes + 4*N table addrs, 2 used)
+ * XSDT -> 0xf0480 (36 bytes + 8*N table addrs, 2 used)
+ * MADT -> 0xf0500 (depends on #CPUs)
+ * FADT -> 0xf0600 (268 bytes)
+ * FACS -> 0xf0780 (64 bytes)
+ * DSDT -> 0xf0800 (variable - can go up to 0x100000)
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+
+#include <paths.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "bhyverun.h"
+#include "acpi.h"
+
+/*
+ * Define the base address of the ACPI tables, and the offsets to
+ * the individual tables
+ */
+#define BHYVE_ACPI_BASE 0xf0400
+#define RSDT_OFFSET 0x040
+#define XSDT_OFFSET 0x080
+#define MADT_OFFSET 0x100
+#define FADT_OFFSET 0x200
+#define FACS_OFFSET 0x380
+#define DSDT_OFFSET 0x400
+
+#define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX"
+#define BHYVE_ASL_SUFFIX ".aml"
+#define BHYVE_ASL_COMPILER "/usr/sbin/iasl"
+
+#define BHYVE_PM_TIMER_ADDR 0x408
+
+static int basl_keep_temps;
+static int basl_verbose_iasl;
+static int basl_ncpu;
+static uint32_t basl_acpi_base = BHYVE_ACPI_BASE;
+
+/*
+ * Contains the full pathname of the template to be passed
+ * to mkstemp/mktemps(3)
+ */
+static char basl_template[MAXPATHLEN];
+static char basl_stemplate[MAXPATHLEN];
+
+struct basl_fio {
+ int fd;
+ FILE *fp;
+ char f_name[MAXPATHLEN];
+};
+
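+/*
+ * Each of the following macros expands to more than one statement, so
+ * they are wrapped in do { } while (0): without it, the 'goto err_exit'
+ * would detach from the preceding check if a macro were used as the
+ * body of an unbraced conditional.
+ */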
+#define EFPRINTF(...) \
+ do { err = fprintf(__VA_ARGS__); if (err < 0) goto err_exit; } while (0)
+
+#define EFFLUSH(x) \
+ do { err = fflush(x); if (err != 0) goto err_exit; } while (0)
+
+static int
+basl_fwrite_rsdp(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve RSDP template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0008]\t\tSignature : \"RSD PTR \"\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 43\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 02\n");
+ EFPRINTF(fp, "[0004]\t\tRSDT Address : %08X\n",
+ basl_acpi_base + RSDT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tLength : 00000024\n");
+ EFPRINTF(fp, "[0008]\t\tXSDT Address : 00000000%08X\n",
+ basl_acpi_base + XSDT_OFFSET);
+ EFPRINTF(fp, "[0001]\t\tExtended Checksum : 00\n");
+ EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_rsdt(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve RSDT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"RSDT\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVRSDT \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Add in pointers to the MADT and FADT */
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : %08X\n",
+ basl_acpi_base + MADT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : %08X\n",
+ basl_acpi_base + FADT_OFFSET);
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_xsdt(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve XSDT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"XSDT\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVXSDT \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Add in pointers to the MADT and FADT */
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : 00000000%08X\n",
+ basl_acpi_base + MADT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : 00000000%08X\n",
+ basl_acpi_base + FADT_OFFSET);
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_madt(FILE *fp)
+{
+ int err;
+ int i;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve MADT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"APIC\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMADT \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0004]\t\tLocal Apic Address : FEE00000\n");
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
+ EFPRINTF(fp, "\t\t\tPC-AT Compatibility : 1\n");
+ EFPRINTF(fp, "\n");
+
+ /* Add a Processor Local APIC entry for each CPU */
+ for (i = 0; i < basl_ncpu; i++) {
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 00\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 08\n");
+ EFPRINTF(fp, "[0001]\t\tProcessor ID : %02d\n", i);
+ EFPRINTF(fp, "[0001]\t\tLocal Apic ID : %02d\n", i);
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
+ EFPRINTF(fp, "\t\t\tProcessor Enabled : 1\n");
+ EFPRINTF(fp, "\n");
+ }
+
+ /* Always a single IOAPIC entry, with ID 'ncpu' (one above the last local APIC ID) */
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 01\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 0C\n");
+ EFPRINTF(fp, "[0001]\t\tI/O Apic ID : %02d\n", basl_ncpu);
+ EFPRINTF(fp, "[0001]\t\tReserved : 00\n");
+ EFPRINTF(fp, "[0004]\t\tAddress : fec00000\n");
+ EFPRINTF(fp, "[0004]\t\tInterrupt : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Override the 8259 chained vector. XXX maybe not needed */
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 0A\n");
+ EFPRINTF(fp, "[0001]\t\tBus : 00\n");
+ EFPRINTF(fp, "[0001]\t\tSource : 09\n");
+ EFPRINTF(fp, "[0004]\t\tInterrupt : 00000009\n");
+ EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n");
+ EFPRINTF(fp, "\t\t\tPolarity : 0\n");
+ EFPRINTF(fp, "\t\t\tTrigger Mode : 0\n");
+ EFPRINTF(fp, "\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_fadt(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve FADT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"FACP\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 0000010C\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 05\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVFACP \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0004]\t\tFACS Address : %08X\n",
+ basl_acpi_base + FACS_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n",
+ basl_acpi_base + DSDT_OFFSET);
+ EFPRINTF(fp, "[0001]\t\tModel : 00\n");
+ EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n");
+ EFPRINTF(fp, "[0002]\t\tSCI Interrupt : 0009\n");
+ EFPRINTF(fp, "[0004]\t\tSMI Command Port : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tACPI Enable Value : 00\n");
+ EFPRINTF(fp, "[0001]\t\tACPI Disable Value : 00\n");
+ EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n");
+ EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n");
+ EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n",
+ BHYVE_PM_TIMER_ADDR);
+ EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n");
+ EFPRINTF(fp, "[0001]\t\tPM1 Control Block Length : 02\n");
+ EFPRINTF(fp, "[0001]\t\tPM2 Control Block Length : 00\n");
+ EFPRINTF(fp, "[0001]\t\tPM Timer Block Length : 04\n");
+ EFPRINTF(fp, "[0001]\t\tGPE0 Block Length : 00\n");
+ EFPRINTF(fp, "[0001]\t\tGPE1 Block Length : 00\n");
+ EFPRINTF(fp, "[0001]\t\tGPE1 Base Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\t_CST Support : 00\n");
+ EFPRINTF(fp, "[0002]\t\tC2 Latency : 0000\n");
+ EFPRINTF(fp, "[0002]\t\tC3 Latency : 0000\n");
+ EFPRINTF(fp, "[0002]\t\tCPU Cache Size : 0000\n");
+ EFPRINTF(fp, "[0002]\t\tCache Flush Stride : 0000\n");
+ EFPRINTF(fp, "[0001]\t\tDuty Cycle Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tDuty Cycle Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tRTC Day Alarm Index : 00\n");
+ EFPRINTF(fp, "[0001]\t\tRTC Month Alarm Index : 00\n");
+ EFPRINTF(fp, "[0001]\t\tRTC Century Index : 00\n");
+ EFPRINTF(fp, "[0002]\t\tBoot Flags (decoded below) : 0000\n");
+ EFPRINTF(fp, "\t\t\tLegacy Devices Supported (V2) : 0\n");
+ EFPRINTF(fp, "\t\t\t8042 Present on ports 60/64 (V2) : 0\n");
+ EFPRINTF(fp, "\t\t\tVGA Not Present (V4) : 1\n");
+ EFPRINTF(fp, "\t\t\tMSI Not Supported (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tPCIe ASPM Not Supported (V4) : 1\n");
+ EFPRINTF(fp, "\t\t\tCMOS RTC Not Present (V5) : 0\n");
+ EFPRINTF(fp, "[0001]\t\tReserved : 00\n");
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n");
+ EFPRINTF(fp, "\t\t\tWBINVD instruction is operational (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\t32-bit PM Timer (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tDocking Supported (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tReset Register Supported (V2) : 0\n");
+ EFPRINTF(fp, "\t\t\tSealed Case (V3) : 0\n");
+ EFPRINTF(fp, "\t\t\tHeadless - No Video (V3) : 1\n");
+ EFPRINTF(fp, "\t\t\tUse native instr after SLP_TYPx (V3) : 0\n");
+ EFPRINTF(fp, "\t\t\tPCIEXP_WAK Bits Supported (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tUse Platform Timer (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tRTC_STS valid on S4 wake (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tRemote Power-on capable (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tUse APIC Cluster Model (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tUse APIC Physical Destination Mode (V4) : 1\n");
+ EFPRINTF(fp, "\t\t\tHardware Reduced (V5) : 0\n");
+ EFPRINTF(fp, "\t\t\tLow Power S0 Idle (V5) : 0\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tReset Register : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000001\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0001]\t\tValue to cause reset : 00\n");
+ EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
+ EFPRINTF(fp, "[0008]\t\tFACS Address : 00000000%08X\n",
+ basl_acpi_base + FACS_OFFSET);
+ EFPRINTF(fp, "[0008]\t\tDSDT Address : 00000000%08X\n",
+ basl_acpi_base + DSDT_OFFSET);
+ EFPRINTF(fp,
+ "[0012]\t\tPM1A Event Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 20\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000001\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM1B Event Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM1A Control Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 10\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000001\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM1B Control Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM2 Control Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Valid for bhyve */
+ EFPRINTF(fp,
+ "[0012]\t\tPM Timer Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 32\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n",
+ BHYVE_PM_TIMER_ADDR);
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 80\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0012]\t\tGPE1 Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tSleep Control Register : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tSleep Status Register : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_facs(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve FACS template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"FACS\"\n");
+ EFPRINTF(fp, "[0004]\t\tLength : 00000040\n");
+ EFPRINTF(fp, "[0004]\t\tHardware Signature : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\t32 Firmware Waking Vector : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tGlobal Lock : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n");
+ EFPRINTF(fp, "\t\t\tS4BIOS Support Present : 0\n");
+ EFPRINTF(fp, "\t\t\t64-bit Wake Supported (V2) : 0\n");
+ EFPRINTF(fp,
+ "[0008]\t\t64 Firmware Waking Vector : 0000000000000000\n");
+ EFPRINTF(fp, "[0001]\t\tVersion : 02\n");
+ EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
+ EFPRINTF(fp, "[0004]\t\tOspmFlags (decoded below) : 00000000\n");
+ EFPRINTF(fp, "\t\t\t64-bit Wake Env Required (V2) : 0\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_dsdt(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve DSDT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2,"
+ "\"BHYVE \", \"BVDSDT \", 0x00000001)\n");
+ EFPRINTF(fp, "{\n");
+ EFPRINTF(fp, " Scope (_SB)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Device (PCI0)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Name (_HID, EisaId (\"PNP0A03\"))\n");
+ EFPRINTF(fp, " Name (_ADR, Zero)\n");
+ EFPRINTF(fp, " Name (_UID, One)\n");
+ EFPRINTF(fp, " Name (_CRS, ResourceTemplate ()\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " WordBusNumber (ResourceProducer, MinFixed,"
+ "MaxFixed, PosDecode,\n");
+ EFPRINTF(fp, " 0x0000, // Granularity\n");
+ EFPRINTF(fp, " 0x0000, // Range Minimum\n");
+ EFPRINTF(fp, " 0x00FF, // Range Maximum\n");
+ EFPRINTF(fp, " 0x0000, // Transl Offset\n");
+ EFPRINTF(fp, " 0x0100, // Length\n");
+ EFPRINTF(fp, " ,, )\n");
+ EFPRINTF(fp, " IO (Decode16,\n");
+ EFPRINTF(fp, " 0x0CF8, // Range Minimum\n");
+ EFPRINTF(fp, " 0x0CF8, // Range Maximum\n");
+ EFPRINTF(fp, " 0x01, // Alignment\n");
+ EFPRINTF(fp, " 0x08, // Length\n");
+ EFPRINTF(fp, " )\n");
+ EFPRINTF(fp, " WordIO (ResourceProducer, MinFixed, MaxFixed,"
+ "PosDecode, EntireRange,\n");
+ EFPRINTF(fp, " 0x0000, // Granularity\n");
+ EFPRINTF(fp, " 0x0000, // Range Minimum\n");
+ EFPRINTF(fp, " 0x0CF7, // Range Maximum\n");
+ EFPRINTF(fp, " 0x0000, // Transl Offset\n");
+ EFPRINTF(fp, " 0x0CF8, // Length\n");
+ EFPRINTF(fp, " ,, , TypeStatic)\n");
+ EFPRINTF(fp, " WordIO (ResourceProducer, MinFixed, MaxFixed,"
+ "PosDecode, EntireRange,\n");
+ EFPRINTF(fp, " 0x0000, // Granularity\n");
+ EFPRINTF(fp, " 0x0D00, // Range Minimum\n");
+ EFPRINTF(fp, " 0xFFFF, // Range Maximum\n");
+ EFPRINTF(fp, " 0x0000, // Transl Offset\n");
+ EFPRINTF(fp, " 0xF300, // Length\n");
+ EFPRINTF(fp, " ,, , TypeStatic)\n");
+ EFPRINTF(fp, " })\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, "\n");
+ EFPRINTF(fp, " Scope (_SB.PCI0)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Device (ISA)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Name (_ADR, 0x00010000)\n");
+ EFPRINTF(fp, " OperationRegion (P40C, PCI_Config, 0x60, 0x04)\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, "\n");
+ EFPRINTF(fp, " Scope (_SB.PCI0.ISA)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Device (RTC)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Name (_HID, EisaId (\"PNP0B00\"))\n");
+ EFPRINTF(fp, " Name (_CRS, ResourceTemplate ()\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " IO (Decode16,\n");
+ EFPRINTF(fp, " 0x0070, // Range Minimum\n");
+ EFPRINTF(fp, " 0x0070, // Range Maximum\n");
+ EFPRINTF(fp, " 0x10, // Alignment\n");
+ EFPRINTF(fp, " 0x02, // Length\n");
+ EFPRINTF(fp, " )\n");
+ EFPRINTF(fp, " IRQNoFlags ()\n");
+ EFPRINTF(fp, " {8}\n");
+ EFPRINTF(fp, " IO (Decode16,\n");
+ EFPRINTF(fp, " 0x0072, // Range Minimum\n");
+ EFPRINTF(fp, " 0x0072, // Range Maximum\n");
+ EFPRINTF(fp, " 0x02, // Alignment\n");
+ EFPRINTF(fp, " 0x06, // Length\n");
+ EFPRINTF(fp, " )\n");
+ EFPRINTF(fp, " })\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, "}\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_open(struct basl_fio *bf, int suffix)
+{
+ int err;
+
+ err = 0;
+
+ if (suffix) {
+ strncpy(bf->f_name, basl_stemplate, MAXPATHLEN);
+ bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX));
+ } else {
+ strncpy(bf->f_name, basl_template, MAXPATHLEN);
+ bf->fd = mkstemp(bf->f_name);
+ }
+
+ if (bf->fd >= 0) {
+ bf->fp = fdopen(bf->fd, "w+");
+ if (bf->fp == NULL) {
+ unlink(bf->f_name);
+ close(bf->fd);
+ }
+ } else {
+ err = 1;
+ }
+
+ return (err);
+}
+
+static void
+basl_close(struct basl_fio *bf)
+{
+
+ if (!basl_keep_temps)
+ unlink(bf->f_name);
+ fclose(bf->fp);
+}
+
+static int
+basl_start(struct basl_fio *in, struct basl_fio *out)
+{
+ int err;
+
+ err = basl_open(in, 0);
+ if (!err) {
+ err = basl_open(out, 1);
+ if (err) {
+ basl_close(in);
+ }
+ }
+
+ return (err);
+}
+
+static void
+basl_end(struct basl_fio *in, struct basl_fio *out)
+{
+
+ basl_close(in);
+ basl_close(out);
+}
+
+static int
+basl_load(int fd, uint64_t off)
+{
+ struct stat sb;
+ int err;
+
+ err = 0;
+
+ if (fstat(fd, &sb) < 0 ||
+ read(fd, paddr_guest2host(basl_acpi_base + off), sb.st_size) < 0)
+ err = errno;
+
+ return (err);
+}
+
+static int
+basl_compile(int (*fwrite_section)(FILE *fp), uint64_t offset)
+{
+ struct basl_fio io[2];
+ static char iaslbuf[3*MAXPATHLEN + 10];
+ char *fmt;
+ int err;
+
+ err = basl_start(&io[0], &io[1]);
+ if (!err) {
+ err = (*fwrite_section)(io[0].fp);
+
+ if (!err) {
+ /*
+ * iasl sends the results of the compilation to
+ * stdout. Shut this down by using the shell to
+ * redirect stdout to /dev/null, unless the user
+ * has requested verbose output for debugging
+ * purposes
+ */
+ fmt = basl_verbose_iasl ?
+ "%s -p %s %s" :
+ "/bin/sh -c \"%s -p %s %s\" 1> /dev/null";
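+
+			/*
+			 * The composed command is of the form (illustrative):
+			 *
+			 *   /bin/sh -c "iasl -p <output> <input>" 1> /dev/null
+			 */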
+
+ snprintf(iaslbuf, sizeof(iaslbuf),
+ fmt,
+ BHYVE_ASL_COMPILER,
+ io[1].f_name, io[0].f_name);
+ err = system(iaslbuf);
+
+ if (!err) {
+ /*
+ * Copy the aml output file into guest
+ * memory at the specified location
+ */
+ err = basl_load(io[1].fd, offset);
+ }
+ }
+ basl_end(&io[0], &io[1]);
+ }
+
+ return (err);
+}
+
+static int
+basl_make_templates(void)
+{
+ const char *tmpdir;
+ int err;
+ int len;
+
+ err = 0;
+
+	/*
+	 * Create the templates in the directory given by BHYVE_TMPDIR,
+	 * falling back to TMPDIR and finally to /tmp. The lookups are
+	 * done in two steps so that a set BHYVE_TMPDIR takes precedence.
+	 */
+	if ((tmpdir = getenv("BHYVE_TMPDIR")) == NULL || *tmpdir == '\0')
+		tmpdir = getenv("TMPDIR");
+	if (tmpdir == NULL || *tmpdir == '\0')
+		tmpdir = _PATH_TMP;
+
+ len = strlen(tmpdir);
+
+ if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) {
+ strcpy(basl_template, tmpdir);
+ while (len > 0 && basl_template[len - 1] == '/')
+ len--;
+ basl_template[len] = '/';
+ strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE);
+ } else
+ err = E2BIG;
+
+ if (!err) {
+ /*
+	 * len has been initialized (and maybe adjusted) above
+ */
+ if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 +
+ sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) {
+ strcpy(basl_stemplate, tmpdir);
+ basl_stemplate[len] = '/';
+ strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE);
+ len = strlen(basl_stemplate);
+ strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX);
+ } else
+ err = E2BIG;
+ }
+
+ return (err);
+}
+
+static struct {
+ int (*wsect)(FILE *fp);
+ uint64_t offset;
+} basl_ftables[] =
+{
+ { basl_fwrite_rsdp, 0},
+ { basl_fwrite_rsdt, RSDT_OFFSET },
+ { basl_fwrite_xsdt, XSDT_OFFSET },
+ { basl_fwrite_madt, MADT_OFFSET },
+ { basl_fwrite_fadt, FADT_OFFSET },
+ { basl_fwrite_facs, FACS_OFFSET },
+ { basl_fwrite_dsdt, DSDT_OFFSET },
+ { NULL }
+};
+
+int
+acpi_build(struct vmctx *ctx, int ncpu, int ioapic)
+{
+ int err;
+ int i;
+
+ err = 0;
+ basl_ncpu = ncpu;
+
+ if (!ioapic) {
+ fprintf(stderr, "ACPI tables require an ioapic\n");
+ return (EINVAL);
+ }
+
+ /*
+	 * For debugging, allow the user to have the iasl compiler output
+	 * sent to stdout rather than /dev/null
+ */
+ if (getenv("BHYVE_ACPI_VERBOSE_IASL"))
+ basl_verbose_iasl = 1;
+
+ /*
+ * Allow the user to keep the generated ASL files for debugging
+ * instead of deleting them following use
+ */
+ if (getenv("BHYVE_ACPI_KEEPTMPS"))
+ basl_keep_temps = 1;
+
+ i = 0;
+ err = basl_make_templates();
+
+ /*
+ * Run through all the ASL files, compiling them and
+ * copying them into guest memory
+ */
+ while (!err && basl_ftables[i].wsect != NULL) {
+ err = basl_compile(basl_ftables[i].wsect,
+ basl_ftables[i].offset);
+ i++;
+ }
+
+ return (err);
+}
diff --git a/usr.sbin/bhyve/acpi.h b/usr.sbin/bhyve/acpi.h
new file mode 100644
index 0000000..fec6c9d
--- /dev/null
+++ b/usr.sbin/bhyve/acpi.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ACPI_H_
+#define _ACPI_H_
+
+int acpi_build(struct vmctx *ctx, int ncpu, int ioapic);
+
+#endif /* _ACPI_H_ */
diff --git a/usr.sbin/bhyve/atpic.c b/usr.sbin/bhyve/atpic.c
new file mode 100644
index 0000000..a9fb084
--- /dev/null
+++ b/usr.sbin/bhyve/atpic.c
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "inout.h"
+
+/*
+ * FreeBSD only writes to the 8259 interrupt controllers to put them in a
+ * shutdown state.
+ *
+ * So, we just ignore the writes.
+ */
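+/*
+ * Illustrative guest sequence (not taken from this code): masking both
+ * PICs during shutdown looks like
+ *
+ *	outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff);
+ *	outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff);
+ *
+ * and is accepted, then discarded, by the handler below.
+ */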
+
+#define IO_ICU1 0x20
+#define IO_ICU2 0xA0
+#define ICU_IMR_OFFSET 1
+
+static int
+atpic_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ if (bytes != 1)
+ return (-1);
+
+ if (in)
+ return (-1);
+
+ /* Pretend all writes to the 8259 are alright */
+ return (0);
+}
+
+INOUT_PORT(atpic, IO_ICU1, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU1 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU2, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU2 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
new file mode 100644
index 0000000..999040f
--- /dev/null
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -0,0 +1,788 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+
+#include <machine/segments.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <libgen.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "acpi.h"
+#include "inout.h"
+#include "dbgport.h"
+#include "mem.h"
+#include "mevent.h"
+#include "mptbl.h"
+#include "pci_emul.h"
+#include "xmsr.h"
+#include "ioapic.h"
+#include "spinup_ap.h"
+
+#define DEFAULT_GUEST_HZ 100
+#define DEFAULT_GUEST_TSLICE 200
+
+#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
+
+#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */
+#define VMEXIT_CONTINUE 1 /* continue from next instruction */
+#define VMEXIT_RESTART 2 /* restart current instruction */
+#define VMEXIT_ABORT 3 /* abort the vm run loop */
+#define VMEXIT_RESET 4 /* guest machine has reset */
+
+#define MB (1024UL * 1024)
+#define GB (1024UL * MB)
+
+typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
+
+int guest_tslice = DEFAULT_GUEST_TSLICE;
+int guest_hz = DEFAULT_GUEST_HZ;
+char *vmname;
+
+u_long lomem_sz;
+u_long himem_sz;
+
+int guest_ncpus;
+
+static int pincpu = -1;
+static int guest_vcpu_mux;
+static int guest_vmexit_on_hlt, guest_vmexit_on_pause, disable_x2apic;
+
+static int foundcpus;
+
+static int strictio;
+
+static int acpi;
+
+static char *lomem_addr;
+static char *himem_addr;
+
+static char *progname;
+static const int BSP = 0;
+
+static int cpumask;
+
+static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
+
+struct vm_exit vmexit[VM_MAXCPU];
+
+struct fbsdstats {
+ uint64_t vmexit_bogus;
+ uint64_t vmexit_bogus_switch;
+ uint64_t vmexit_hlt;
+ uint64_t vmexit_pause;
+ uint64_t vmexit_mtrap;
+ uint64_t vmexit_paging;
+ uint64_t cpu_switch_rotate;
+ uint64_t cpu_switch_direct;
+ int io_reset;
+} stats;
+
+struct mt_vmm_info {
+ pthread_t mt_thr;
+ struct vmctx *mt_ctx;
+ int mt_vcpu;
+} mt_vmm_info[VM_MAXCPU];
+
+static void
+usage(int code)
+{
+
+	fprintf(stderr,
+		"Usage: %s [-abehxABHIP][-g <gdb port>][-z <hz>][-s <pci>]"
+		"[-S <pci>][-p pincpu][-n <pci>][-m lowmem][-M highmem]"
+		"[-c <cpus>][-t <tslice>] <vm>\n"
+		"       -a: local apic is in XAPIC mode (default is X2APIC)\n"
+		"       -A: create an ACPI table\n"
+		"       -b: use bvm console\n"
+		"       -g: gdb port (default is %d and 0 means don't open)\n"
+		"       -c: # cpus (default 1)\n"
+		"       -p: pin vcpu 'n' to host cpu 'pincpu + n'\n"
+		"       -B: inject breakpoint exception on vm entry\n"
+		"       -H: vmexit from the guest on hlt\n"
+		"       -I: present an ioapic to the guest\n"
+		"       -P: vmexit from the guest on pause\n"
+		"       -e: exit on unhandled i/o access\n"
+		"       -h: help\n"
+		"       -z: guest hz (default is %d)\n"
+		"       -s: <slot,driver,configinfo> PCI slot config\n"
+		"       -S: <slot,driver,configinfo> legacy PCI slot config\n"
+		"       -m: lowmem in MB\n"
+		"       -M: highmem in MB\n"
+		"       -x: mux vcpus to 1 hcpu\n"
+		"       -t: mux vcpu timeslice hz (default %d)\n",
+		progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ,
+		DEFAULT_GUEST_TSLICE);
+ exit(code);
+}
+
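+/*
+ * Translate a guest physical address into a host virtual address. As an
+ * illustration: with 2GB of lowmem and 1GB of himem, gaddr 0x1000 lands
+ * in the lowmem segment, gaddr 4GB + 0x1000 lands in the himem segment,
+ * and addresses in the hole between them return NULL.
+ */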
+void *
+paddr_guest2host(uintptr_t gaddr)
+{
+ if (lomem_sz == 0)
+ return (NULL);
+
+ if (gaddr < lomem_sz) {
+ return ((void *)(lomem_addr + gaddr));
+ } else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) {
+ return ((void *)(himem_addr + gaddr - 4*GB));
+ } else
+ return (NULL);
+}
+
+int
+fbsdrun_disable_x2apic(void)
+{
+
+ return (disable_x2apic);
+}
+
+int
+fbsdrun_vmexit_on_pause(void)
+{
+
+ return (guest_vmexit_on_pause);
+}
+
+int
+fbsdrun_vmexit_on_hlt(void)
+{
+
+ return (guest_vmexit_on_hlt);
+}
+
+int
+fbsdrun_muxed(void)
+{
+
+ return (guest_vcpu_mux);
+}
+
+static void *
+fbsdrun_start_thread(void *param)
+{
+ char tname[MAXCOMLEN + 1];
+ struct mt_vmm_info *mtp;
+ int vcpu;
+
+ mtp = param;
+ vcpu = mtp->mt_vcpu;
+
+ snprintf(tname, sizeof(tname), "%s vcpu %d", vmname, vcpu);
+ pthread_set_name_np(mtp->mt_thr, tname);
+
+ vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
+
+ /* not reached */
+ exit(1);
+ return (NULL);
+}
+
+void
+fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
+{
+ int error;
+
+ if (cpumask & (1 << vcpu)) {
+ fprintf(stderr, "addcpu: attempting to add existing cpu %d\n",
+ vcpu);
+ exit(1);
+ }
+
+ cpumask |= 1 << vcpu;
+ foundcpus++;
+
+ /*
+ * Set up the vmexit struct to allow execution to start
+ * at the given RIP
+ */
+ vmexit[vcpu].rip = rip;
+ vmexit[vcpu].inst_length = 0;
+
+	if (vcpu == BSP || !guest_vcpu_mux) {
+ mt_vmm_info[vcpu].mt_ctx = ctx;
+ mt_vmm_info[vcpu].mt_vcpu = vcpu;
+
+ error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
+ fbsdrun_start_thread, &mt_vmm_info[vcpu]);
+ assert(error == 0);
+ }
+}
+
+static int
+fbsdrun_get_next_cpu(int curcpu)
+{
+
+ /*
+ * Get the next available CPU. Assumes they arrive
+ * in ascending order with no gaps.
+ */
+ return ((curcpu + 1) % foundcpus);
+}
+
+static int
+vmexit_catch_reset(void)
+{
+ stats.io_reset++;
+ return (VMEXIT_RESET);
+}
+
+static int
+vmexit_catch_inout(void)
+{
+ return (VMEXIT_ABORT);
+}
+
+static int pause_noswitch;
+
+static int
+vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
+ uint32_t eax)
+{
+#if PG_DEBUG /* put all types of debug here */
+ if (eax == 0) {
+ pause_noswitch = 1;
+ } else if (eax == 1) {
+ pause_noswitch = 0;
+ } else {
+ pause_noswitch = 0;
+ if (eax == 5) {
+ vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1);
+ }
+ }
+#endif
+ return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int error;
+ int bytes, port, in, out;
+ uint32_t eax;
+ int vcpu;
+
+ vcpu = *pvcpu;
+
+ port = vme->u.inout.port;
+ bytes = vme->u.inout.bytes;
+ eax = vme->u.inout.eax;
+ in = vme->u.inout.in;
+ out = !in;
+
+ /* We don't deal with these */
+ if (vme->u.inout.string || vme->u.inout.rep)
+ return (VMEXIT_ABORT);
+
+ /* Special case of guest reset */
+ if (out && port == 0x64 && (uint8_t)eax == 0xFE)
+ return (vmexit_catch_reset());
+
+ /* Extra-special case of host notifications */
+ if (out && port == GUEST_NIO_PORT)
+ return (vmexit_handle_notify(ctx, vme, pvcpu, eax));
+
+ error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio);
+ if (error == 0 && in)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax);
+
+ if (error == 0)
+ return (VMEXIT_CONTINUE);
+ else {
+ fprintf(stderr, "Unhandled %s%c 0x%04x\n",
+ in ? "in" : "out",
+ bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
+ return (vmexit_catch_inout());
+ }
+}
+
+static int
+vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ fprintf(stderr, "vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code,
+ *pvcpu);
+ return (VMEXIT_ABORT);
+}
+
+static int
+vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int newcpu;
+ int retval = VMEXIT_CONTINUE;
+
+	newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
+
+ if (guest_vcpu_mux && *pvcpu != newcpu) {
+ retval = VMEXIT_SWITCH;
+ *pvcpu = newcpu;
+ }
+
+ return (retval);
+}
+
+static int
+vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int newcpu;
+ int retval = VMEXIT_CONTINUE;
+
+ newcpu = spinup_ap(ctx, *pvcpu,
+ vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
+
+ if (guest_vcpu_mux && *pvcpu != newcpu) {
+ retval = VMEXIT_SWITCH;
+ *pvcpu = newcpu;
+ }
+
+ return (retval);
+}
+
+static int
+vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+ fprintf(stderr, "vm exit[%d]\n", *pvcpu);
+ fprintf(stderr, "\treason\t\tVMX\n");
+ fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
+ fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
+ fprintf(stderr, "\terror\t\t%d\n", vmexit->u.vmx.error);
+ fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
+ fprintf(stderr, "\tqualification\t0x%016lx\n",
+ vmexit->u.vmx.exit_qualification);
+
+ return (VMEXIT_ABORT);
+}
+
+static int bogus_noswitch = 1;
+
+static int
+vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_bogus++;
+
+ if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) {
+ return (VMEXIT_RESTART);
+ } else {
+ stats.vmexit_bogus_switch++;
+ vmexit->inst_length = 0;
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ }
+}
+
+static int
+vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_hlt++;
+ if (fbsdrun_muxed()) {
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ } else {
+ /*
+ * Just continue execution with the next instruction. We use
+ * the HLT VM exit as a way to be friendly with the host
+ * scheduler.
+ */
+ return (VMEXIT_CONTINUE);
+ }
+}
+
+static int
+vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_pause++;
+
+ if (fbsdrun_muxed() && !pause_noswitch) {
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ } else {
+ return (VMEXIT_CONTINUE);
+ }
+}
+
+static int
+vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_mtrap++;
+
+ return (VMEXIT_RESTART);
+}
+
+static int
+vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ int err;
+
+	stats.vmexit_paging++;
+
+ err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa,
+ &vmexit->u.paging.vie);
+
+ if (err) {
+ if (err == EINVAL) {
+ fprintf(stderr,
+ "Failed to emulate instruction at 0x%lx\n",
+ vmexit->rip);
+ } else if (err == ESRCH) {
+ fprintf(stderr, "Unhandled memory access to 0x%lx\n",
+ vmexit->u.paging.gpa);
+ }
+
+ return (VMEXIT_ABORT);
+ }
+
+ return (VMEXIT_CONTINUE);
+}
+
+static void
+sigalrm(int sig)
+{
+ return;
+}
+
+static void
+setup_timeslice(void)
+{
+ struct sigaction sa;
+ struct itimerval itv;
+ int error;
+
+ /*
+ * Setup a realtime timer to generate a SIGALRM at a
+ * frequency of 'guest_tslice' ticks per second.
+ */
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = 0;
+ sa.sa_handler = sigalrm;
+
+ error = sigaction(SIGALRM, &sa, NULL);
+ assert(error == 0);
+
+ itv.it_interval.tv_sec = 0;
+ itv.it_interval.tv_usec = 1000000 / guest_tslice;
+ itv.it_value.tv_sec = 0;
+ itv.it_value.tv_usec = 1000000 / guest_tslice;
+
+ error = setitimer(ITIMER_REAL, &itv, NULL);
+ assert(error == 0);
+}
+
+static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
+ [VM_EXITCODE_INOUT] = vmexit_inout,
+ [VM_EXITCODE_VMX] = vmexit_vmx,
+ [VM_EXITCODE_BOGUS] = vmexit_bogus,
+ [VM_EXITCODE_RDMSR] = vmexit_rdmsr,
+ [VM_EXITCODE_WRMSR] = vmexit_wrmsr,
+ [VM_EXITCODE_MTRAP] = vmexit_mtrap,
+ [VM_EXITCODE_PAGING] = vmexit_paging,
+ [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
+};
+
+static void
+vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
+{
+ int error, rc, prevcpu;
+
+ if (guest_vcpu_mux)
+ setup_timeslice();
+
+ if (pincpu >= 0) {
+ error = vm_set_pinning(ctx, vcpu, pincpu + vcpu);
+ assert(error == 0);
+ }
+
+ while (1) {
+ error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
+ if (error != 0) {
+ /*
+ * It is possible that 'vmmctl' or some other process
+ * has transitioned the vcpu to CANNOT_RUN state right
+ * before we tried to transition it to RUNNING.
+ *
+ * This is expected to be temporary so just retry.
+ */
+ if (errno == EBUSY)
+ continue;
+ else
+ break;
+ }
+
+ prevcpu = vcpu;
+ rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu],
+ &vcpu);
+ switch (rc) {
+ case VMEXIT_SWITCH:
+ assert(guest_vcpu_mux);
+ if (vcpu == -1) {
+ stats.cpu_switch_rotate++;
+ vcpu = fbsdrun_get_next_cpu(prevcpu);
+ } else {
+ stats.cpu_switch_direct++;
+ }
+ /* fall through */
+ case VMEXIT_CONTINUE:
+ rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
+ break;
+ case VMEXIT_RESTART:
+ rip = vmexit[vcpu].rip;
+ break;
+ case VMEXIT_RESET:
+ exit(0);
+ default:
+ exit(1);
+ }
+ }
+ fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
+}
+
+static int
+num_vcpus_allowed(struct vmctx *ctx)
+{
+ int tmp, error;
+
+ error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
+
+ /*
+ * The guest is allowed to spinup more than one processor only if the
+ * UNRESTRICTED_GUEST capability is available.
+ */
+ if (error == 0)
+ return (VM_MAXCPU);
+ else
+ return (1);
+}
+
+int
+main(int argc, char *argv[])
+{
+ int c, error, gdb_port, inject_bkpt, tmp, err, ioapic, bvmcons;
+ int max_vcpus;
+ struct vmctx *ctx;
+ uint64_t rip;
+
+ bvmcons = 0;
+ inject_bkpt = 0;
+ progname = basename(argv[0]);
+ gdb_port = DEFAULT_GDB_PORT;
+ guest_ncpus = 1;
+ ioapic = 0;
+
+	while ((c = getopt(argc, argv, "abehABHIPxp:g:c:z:s:S:t:n:m:M:")) != -1) {
+ switch (c) {
+ case 'a':
+ disable_x2apic = 1;
+ break;
+ case 'A':
+ acpi = 1;
+ break;
+ case 'b':
+ bvmcons = 1;
+ break;
+ case 'B':
+ inject_bkpt = 1;
+ break;
+ case 'x':
+ guest_vcpu_mux = 1;
+ break;
+ case 'p':
+ pincpu = atoi(optarg);
+ break;
+ case 'c':
+ guest_ncpus = atoi(optarg);
+ break;
+ case 'g':
+ gdb_port = atoi(optarg);
+ break;
+ case 'z':
+ guest_hz = atoi(optarg);
+ break;
+ case 't':
+ guest_tslice = atoi(optarg);
+ break;
+ case 's':
+ pci_parse_slot(optarg, 0);
+ break;
+ case 'S':
+ pci_parse_slot(optarg, 1);
+ break;
+ case 'm':
+ lomem_sz = strtoul(optarg, NULL, 0) * MB;
+ break;
+ case 'M':
+ himem_sz = strtoul(optarg, NULL, 0) * MB;
+ break;
+ case 'H':
+ guest_vmexit_on_hlt = 1;
+ break;
+ case 'I':
+ ioapic = 1;
+ break;
+ case 'P':
+ guest_vmexit_on_pause = 1;
+ break;
+ case 'e':
+ strictio = 1;
+ break;
+ case 'h':
+ usage(0);
+ default:
+ usage(1);
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1)
+ usage(1);
+
+ /* No need to mux if guest is uni-processor */
+ if (guest_ncpus <= 1)
+ guest_vcpu_mux = 0;
+
+	/* vmexit on hlt or pause if guest is muxed */
+ if (guest_vcpu_mux) {
+ guest_vmexit_on_hlt = 1;
+ guest_vmexit_on_pause = 1;
+ }
+
+ vmname = argv[0];
+
+ ctx = vm_open(vmname);
+ if (ctx == NULL) {
+ perror("vm_open");
+ exit(1);
+ }
+
+ max_vcpus = num_vcpus_allowed(ctx);
+ if (guest_ncpus > max_vcpus) {
+ fprintf(stderr, "%d vCPUs requested but only %d available\n",
+ guest_ncpus, max_vcpus);
+ exit(1);
+ }
+
+ if (fbsdrun_vmexit_on_hlt()) {
+ err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp);
+ if (err < 0) {
+ fprintf(stderr, "VM exit on HLT not supported\n");
+ exit(1);
+ }
+ vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1);
+ handler[VM_EXITCODE_HLT] = vmexit_hlt;
+ }
+
+ if (fbsdrun_vmexit_on_pause()) {
+ /*
+ * pause exit support required for this mode
+ */
+ err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp);
+ if (err < 0) {
+ fprintf(stderr,
+ "SMP mux requested, no pause support\n");
+ exit(1);
+ }
+ vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1);
+ handler[VM_EXITCODE_PAUSE] = vmexit_pause;
+ }
+
+ if (fbsdrun_disable_x2apic())
+ err = vm_set_x2apic_state(ctx, BSP, X2APIC_DISABLED);
+ else
+ err = vm_set_x2apic_state(ctx, BSP, X2APIC_ENABLED);
+
+ if (err) {
+ fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
+ exit(1);
+ }
+
+ if (lomem_sz != 0) {
+ lomem_addr = vm_map_memory(ctx, 0, lomem_sz);
+ if (lomem_addr == (char *) MAP_FAILED) {
+ lomem_sz = 0;
+ } else if (himem_sz != 0) {
+ himem_addr = vm_map_memory(ctx, 4*GB, himem_sz);
+ if (himem_addr == (char *) MAP_FAILED) {
+ lomem_sz = 0;
+ himem_sz = 0;
+ }
+ }
+ }
+
+ init_inout();
+ init_pci(ctx);
+ if (ioapic)
+ ioapic_init(0);
+
+ if (gdb_port != 0)
+ init_dbgport(gdb_port);
+
+ if (bvmcons)
+ init_bvmcons();
+
+ error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
+ assert(error == 0);
+
+ if (inject_bkpt) {
+ error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP);
+ assert(error == 0);
+ }
+
+ /*
+ * build the guest tables, MP etc.
+ */
+ mptable_build(ctx, guest_ncpus, ioapic);
+
+ if (acpi) {
+ error = acpi_build(ctx, guest_ncpus, ioapic);
+ assert(error == 0);
+ }
+
+ /*
+ * Add CPU 0
+ */
+ fbsdrun_addcpu(ctx, BSP, rip);
+
+ /*
+ * Head off to the main event dispatch loop
+ */
+ mevent_dispatch();
+
+ exit(1);
+}
diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h
new file mode 100644
index 0000000..45033b8
--- /dev/null
+++ b/usr.sbin/bhyve/bhyverun.h
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _FBSDRUN_H_
+#define _FBSDRUN_H_
+
+#ifndef CTASSERT /* Allow lint to override */
+#define CTASSERT(x) _CTASSERT(x, __LINE__)
+#define _CTASSERT(x, y) __CTASSERT(x, y)
+#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
+#endif
+
+struct vmctx;
+extern int guest_hz;
+extern int guest_tslice;
+extern int guest_ncpus;
+extern char *vmname;
+
+extern u_long lomem_sz, himem_sz;
+
+void *paddr_guest2host(uintptr_t);
+
+void fbsdrun_addcpu(struct vmctx *ctx, int cpu, uint64_t rip);
+int fbsdrun_muxed(void);
+int fbsdrun_vmexit_on_hlt(void);
+int fbsdrun_vmexit_on_pause(void);
+int fbsdrun_disable_x2apic(void);
+#endif
diff --git a/usr.sbin/bhyve/consport.c b/usr.sbin/bhyve/consport.c
new file mode 100644
index 0000000..3915b6d
--- /dev/null
+++ b/usr.sbin/bhyve/consport.c
@@ -0,0 +1,140 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/select.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <termios.h>
+#include <unistd.h>
+#include <stdbool.h>
+
+#include "inout.h"
+
+#define BVM_CONSOLE_PORT 0x220
+#define BVM_CONS_SIG ('b' << 8 | 'v')
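+
+/*
+ * A guest can probe for the console with a 2-byte read of the port,
+ * comparing the result against the signature, e.g. (illustrative):
+ *
+ *	if (inw(BVM_CONSOLE_PORT) == BVM_CONS_SIG)
+ *		bvm_console_present = 1;
+ */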
+
+static struct termios tio_orig, tio_new;
+
+static void
+ttyclose(void)
+{
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
+}
+
+static void
+ttyopen(void)
+{
+ tcgetattr(STDIN_FILENO, &tio_orig);
+
+	tio_new = tio_orig;
+	cfmakeraw(&tio_new);
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
+
+ atexit(ttyclose);
+}
+
+static bool
+tty_char_available(void)
+{
+ fd_set rfds;
+ struct timeval tv;
+
+ FD_ZERO(&rfds);
+ FD_SET(STDIN_FILENO, &rfds);
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
+ return (true);
+ } else {
+ return (false);
+ }
+}
+
+static int
+ttyread(void)
+{
+ char rb;
+
+ if (tty_char_available()) {
+ read(STDIN_FILENO, &rb, 1);
+ return (rb & 0xff);
+ } else {
+ return (-1);
+ }
+}
+
+static void
+ttywrite(unsigned char wb)
+{
+ (void) write(STDOUT_FILENO, &wb, 1);
+}
+
+static int
+console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ static int opened;
+
+ if (bytes == 2 && in) {
+ *eax = BVM_CONS_SIG;
+ return (0);
+ }
+
+ if (bytes != 4)
+ return (-1);
+
+ if (!opened) {
+ ttyopen();
+ opened = 1;
+ }
+
+ if (in)
+ *eax = ttyread();
+ else
+ ttywrite(*eax);
+
+ return (0);
+}
+
+static struct inout_port consport = {
+ "bvmcons",
+ BVM_CONSOLE_PORT,
+ IOPORT_F_INOUT,
+ console_handler
+};
+
+void
+init_bvmcons(void)
+{
+
+ register_inout(&consport);
+}
diff --git a/usr.sbin/bhyve/dbgport.c b/usr.sbin/bhyve/dbgport.c
new file mode 100644
index 0000000..034531c
--- /dev/null
+++ b/usr.sbin/bhyve/dbgport.c
@@ -0,0 +1,138 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <sys/uio.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "inout.h"
+#include "dbgport.h"
+
+#define BVM_DBG_PORT 0x224
+#define BVM_DBG_SIG ('B' << 8 | 'V')
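+
+/*
+ * As with the bvm console, a guest can probe for the debug port with a
+ * 2-byte read and compare the result against BVM_DBG_SIG.
+ */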
+
+static int listen_fd, conn_fd;
+
+static struct sockaddr_in sin;
+
+static int
+dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ char ch;
+ int nwritten, nread, printonce;
+
+ if (bytes == 2 && in) {
+ *eax = BVM_DBG_SIG;
+ return (0);
+ }
+
+ if (bytes != 4)
+ return (-1);
+
+again:
+ printonce = 0;
+ while (conn_fd < 0) {
+ if (!printonce) {
+ printf("Waiting for connection from gdb\r\n");
+ printonce = 1;
+ }
+ conn_fd = accept(listen_fd, NULL, NULL);
+ if (conn_fd >= 0)
+ fcntl(conn_fd, F_SETFL, O_NONBLOCK);
+ else if (errno != EINTR)
+ perror("accept");
+ }
+
+ if (in) {
+ nread = read(conn_fd, &ch, 1);
+ if (nread == -1 && errno == EAGAIN)
+ *eax = -1;
+ else if (nread == 1)
+ *eax = ch;
+ else {
+ close(conn_fd);
+ conn_fd = -1;
+ goto again;
+ }
+ } else {
+ ch = *eax;
+ nwritten = write(conn_fd, &ch, 1);
+ if (nwritten != 1) {
+ close(conn_fd);
+ conn_fd = -1;
+ goto again;
+ }
+ }
+ return (0);
+}
+
+static struct inout_port dbgport = {
+ "bvmdbg",
+ BVM_DBG_PORT,
+ IOPORT_F_INOUT,
+ dbg_handler
+};
+
+void
+init_dbgport(int sport)
+{
+ conn_fd = -1;
+
+ if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+ perror("socket");
+ exit(1);
+ }
+
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(sport);
+
+ if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+ perror("bind");
+ exit(1);
+ }
+
+ if (listen(listen_fd, 1) < 0) {
+ perror("listen");
+ exit(1);
+ }
+
+ register_inout(&dbgport);
+}
diff --git a/usr.sbin/bhyve/dbgport.h b/usr.sbin/bhyve/dbgport.h
new file mode 100644
index 0000000..8c7dab7
--- /dev/null
+++ b/usr.sbin/bhyve/dbgport.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _DBGPORT_H_
+#define _DBGPORT_H_
+
+#define DEFAULT_GDB_PORT 6466
+
+void init_dbgport(int port);
+
+#endif
diff --git a/usr.sbin/bhyve/elcr.c b/usr.sbin/bhyve/elcr.c
new file mode 100644
index 0000000..2417ae1
--- /dev/null
+++ b/usr.sbin/bhyve/elcr.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include "inout.h"
+
+/*
+ * EISA interrupt Level Control Register.
+ *
+ * This is a 16-bit register with one bit for each of IRQ0 through IRQ15.
+ * A level-triggered IRQ is indicated by setting the corresponding bit to '1'.
+ */
+#define ELCR_PORT 0x4d0
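+
+/*
+ * Example (illustrative): to mark IRQ10 as level triggered, a guest would
+ * set bit 2 in the second byte of the register pair:
+ *
+ *	outb(ELCR_PORT + 1, inb(ELCR_PORT + 1) | (1 << (10 - 8)));
+ */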
+
+static uint8_t elcr[2] = { 0x00, 0x00 };
+
+static int
+elcr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int idx;
+
+ if (bytes != 1)
+ return (-1);
+
+ idx = port - ELCR_PORT;
+
+ if (in)
+ *eax = elcr[idx];
+ else
+ elcr[idx] = *eax;
+
+ return (0);
+}
+INOUT_PORT(elcr, ELCR_PORT + 0, IOPORT_F_INOUT, elcr_handler);
+INOUT_PORT(elcr, ELCR_PORT + 1, IOPORT_F_INOUT, elcr_handler);
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
new file mode 100644
index 0000000..5f47a89f
--- /dev/null
+++ b/usr.sbin/bhyve/inout.c
@@ -0,0 +1,151 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "inout.h"
+
+SET_DECLARE(inout_port_set, struct inout_port);
+
+#define MAX_IOPORTS (1 << 16)
+
+static struct {
+ const char *name;
+ int flags;
+ inout_func_t handler;
+ void *arg;
+} inout_handlers[MAX_IOPORTS];
+
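+/*
+ * Default handler: reads of an unclaimed port return all 1s, matching a
+ * floating ISA bus, and writes are silently discarded.
+ */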
+static int
+default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ if (in) {
+ switch (bytes) {
+ case 4:
+ *eax = 0xffffffff;
+ break;
+ case 2:
+ *eax = 0xffff;
+ break;
+ case 1:
+ *eax = 0xff;
+ break;
+ }
+ }
+
+ return (0);
+}
+
+int
+emulate_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, int strict)
+{
+ int flags;
+ uint32_t mask;
+ inout_func_t handler;
+ void *arg;
+
+ assert(port < MAX_IOPORTS);
+
+ handler = inout_handlers[port].handler;
+
+ if (strict && handler == default_inout)
+ return (-1);
+
+ if (!in) {
+ switch (bytes) {
+ case 1:
+ mask = 0xff;
+ break;
+ case 2:
+ mask = 0xffff;
+ break;
+ default:
+ mask = 0xffffffff;
+ break;
+ }
+ *eax = *eax & mask;
+ }
+
+ flags = inout_handlers[port].flags;
+ arg = inout_handlers[port].arg;
+
+ if ((in && (flags & IOPORT_F_IN)) || (!in && (flags & IOPORT_F_OUT)))
+ return ((*handler)(ctx, vcpu, in, port, bytes, eax, arg));
+ else
+ return (-1);
+}
+
+void
+init_inout(void)
+{
+ struct inout_port **iopp, *iop;
+ int i;
+
+ /*
+ * Set up the default handler for all ports
+ */
+ for (i = 0; i < MAX_IOPORTS; i++) {
+ inout_handlers[i].name = "default";
+ inout_handlers[i].flags = IOPORT_F_IN | IOPORT_F_OUT;
+ inout_handlers[i].handler = default_inout;
+ inout_handlers[i].arg = NULL;
+ }
+
+ /*
+ * Overwrite with specified handlers
+ */
+ SET_FOREACH(iopp, inout_port_set) {
+ iop = *iopp;
+ assert(iop->port < MAX_IOPORTS);
+ inout_handlers[iop->port].name = iop->name;
+ inout_handlers[iop->port].flags = iop->flags;
+ inout_handlers[iop->port].handler = iop->handler;
+		inout_handlers[iop->port].arg = iop->arg;
+ }
+}
+
+int
+register_inout(struct inout_port *iop)
+{
+ assert(iop->port < MAX_IOPORTS);
+ inout_handlers[iop->port].name = iop->name;
+ inout_handlers[iop->port].flags = iop->flags;
+ inout_handlers[iop->port].handler = iop->handler;
+ inout_handlers[iop->port].arg = iop->arg;
+
+ return (0);
+}
diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h
new file mode 100644
index 0000000..a73b78d
--- /dev/null
+++ b/usr.sbin/bhyve/inout.h
@@ -0,0 +1,67 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _INOUT_H_
+#define _INOUT_H_
+
+#include <sys/linker_set.h>
+
+struct vmctx;
+
+typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
+ int bytes, uint32_t *eax, void *arg);
+
+struct inout_port {
+ const char *name;
+ int port;
+ int flags;
+ inout_func_t handler;
+ void *arg;
+};
+#define IOPORT_F_IN 0x1
+#define IOPORT_F_OUT 0x2
+#define IOPORT_F_INOUT 0x3
+
+#define INOUT_PORT(name, port, flags, handler) \
+ static struct inout_port __CONCAT(__inout_port, __LINE__) = { \
+ #name, \
+ (port), \
+ (flags), \
+ (handler), \
+ 0 \
+ }; \
+ DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))
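+
+/*
+ * Example usage (hypothetical device): a file-scope registration such as
+ *
+ *	INOUT_PORT(mydev, 0x300, IOPORT_F_INOUT, mydev_handler);
+ *
+ * adds the entry to the linker set that init_inout() walks at startup.
+ */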
+
+void init_inout(void);
+int emulate_inout(struct vmctx *, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, int strict);
+int register_inout(struct inout_port *iop);
+
+void init_bvmcons(void);
+
+#endif /* _INOUT_H_ */
diff --git a/usr.sbin/bhyve/ioapic.c b/usr.sbin/bhyve/ioapic.c
new file mode 100644
index 0000000..c712692
--- /dev/null
+++ b/usr.sbin/bhyve/ioapic.c
@@ -0,0 +1,324 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <x86/apicreg.h>
+#include <machine/vmm.h>
+
+#include <string.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include <vmmapi.h>
+
+#include "inout.h"
+#include "mem.h"
+#include "bhyverun.h"
+
+#include <stdio.h>
+
+#define IOAPIC_PADDR 0xFEC00000
+
+#define IOREGSEL 0x00
+#define IOWIN 0x10
+
+#define REDIR_ENTRIES 16
+#define INTR_ASSERTED(ioapic, pin) ((ioapic)->pinstate[(pin)] == true)
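+
+/*
+ * Each 64-bit redirection table entry is exposed to the guest as two
+ * 32-bit registers: per the IOAPIC specification, pin 'n' occupies
+ * register indices IOAPIC_REDTBL + 2n (low word) and IOAPIC_REDTBL +
+ * 2n + 1 (high word).
+ */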
+
+struct ioapic {
+ int inited;
+ uint32_t id;
+ uint64_t redtbl[REDIR_ENTRIES];
+ bool pinstate[REDIR_ENTRIES];
+
+ uintptr_t paddr; /* gpa where the ioapic is mapped */
+ uint32_t ioregsel;
+ struct memory_region *region;
+};
+
+static struct ioapic ioapics[1]; /* only a single ioapic for now */
+
+static int ioapic_region_read(struct ioapic *ioapic, uintptr_t paddr,
+ int size, uint64_t *data);
+static int ioapic_region_write(struct ioapic *ioapic, uintptr_t paddr,
+ int size, uint64_t data);
+static int ioapic_region_handler(struct vmctx *vm, int vcpu, int dir,
+ uintptr_t paddr, int size, uint64_t *val,
+ void *arg1, long arg2);
+
+static void
+ioapic_set_pinstate(struct vmctx *ctx, int pin, bool newstate)
+{
+ int vector, apicid, vcpu;
+ uint32_t low, high;
+ struct ioapic *ioapic;
+
+ ioapic = &ioapics[0]; /* assume a single ioapic */
+
+ if (pin < 0 || pin >= REDIR_ENTRIES)
+ return;
+
+ /* Nothing to do if interrupt pin has not changed state */
+ if (ioapic->pinstate[pin] == newstate)
+ return;
+
+ ioapic->pinstate[pin] = newstate; /* record it */
+
+ /* Nothing to do if interrupt pin is deasserted */
+ if (!INTR_ASSERTED(ioapic, pin))
+ return;
+
+ /*
+ * XXX
+ * We only deal with:
+ * - edge triggered interrupts
+ * - physical destination mode
+ * - fixed delivery mode
+ */
+ low = ioapic->redtbl[pin];
+ high = ioapic->redtbl[pin] >> 32;
+ if ((low & IOART_INTMASK) == IOART_INTMCLR &&
+ (low & IOART_TRGRMOD) == IOART_TRGREDG &&
+ (low & IOART_DESTMOD) == IOART_DESTPHY &&
+ (low & IOART_DELMOD) == IOART_DELFIXED) {
+ vector = low & IOART_INTVEC;
+ apicid = high >> APIC_ID_SHIFT;
+ if (apicid != 0xff) {
+ /* unicast */
+ vcpu = vm_apicid2vcpu(ctx, apicid);
+ vm_lapic_irq(ctx, vcpu, vector);
+ } else {
+ /* broadcast */
+ vcpu = 0;
+ while (vcpu < guest_ncpus) {
+ vm_lapic_irq(ctx, vcpu, vector);
+ vcpu++;
+ }
+ }
+ }
+}
+
+void
+ioapic_deassert_pin(struct vmctx *ctx, int pin)
+{
+ ioapic_set_pinstate(ctx, pin, false);
+}
+
+void
+ioapic_assert_pin(struct vmctx *ctx, int pin)
+{
+ ioapic_set_pinstate(ctx, pin, true);
+}
+
+void
+ioapic_init(int which)
+{
+ struct mem_range memp;
+ struct ioapic *ioapic;
+ int error;
+ int i;
+
+ assert(which == 0);
+
+ ioapic = &ioapics[which];
+ assert(ioapic->inited == 0);
+
+ bzero(ioapic, sizeof(struct ioapic));
+
+ /* Initialize all redirection entries to mask all interrupts */
+ for (i = 0; i < REDIR_ENTRIES; i++)
+ ioapic->redtbl[i] = 0x0001000000010000UL;
+
+ ioapic->paddr = IOAPIC_PADDR;
+
+ /* Register emulated memory region */
+ memp.name = "ioapic";
+ memp.flags = MEM_F_RW;
+ memp.handler = ioapic_region_handler;
+ memp.arg1 = ioapic;
+ memp.arg2 = which;
+ memp.base = ioapic->paddr;
+ memp.size = sizeof(struct IOAPIC);
+ error = register_mem(&memp);
+
+	assert(error == 0);
+
+ ioapic->inited = 1;
+}
+
+static uint32_t
+ioapic_read(struct ioapic *ioapic, uint32_t addr)
+{
+ int regnum, pin, rshift;
+
+ assert(ioapic->inited);
+
+ regnum = addr & 0xff;
+	switch (regnum) {
+	case IOAPIC_ID:
+		return (ioapic->id);
+	case IOAPIC_VER:
+		return ((REDIR_ENTRIES << MAXREDIRSHIFT) | 0x11);
+	case IOAPIC_ARB:
+		return (ioapic->id);
+	default:
+		break;
+	}
+
+ /* redirection table entries */
+ if (regnum >= IOAPIC_REDTBL &&
+ regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) {
+ pin = (regnum - IOAPIC_REDTBL) / 2;
+ if ((regnum - IOAPIC_REDTBL) % 2)
+ rshift = 32;
+ else
+ rshift = 0;
+
+ return (ioapic->redtbl[pin] >> rshift);
+ }
+
+ return (0);
+}
+
+static void
+ioapic_write(struct ioapic *ioapic, uint32_t addr, uint32_t data)
+{
+ int regnum, pin, lshift;
+
+ assert(ioapic->inited);
+
+ regnum = addr & 0xff;
+ switch (regnum) {
+ case IOAPIC_ID:
+ ioapic->id = data & APIC_ID_MASK;
+ break;
+ case IOAPIC_VER:
+ case IOAPIC_ARB:
+ /* readonly */
+ break;
+ default:
+ break;
+ }
+
+ /* redirection table entries */
+ if (regnum >= IOAPIC_REDTBL &&
+ regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) {
+ pin = (regnum - IOAPIC_REDTBL) / 2;
+ if ((regnum - IOAPIC_REDTBL) % 2)
+ lshift = 32;
+ else
+ lshift = 0;
+
+ ioapic->redtbl[pin] &= ~((uint64_t)0xffffffff << lshift);
+ ioapic->redtbl[pin] |= ((uint64_t)data << lshift);
+ }
+}
+
+static int
+ioapic_region_read(struct ioapic *ioapic, uintptr_t paddr, int size,
+ uint64_t *data)
+{
+ int offset;
+
+ offset = paddr - ioapic->paddr;
+
+ /*
+ * The IOAPIC specification allows 32-bit wide accesses to the
+ * IOREGSEL (offset 0) and IOWIN (offset 16) registers.
+ */
+ if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) {
+#if 1
+ printf("invalid access to ioapic%d: size %d, offset %d\n",
+ (int)(ioapic - ioapics), size, offset);
+#endif
+ *data = 0;
+ return (0);
+ }
+
+ if (offset == IOREGSEL)
+ *data = ioapic->ioregsel;
+ else
+ *data = ioapic_read(ioapic, ioapic->ioregsel);
+
+ return (0);
+}
+
+static int
+ioapic_region_write(struct ioapic *ioapic, uintptr_t paddr, int size,
+ uint64_t data)
+{
+ int offset;
+
+ offset = paddr - ioapic->paddr;
+
+ /*
+	 * The IOAPIC specification allows 32-bit wide accesses to the
+ * IOREGSEL (offset 0) and IOWIN (offset 16) registers.
+ */
+ if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) {
+#if 1
+ printf("invalid access to ioapic%d: size %d, offset %d\n",
+ (int)(ioapic - ioapics), size, offset);
+#endif
+ return (0);
+ }
+
+ if (offset == IOREGSEL)
+ ioapic->ioregsel = data;
+ else
+ ioapic_write(ioapic, ioapic->ioregsel, data);
+
+ return (0);
+}
+
+static int
+ioapic_region_handler(struct vmctx *vm, int vcpu, int dir, uintptr_t paddr,
+ int size, uint64_t *val, void *arg1, long arg2)
+{
+ struct ioapic *ioapic;
+ int which;
+
+ ioapic = arg1;
+ which = arg2;
+
+ assert(ioapic == &ioapics[which]);
+
+ if (dir == MEM_F_READ)
+ ioapic_region_read(ioapic, paddr, size, val);
+ else
+ ioapic_region_write(ioapic, paddr, size, *val);
+
+ return (0);
+}
diff --git a/usr.sbin/bhyve/ioapic.h b/usr.sbin/bhyve/ioapic.h
new file mode 100644
index 0000000..4696f9a
--- /dev/null
+++ b/usr.sbin/bhyve/ioapic.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IOAPIC_H_
+#define _IOAPIC_H_
+
+struct vmctx;
+
+void ioapic_init(int num);
+void ioapic_deassert_pin(struct vmctx *ctx, int pin);
+void ioapic_assert_pin(struct vmctx *ctx, int pin);
+
+#endif
diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c
new file mode 100644
index 0000000..27f4782
--- /dev/null
+++ b/usr.sbin/bhyve/mem.c
@@ -0,0 +1,218 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Memory ranges are represented with an RB tree. On insertion, the range
+ * is checked for overlaps. On lookup, the key has the same base and limit
+ * so that any range it falls within will compare as equal.
+ *
+ * It is assumed that all setup of ranges takes place in single-threaded
+ * mode before vCPUs have been started. As such, no locks are used on the
+ * RB tree. If this is no longer the case, then a r/w lock could be used,
+ * with readers taking the lock around lookups and a writer taking it when
+ * the tree needs to be changed (and the per-vCPU caches flushed).
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/tree.h>
+#include <sys/errno.h>
+#include <machine/vmm.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "mem.h"
+
+struct mmio_rb_range {
+ RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */
+ struct mem_range mr_param;
+ uint64_t mr_base;
+ uint64_t mr_end;
+};
+
+struct mmio_rb_tree;
+RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
+
+RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rbroot;
+
+/*
+ * Per-vCPU cache. Since most accesses from a vCPU will be to
+ * consecutive addresses in a range, it makes sense to cache the
+ * result of a lookup.
+ */
+static struct mmio_rb_range *mmio_hint[VM_MAXCPU];
+
+static int
+mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b)
+{
+ if (a->mr_end < b->mr_base)
+ return (-1);
+ else if (a->mr_base > b->mr_end)
+ return (1);
+ return (0);
+}
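+
+/*
+ * Example (illustrative): a lookup key with mr_base == mr_end == 0xfec00010
+ * compares equal to a stored range [0xfec00000, 0xfec00fff], since the
+ * comparator above returns 0 whenever the two ranges overlap.
+ */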
+
+static int
+mmio_rb_lookup(uint64_t addr, struct mmio_rb_range **entry)
+{
+ struct mmio_rb_range find, *res;
+
+ find.mr_base = find.mr_end = addr;
+
+ res = RB_FIND(mmio_rb_tree, &mmio_rbroot, &find);
+
+ if (res != NULL) {
+ *entry = res;
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+static int
+mmio_rb_add(struct mmio_rb_range *new)
+{
+ struct mmio_rb_range *overlap;
+
+ overlap = RB_INSERT(mmio_rb_tree, &mmio_rbroot, new);
+
+ if (overlap != NULL) {
+#ifdef RB_DEBUG
+ printf("overlap detected: new %lx:%lx, tree %lx:%lx\n",
+ new->mr_base, new->mr_end,
+ overlap->mr_base, overlap->mr_end);
+#endif
+
+ return (EEXIST);
+ }
+
+ return (0);
+}
+
+#if 0
+static void
+mmio_rb_dump(void)
+{
+ struct mmio_rb_range *np;
+
+ RB_FOREACH(np, mmio_rb_tree, &mmio_rbroot) {
+ printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end,
+ np->mr_param.name);
+ }
+}
+#endif
+
+RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
+
+static int
+mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg)
+{
+ int error;
+ struct mem_range *mr = arg;
+
+ error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size,
+ rval, mr->arg1, mr->arg2);
+ return (error);
+}
+
+static int
+mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
+{
+ int error;
+ struct mem_range *mr = arg;
+
+ error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size,
+ &wval, mr->arg1, mr->arg2);
+ return (error);
+}
+
+int
+emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie)
+{
+ struct mmio_rb_range *entry;
+ int err;
+
+ /*
+ * First check the per-vCPU cache
+ */
+ if (mmio_hint[vcpu] &&
+ paddr >= mmio_hint[vcpu]->mr_base &&
+ paddr <= mmio_hint[vcpu]->mr_end) {
+ entry = mmio_hint[vcpu];
+ } else
+ entry = NULL;
+
+ if (entry == NULL) {
+ if (mmio_rb_lookup(paddr, &entry))
+ return (ESRCH);
+
+ /* Update the per-vCPU cache */
+ mmio_hint[vcpu] = entry;
+ }
+
+ assert(entry != NULL && entry == mmio_hint[vcpu]);
+
+ err = vmm_emulate_instruction(ctx, vcpu, paddr, vie,
+ mem_read, mem_write, &entry->mr_param);
+ return (err);
+}
+
+int
+register_mem(struct mem_range *memp)
+{
+ struct mmio_rb_range *mrp;
+ int err;
+
+ err = 0;
+
+ mrp = malloc(sizeof(struct mmio_rb_range));
+
+ if (mrp != NULL) {
+ mrp->mr_param = *memp;
+ mrp->mr_base = memp->base;
+ mrp->mr_end = memp->base + memp->size - 1;
+
+ err = mmio_rb_add(mrp);
+ if (err)
+ free(mrp);
+ } else
+ err = ENOMEM;
+
+ return (err);
+}
+
+void
+init_mem(void)
+{
+
+ RB_INIT(&mmio_rbroot);
+}
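+
+#if 0
+/*
+ * Usage sketch, illustrative only (the handler and range below are
+ * hypothetical and not part of the build): a device emulation fills in
+ * a struct mem_range and registers it; emulate_mem() then dispatches
+ * guest MMIO accesses in that range to the handler.
+ */
+static int
+null_mmio_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+		  int size, uint64_t *val, void *arg1, long arg2)
+{
+	if (dir == MEM_F_READ)
+		*val = 0;	/* reads return 0; writes are discarded */
+	return (0);
+}
+
+static void
+null_mmio_register(void)
+{
+	struct mem_range mr = {
+		.name	 = "null-mmio",
+		.flags	 = MEM_F_RW,
+		.handler = null_mmio_handler,
+		.base	 = 0xD0000000,	/* hypothetical guest-physical base */
+		.size	 = 0x1000,
+	};
+
+	(void) register_mem(&mr);	/* returns EEXIST on overlap */
+}
+#endif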
diff --git a/usr.sbin/bhyve/mem.h b/usr.sbin/bhyve/mem.h
new file mode 100644
index 0000000..88fafe1
--- /dev/null
+++ b/usr.sbin/bhyve/mem.h
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MEM_H_
+#define _MEM_H_
+
+#include <sys/linker_set.h>
+
+struct vmctx;
+
+typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int size, uint64_t *val, void *arg1, long arg2);
+
+struct mem_range {
+ const char *name;
+ int flags;
+ mem_func_t handler;
+ void *arg1;
+ long arg2;
+ uint64_t base;
+ uint64_t size;
+};
+#define MEM_F_READ 0x1
+#define MEM_F_WRITE 0x2
+#define MEM_F_RW 0x3
+
+void init_mem(void);
+int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie);
+
+int register_mem(struct mem_range *memp);
+
+#endif /* _MEM_H_ */
diff --git a/usr.sbin/bhyve/mevent.c b/usr.sbin/bhyve/mevent.c
new file mode 100644
index 0000000..a6109db
--- /dev/null
+++ b/usr.sbin/bhyve/mevent.c
@@ -0,0 +1,432 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Micro event library for FreeBSD, designed for a single i/o thread
+ * using kqueue, with events persistent by default.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/time.h>
+
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "mevent.h"
+
+#define MEVENT_MAX 64
+
+#define MEV_ENABLE 1
+#define MEV_DISABLE 2
+#define MEV_DEL_PENDING 3
+
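+/*
+ * Event life cycle (summary of the code below): mevent_add() creates an
+ * entry in MEV_ENABLE state on the change list; mevent_enable() and
+ * mevent_disable() toggle it between MEV_ENABLE and MEV_DISABLE;
+ * mevent_delete*() marks it MEV_DEL_PENDING, and mevent_build() frees
+ * it on its next pass over the change list.
+ */
+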
+extern char *vmname;
+
+static pthread_t mevent_tid;
+static int mevent_pipefd[2];
+static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
+
+struct mevent {
+ void (*me_func)(int, enum ev_type, void *);
+ int me_fd;
+ enum ev_type me_type;
+ void *me_param;
+ int me_cq;
+ int me_state;
+ int me_closefd;
+ LIST_ENTRY(mevent) me_list;
+};
+
+static LIST_HEAD(listhead, mevent) global_head, change_head;
+
+static void
+mevent_qlock(void)
+{
+ pthread_mutex_lock(&mevent_lmutex);
+}
+
+static void
+mevent_qunlock(void)
+{
+ pthread_mutex_unlock(&mevent_lmutex);
+}
+
+static void
+mevent_pipe_read(int fd, enum ev_type type, void *param)
+{
+ char buf[MEVENT_MAX];
+ int status;
+
+ /*
+ * Drain the pipe read side. The fd is non-blocking so this is
+ * safe to do.
+ */
+ do {
+ status = read(fd, buf, sizeof(buf));
+ } while (status == MEVENT_MAX);
+}
+
+static void
+mevent_notify(void)
+{
+ char c;
+
+ /*
+ * If calling from outside the i/o thread, write a byte on the
+ * pipe to force the i/o thread to exit the blocking kevent call.
+ */
+ if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
+ write(mevent_pipefd[1], &c, 1);
+ }
+}
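+
+/*
+ * This is the classic self-pipe pattern: the byte is only a wakeup
+ * token and is drained and discarded by mevent_pipe_read() once the
+ * i/o thread returns from its blocking kevent() call.
+ */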
+
+static int
+mevent_kq_filter(struct mevent *mevp)
+{
+ int retval;
+
+ retval = 0;
+
+ if (mevp->me_type == EVF_READ)
+ retval = EVFILT_READ;
+
+ if (mevp->me_type == EVF_WRITE)
+ retval = EVFILT_WRITE;
+
+ return (retval);
+}
+
+static int
+mevent_kq_flags(struct mevent *mevp)
+{
+ int ret;
+
+ switch (mevp->me_state) {
+ case MEV_ENABLE:
+ ret = EV_ADD;
+ break;
+ case MEV_DISABLE:
+ ret = EV_DISABLE;
+ break;
+	case MEV_DEL_PENDING:
+		ret = EV_DELETE;
+		break;
+	default:
+		ret = 0;	/* keep 'ret' initialized; all states handled above */
+		break;
+	}
+
+ return (ret);
+}
+
+static int
+mevent_kq_fflags(struct mevent *mevp)
+{
+ /* XXX nothing yet, perhaps EV_EOF for reads ? */
+ return (0);
+}
+
+static int
+mevent_build(int mfd, struct kevent *kev)
+{
+ struct mevent *mevp, *tmpp;
+ int i;
+
+ i = 0;
+
+ mevent_qlock();
+
+ LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
+ if (mevp->me_closefd) {
+ /*
+ * A close of the file descriptor will remove the
+ * event
+ */
+ close(mevp->me_fd);
+ } else {
+ kev[i].ident = mevp->me_fd;
+ kev[i].filter = mevent_kq_filter(mevp);
+ kev[i].flags = mevent_kq_flags(mevp);
+ kev[i].fflags = mevent_kq_fflags(mevp);
+ kev[i].data = 0;
+ kev[i].udata = mevp;
+ i++;
+ }
+
+ mevp->me_cq = 0;
+ LIST_REMOVE(mevp, me_list);
+
+ if (mevp->me_state == MEV_DEL_PENDING) {
+ free(mevp);
+ } else {
+ LIST_INSERT_HEAD(&global_head, mevp, me_list);
+ }
+
+ assert(i < MEVENT_MAX);
+ }
+
+ mevent_qunlock();
+
+ return (i);
+}
+
+static void
+mevent_handle(struct kevent *kev, int numev)
+{
+ struct mevent *mevp;
+ int i;
+
+ for (i = 0; i < numev; i++) {
+ mevp = kev[i].udata;
+
+ /* XXX check for EV_ERROR ? */
+
+ (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
+ }
+}
+
+struct mevent *
+mevent_add(int fd, enum ev_type type,
+ void (*func)(int, enum ev_type, void *), void *param)
+{
+ struct mevent *lp, *mevp;
+
+ if (fd < 0 || func == NULL) {
+ return (NULL);
+ }
+
+ mevp = NULL;
+
+ mevent_qlock();
+
+ /*
+ * Verify that the fd/type tuple is not present in any list
+ */
+ LIST_FOREACH(lp, &global_head, me_list) {
+ if (lp->me_fd == fd && lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ LIST_FOREACH(lp, &change_head, me_list) {
+ if (lp->me_fd == fd && lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ /*
+ * Allocate an entry, populate it, and add it to the change list.
+ */
+ mevp = malloc(sizeof(struct mevent));
+ if (mevp == NULL) {
+ goto exit;
+ }
+
+ memset(mevp, 0, sizeof(struct mevent));
+ mevp->me_fd = fd;
+ mevp->me_type = type;
+ mevp->me_func = func;
+ mevp->me_param = param;
+
+ LIST_INSERT_HEAD(&change_head, mevp, me_list);
+ mevp->me_cq = 1;
+ mevp->me_state = MEV_ENABLE;
+ mevent_notify();
+
+exit:
+ mevent_qunlock();
+
+ return (mevp);
+}
+
+static int
+mevent_update(struct mevent *evp, int newstate)
+{
+ /*
+ * It's not possible to enable/disable a deleted event
+ */
+ if (evp->me_state == MEV_DEL_PENDING)
+ return (EINVAL);
+
+ /*
+ * No update needed if state isn't changing
+ */
+ if (evp->me_state == newstate)
+ return (0);
+
+ mevent_qlock();
+
+ evp->me_state = newstate;
+
+ /*
+ * Place the entry onto the changed list if not already there.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_enable(struct mevent *evp)
+{
+
+ return (mevent_update(evp, MEV_ENABLE));
+}
+
+int
+mevent_disable(struct mevent *evp)
+{
+
+ return (mevent_update(evp, MEV_DISABLE));
+}
+
+static int
+mevent_delete_event(struct mevent *evp, int closefd)
+{
+ mevent_qlock();
+
+ /*
+ * Place the entry onto the changed list if not already there, and
+ * mark as to be deleted.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+ evp->me_state = MEV_DEL_PENDING;
+
+ if (closefd)
+ evp->me_closefd = 1;
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_delete(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 0));
+}
+
+int
+mevent_delete_close(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 1));
+}
+
+static void
+mevent_set_name(void)
+{
+ char tname[MAXCOMLEN + 1];
+
+ snprintf(tname, sizeof(tname), "%s mevent", vmname);
+ pthread_set_name_np(mevent_tid, tname);
+}
+
+void
+mevent_dispatch(void)
+{
+ struct kevent changelist[MEVENT_MAX];
+ struct kevent eventlist[MEVENT_MAX];
+ struct mevent *pipev;
+ int mfd;
+ int numev;
+ int ret;
+
+ mevent_tid = pthread_self();
+ mevent_set_name();
+
+ mfd = kqueue();
+ assert(mfd > 0);
+
+ /*
+ * Open the pipe that will be used for other threads to force
+ * the blocking kqueue call to exit by writing to it. Set the
+ * descriptor to non-blocking.
+ */
+	ret = pipe(mevent_pipefd);
+	if (ret < 0) {
+		perror("pipe");
+		exit(1);
+	}
+
+	/*
+	 * The read side must be non-blocking for the drain loop in
+	 * mevent_pipe_read() to be safe, so set the flag explicitly.
+	 */
+	ret = fcntl(mevent_pipefd[0], F_SETFL, O_NONBLOCK);
+	assert(ret == 0);
+
+ /*
+ * Add internal event handler for the pipe write fd
+ */
+ pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
+ assert(pipev != NULL);
+
+ for (;;) {
+ /*
+ * Build changelist if required.
+ * XXX the changelist can be put into the blocking call
+ * to eliminate the extra syscall. Currently better for
+ * debug.
+ */
+ numev = mevent_build(mfd, changelist);
+ if (numev) {
+ ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
+ if (ret == -1) {
+ perror("Error return from kevent change");
+ }
+ }
+
+ /*
+ * Block awaiting events
+ */
+ ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
+ if (ret == -1) {
+ perror("Error return from kevent monitor");
+ }
+
+ /*
+ * Handle reported events
+ */
+ mevent_handle(eventlist, ret);
+ }
+}
diff --git a/usr.sbin/bhyve/mevent.h b/usr.sbin/bhyve/mevent.h
new file mode 100644
index 0000000..32a9d74
--- /dev/null
+++ b/usr.sbin/bhyve/mevent.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MEVENT_H_
+#define _MEVENT_H_
+
+enum ev_type {
+ EVF_READ,
+ EVF_WRITE
+};
+
+struct mevent;
+
+struct mevent *mevent_add(int fd, enum ev_type type,
+ void (*func)(int, enum ev_type, void *),
+ void *param);
+int mevent_enable(struct mevent *evp);
+int mevent_disable(struct mevent *evp);
+int mevent_delete(struct mevent *evp);
+int mevent_delete_close(struct mevent *evp);
+
+void mevent_dispatch(void);
+
+#endif /* _MEVENT_H_ */
diff --git a/usr.sbin/bhyve/mevent_test.c b/usr.sbin/bhyve/mevent_test.c
new file mode 100644
index 0000000..c72a497
--- /dev/null
+++ b/usr.sbin/bhyve/mevent_test.c
@@ -0,0 +1,180 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Test program for the micro event library. It sets up a simple TCP
+ * echo service on port 4321.
+ *
+ * cc mevent_test.c mevent.c -lpthread
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#include "mevent.h"
+
+#define TEST_PORT 4321
+
+static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER;
+
+#define MEVENT_ECHO
+
+#ifdef MEVENT_ECHO
+struct esync {
+ pthread_mutex_t e_mt;
+ pthread_cond_t e_cond;
+};
+
+static void
+echoer_callback(int fd, enum ev_type type, void *param)
+{
+ struct esync *sync = param;
+
+ pthread_mutex_lock(&sync->e_mt);
+ pthread_cond_signal(&sync->e_cond);
+ pthread_mutex_unlock(&sync->e_mt);
+}
+
+static void *
+echoer(void *param)
+{
+ struct esync sync;
+ struct mevent *mev;
+ char buf[128];
+ int fd = (int)(uintptr_t) param;
+ int len;
+
+ pthread_mutex_init(&sync.e_mt, NULL);
+ pthread_cond_init(&sync.e_cond, NULL);
+
+ pthread_mutex_lock(&sync.e_mt);
+
+ mev = mevent_add(fd, EVF_READ, echoer_callback, &sync);
+ if (mev == NULL) {
+ printf("Could not allocate echoer event\n");
+ exit(1);
+ }
+
+ while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) {
+ len = read(fd, buf, sizeof(buf));
+ if (len > 0) {
+ write(fd, buf, len);
+			write(1, buf, len);
+ } else {
+ break;
+ }
+ }
+
+ mevent_delete_close(mev);
+
+	pthread_mutex_unlock(&sync.e_mt);
+	pthread_mutex_destroy(&sync.e_mt);
+	pthread_cond_destroy(&sync.e_cond);
+
+	return (NULL);
+}
+
+#else
+
+static void *
+echoer(void *param)
+{
+ char buf[128];
+ int fd = (int)(uintptr_t) param;
+ int len;
+
+ while ((len = read(fd, buf, sizeof(buf))) > 0) {
+ write(1, buf, len);
+	}
+
+	return (NULL);
+}
+#endif /* MEVENT_ECHO */
+
+static void
+acceptor_callback(int fd, enum ev_type type, void *param)
+{
+ pthread_mutex_lock(&accept_mutex);
+ pthread_cond_signal(&accept_condvar);
+ pthread_mutex_unlock(&accept_mutex);
+}
+
+static void *
+acceptor(void *param)
+{
+ struct sockaddr_in sin;
+ pthread_t tid;
+ int news;
+ int s;
+
+ if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+ perror("socket");
+ exit(1);
+ }
+
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(TEST_PORT);
+
+ if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+ perror("bind");
+ exit(1);
+ }
+
+ if (listen(s, 1) < 0) {
+ perror("listen");
+ exit(1);
+ }
+
+ (void) mevent_add(s, EVF_READ, acceptor_callback, NULL);
+
+ pthread_mutex_lock(&accept_mutex);
+
+ while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) {
+ news = accept(s, NULL, NULL);
+ if (news < 0) {
+ perror("accept error");
+ } else {
+ printf("incoming connection, spawning thread\n");
+ pthread_create(&tid, NULL, echoer,
+ (void *)(uintptr_t)news);
+ }
+	}
+
+	return (NULL);
+}
+
+int
+main(void)
+{
+ pthread_t tid;
+
+ pthread_create(&tid, NULL, acceptor, NULL);
+
+ mevent_dispatch();
+}
diff --git a/usr.sbin/bhyve/mptbl.c b/usr.sbin/bhyve/mptbl.c
new file mode 100644
index 0000000..52790f3
--- /dev/null
+++ b/usr.sbin/bhyve/mptbl.c
@@ -0,0 +1,398 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <x86/mptable.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#include "bhyverun.h"
+#include "mptbl.h"
+
+#define MPTABLE_BASE 0xF0000
+
+#define LAPIC_PADDR 0xFEE00000
+#define LAPIC_VERSION 16
+
+#define IOAPIC_PADDR 0xFEC00000
+#define IOAPIC_VERSION 0x11
+
+#define MP_SPECREV 4
+#define MPFP_SIG "_MP_"
+
+/* Configuration header defines */
+#define MPCH_SIG "PCMP"
+#define MPCH_OEMID "BHyVe "
+#define MPCH_OEMID_LEN 8
+#define MPCH_PRODID "Hypervisor "
+#define MPCH_PRODID_LEN 12
+
+/* Processor entry defines */
+#define MPEP_SIG_FAMILY 6 /* XXX bhyve should supply this */
+#define MPEP_SIG_MODEL 26
+#define MPEP_SIG_STEPPING 5
+#define MPEP_SIG \
+ ((MPEP_SIG_FAMILY << 8) | \
+ (MPEP_SIG_MODEL << 4) | \
+ (MPEP_SIG_STEPPING))
+
+#define MPEP_FEATURES (0xBFEBFBFF) /* XXX Intel i7 */
+
+/* Define processor entry struct since <x86/mptable.h> gets it wrong */
+typedef struct BPROCENTRY {
+ u_char type;
+ u_char apic_id;
+ u_char apic_version;
+ u_char cpu_flags;
+ uint32_t cpu_signature;
+ uint32_t feature_flags;
+ uint32_t reserved1;
+ uint32_t reserved2;
+} *bproc_entry_ptr;
+CTASSERT(sizeof(struct BPROCENTRY) == 20);
+
+/* Bus entry defines */
+#define MPE_NUM_BUSES 2
+#define MPE_BUSNAME_LEN 6
+#define MPE_BUSNAME_ISA "ISA "
+#define MPE_BUSNAME_PCI "PCI "
+
+static void *oem_tbl_start;
+static int oem_tbl_size;
+
+static uint8_t
+mpt_compute_checksum(void *base, size_t len)
+{
+ uint8_t *bytes;
+ uint8_t sum;
+
+	for (bytes = base, sum = 0; len > 0; len--) {
+ sum += *bytes++;
+ }
+
+ return (256 - sum);
+}
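+
+/*
+ * Worked example (illustrative): the checksum field of a structure is
+ * still zero while mpt_compute_checksum() runs, so after
+ *
+ *	p->checksum = mpt_compute_checksum(p, len);
+ *
+ * the 'len' bytes starting at 'p' sum to 0 modulo 256, which is the
+ * property the MP specification requires consumers to verify.
+ */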
+
+static void
+mpt_build_mpfp(mpfps_t mpfp, vm_paddr_t gpa)
+{
+
+ memset(mpfp, 0, sizeof(*mpfp));
+ memcpy(mpfp->signature, MPFP_SIG, 4);
+ mpfp->pap = gpa + sizeof(*mpfp);
+ mpfp->length = 1;
+ mpfp->spec_rev = MP_SPECREV;
+ mpfp->checksum = mpt_compute_checksum(mpfp, sizeof(*mpfp));
+}
+
+static void
+mpt_build_mpch(mpcth_t mpch)
+{
+
+ memset(mpch, 0, sizeof(*mpch));
+ memcpy(mpch->signature, MPCH_SIG, 4);
+ mpch->spec_rev = MP_SPECREV;
+ memcpy(mpch->oem_id, MPCH_OEMID, MPCH_OEMID_LEN);
+ memcpy(mpch->product_id, MPCH_PRODID, MPCH_PRODID_LEN);
+ mpch->apic_address = LAPIC_PADDR;
+}
+
+static void
+mpt_build_proc_entries(bproc_entry_ptr mpep, int ncpu)
+{
+ int i;
+
+ for (i = 0; i < ncpu; i++) {
+ memset(mpep, 0, sizeof(*mpep));
+ mpep->type = MPCT_ENTRY_PROCESSOR;
+ mpep->apic_id = i; // XXX
+ mpep->apic_version = LAPIC_VERSION;
+ mpep->cpu_flags = PROCENTRY_FLAG_EN;
+ if (i == 0)
+ mpep->cpu_flags |= PROCENTRY_FLAG_BP;
+ mpep->cpu_signature = MPEP_SIG;
+ mpep->feature_flags = MPEP_FEATURES;
+ mpep++;
+ }
+}
+
+static void
+mpt_build_bus_entries(bus_entry_ptr mpeb)
+{
+
+ memset(mpeb, 0, sizeof(*mpeb));
+ mpeb->type = MPCT_ENTRY_BUS;
+ mpeb->bus_id = ISA;
+ memcpy(mpeb->bus_type, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN);
+ mpeb++;
+
+ memset(mpeb, 0, sizeof(*mpeb));
+ mpeb->type = MPCT_ENTRY_BUS;
+ mpeb->bus_id = PCI;
+ memcpy(mpeb->bus_type, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN);
+}
+
+static void
+mpt_build_ioapic_entries(io_apic_entry_ptr mpei, int id)
+{
+
+ memset(mpei, 0, sizeof(*mpei));
+ mpei->type = MPCT_ENTRY_IOAPIC;
+ mpei->apic_id = id;
+ mpei->apic_version = IOAPIC_VERSION;
+ mpei->apic_flags = IOAPICENTRY_FLAG_EN;
+ mpei->apic_address = IOAPIC_PADDR;
+}
+
+#ifdef notyet
+static void
+mpt_build_ioint_entries(struct mpe_ioint *mpeii, int num_pins, int id)
+{
+ int pin;
+
+	/*
+	 * The following config is taken from the kernel's mptable.c,
+	 * mptable_parse_default_config_ints(...). For now just use the
+	 * default config; tweak it later if needed.
+	 */
+
+ /* Run through all 16 pins. */
+ for (pin = 0; pin < num_pins; pin++) {
+ memset(mpeii, 0, sizeof(*mpeii));
+ mpeii->entry_type = MP_ENTRY_IOINT;
+ mpeii->src_bus_id = MPE_BUSID_ISA;
+ mpeii->dst_apic_id = id;
+
+ /*
+ * All default configs route IRQs from bus 0 to the first 16
+ * pins of the first I/O APIC with an APIC ID of 2.
+ */
+ mpeii->dst_apic_intin = pin;
+ switch (pin) {
+ case 0:
+ /* Pin 0 is an ExtINT pin. */
+ mpeii->intr_type = MPEII_INTR_EXTINT;
+ break;
+ case 2:
+ /* IRQ 0 is routed to pin 2. */
+ mpeii->intr_type = MPEII_INTR_INT;
+ mpeii->src_bus_irq = 0;
+ break;
+ case 5:
+ case 10:
+ case 11:
+ /*
+ * PCI Irqs set to level triggered.
+ */
+ mpeii->intr_flags = MPEII_FLAGS_TRIGMODE_LEVEL;
+ mpeii->src_bus_id = MPE_BUSID_PCI;
+ default:
+ /* All other pins are identity mapped. */
+ mpeii->intr_type = MPEII_INTR_INT;
+ mpeii->src_bus_irq = pin;
+ break;
+ }
+ mpeii++;
+ }
+
+}
+
+#define COPYSTR(dest, src, bytes) \
+ memcpy(dest, src, bytes); \
+ str[bytes] = 0;
+
+static void
+mptable_dump(struct mp_floating_pointer *mpfp, struct mp_config_hdr *mpch)
+{
+ static char str[16];
+ int i;
+ char *cur;
+
+ union mpe {
+ struct mpe_proc *proc;
+ struct mpe_bus *bus;
+ struct mpe_ioapic *ioapic;
+ struct mpe_ioint *ioint;
+ struct mpe_lint *lnit;
+ char *p;
+ };
+
+ union mpe mpe;
+
+ printf(" MP Floating Pointer :\n");
+ COPYSTR(str, mpfp->signature, 4);
+ printf("\tsignature:\t%s\n", str);
+ printf("\tmpch paddr:\t%x\n", mpfp->mptable_paddr);
+ printf("\tlength:\t%x\n", mpfp->length);
+ printf("\tspecrec:\t%x\n", mpfp->specrev);
+ printf("\tchecksum:\t%x\n", mpfp->checksum);
+ printf("\tfeature1:\t%x\n", mpfp->feature1);
+ printf("\tfeature2:\t%x\n", mpfp->feature2);
+ printf("\tfeature3:\t%x\n", mpfp->feature3);
+ printf("\tfeature4:\t%x\n", mpfp->feature4);
+
+ printf(" MP Configuration Header :\n");
+ COPYSTR(str, mpch->signature, 4);
+ printf(" signature: %s\n", str);
+ printf(" length: %x\n", mpch->length);
+ printf(" specrec: %x\n", mpch->specrev);
+ printf(" checksum: %x\n", mpch->checksum);
+ COPYSTR(str, mpch->oemid, MPCH_OEMID_LEN);
+ printf(" oemid: %s\n", str);
+ COPYSTR(str, mpch->prodid, MPCH_PRODID_LEN);
+ printf(" prodid: %s\n", str);
+ printf(" oem_ptr: %x\n", mpch->oem_ptr);
+ printf(" oem_sz: %x\n", mpch->oem_sz);
+ printf(" nr_entries: %x\n", mpch->nr_entries);
+ printf(" apic paddr: %x\n", mpch->lapic_paddr);
+ printf(" ext_length: %x\n", mpch->ext_length);
+ printf(" ext_checksum: %x\n", mpch->ext_checksum);
+
+ cur = (char *)mpch + sizeof(*mpch);
+ for (i = 0; i < mpch->nr_entries; i++) {
+ mpe.p = cur;
+ switch(*mpe.p) {
+ case MP_ENTRY_PROC:
+ printf(" MP Processor Entry :\n");
+ printf(" lapic_id: %x\n", mpe.proc->lapic_id);
+ printf(" lapic_version: %x\n", mpe.proc->lapic_version);
+ printf(" proc_flags: %x\n", mpe.proc->proc_flags);
+ printf(" proc_signature: %x\n", mpe.proc->proc_signature);
+ printf(" feature_flags: %x\n", mpe.proc->feature_flags);
+ cur += sizeof(struct mpe_proc);
+ break;
+ case MP_ENTRY_BUS:
+ printf(" MP Bus Entry :\n");
+ printf(" busid: %x\n", mpe.bus->busid);
+ COPYSTR(str, mpe.bus->busname, MPE_BUSNAME_LEN);
+ printf(" busname: %s\n", str);
+ cur += sizeof(struct mpe_bus);
+ break;
+ case MP_ENTRY_IOAPIC:
+ printf(" MP IOAPIC Entry :\n");
+ printf(" ioapi_id: %x\n", mpe.ioapic->ioapic_id);
+ printf(" ioapi_version: %x\n", mpe.ioapic->ioapic_version);
+ printf(" ioapi_flags: %x\n", mpe.ioapic->ioapic_flags);
+ printf(" ioapi_paddr: %x\n", mpe.ioapic->ioapic_paddr);
+ cur += sizeof(struct mpe_ioapic);
+ break;
+ case MP_ENTRY_IOINT:
+ printf(" MP IO Interrupt Entry :\n");
+ printf(" intr_type: %x\n", mpe.ioint->intr_type);
+ printf(" intr_flags: %x\n", mpe.ioint->intr_flags);
+ printf(" src_bus_id: %x\n", mpe.ioint->src_bus_id);
+ printf(" src_bus_irq: %x\n", mpe.ioint->src_bus_irq);
+ printf(" dst_apic_id: %x\n", mpe.ioint->dst_apic_id);
+ printf(" dst_apic_intin: %x\n", mpe.ioint->dst_apic_intin);
+ cur += sizeof(struct mpe_ioint);
+ break;
+ case MP_ENTRY_LINT:
+ printf(" MP Local Interrupt Entry :\n");
+ cur += sizeof(struct mpe_lint);
+ break;
+ }
+
+ }
+}
+#endif
+
+void
+mptable_add_oemtbl(void *tbl, int tblsz)
+{
+
+ oem_tbl_start = tbl;
+ oem_tbl_size = tblsz;
+}
+
+int
+mptable_build(struct vmctx *ctx, int ncpu, int ioapic)
+{
+ mpcth_t mpch;
+ bus_entry_ptr mpeb;
+ io_apic_entry_ptr mpei;
+ bproc_entry_ptr mpep;
+ mpfps_t mpfp;
+ char *curraddr;
+ char *startaddr;
+
+ if (paddr_guest2host(0) == NULL) {
+ printf("mptable requires mapped mem\n");
+ return (ENOMEM);
+ }
+
+ startaddr = curraddr = paddr_guest2host(MPTABLE_BASE);
+
+ mpfp = (mpfps_t)curraddr;
+ mpt_build_mpfp(mpfp, MPTABLE_BASE);
+ curraddr += sizeof(*mpfp);
+
+ mpch = (mpcth_t)curraddr;
+ mpt_build_mpch(mpch);
+ curraddr += sizeof(*mpch);
+
+ mpep = (bproc_entry_ptr)curraddr;
+ mpt_build_proc_entries(mpep, ncpu);
+ curraddr += sizeof(*mpep) * ncpu;
+ mpch->entry_count += ncpu;
+
+ mpeb = (bus_entry_ptr) curraddr;
+ mpt_build_bus_entries(mpeb);
+ curraddr += sizeof(*mpeb) * MPE_NUM_BUSES;
+ mpch->entry_count += MPE_NUM_BUSES;
+
+ if (ioapic) {
+ mpei = (io_apic_entry_ptr)curraddr;
+ mpt_build_ioapic_entries(mpei, ncpu + 1);
+ curraddr += sizeof(*mpei);
+ mpch->entry_count++;
+ }
+
+#ifdef notyet
+ mpt_build_ioint_entries((struct mpe_ioint*)curraddr, MPEII_MAX_IRQ,
+ ncpu + 1);
+ curraddr += sizeof(struct mpe_ioint) * MPEII_MAX_IRQ;
+ mpch->entry_count += MPEII_MAX_IRQ;
+#endif
+
+ if (oem_tbl_start) {
+ mpch->oem_table_pointer = curraddr - startaddr + MPTABLE_BASE;
+ mpch->oem_table_size = oem_tbl_size;
+ memcpy(curraddr, oem_tbl_start, oem_tbl_size);
+ }
+
+	mpch->base_table_length = curraddr - (char *)mpch;
+	/* The MP config table checksum covers the entire base table */
+	mpch->checksum = mpt_compute_checksum(mpch, mpch->base_table_length);
+
+ return (0);
+}
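+
+/*
+ * Resulting guest-physical layout at MPTABLE_BASE (illustrative):
+ *
+ *	0xF0000: MP floating pointer (pap points at 0xF0010)
+ *	0xF0010: MP configuration table header
+ *	    ...: ncpu processor entries, MPE_NUM_BUSES bus entries,
+ *	         an optional I/O APIC entry and an optional OEM table
+ */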
diff --git a/usr.sbin/bhyve/mptbl.h b/usr.sbin/bhyve/mptbl.h
new file mode 100644
index 0000000..3c4c527
--- /dev/null
+++ b/usr.sbin/bhyve/mptbl.h
@@ -0,0 +1,35 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MPTBL_H_
+#define _MPTBL_H_
+
+int mptable_build(struct vmctx *ctx, int ncpu, int ioapic);
+void mptable_add_oemtbl(void *tbl, int tblsz);
+
+#endif /* _MPTBL_H_ */
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
new file mode 100644
index 0000000..e086aeb
--- /dev/null
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -0,0 +1,1117 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "inout.h"
+#include "mem.h"
+#include "mptbl.h"
+#include "pci_emul.h"
+#include "ioapic.h"
+
+#define CONF1_ADDR_PORT 0x0cf8
+#define CONF1_DATA_PORT 0x0cfc
+
+#define CFGWRITE(pi,off,val,b) \
+do { \
+ if ((b) == 1) { \
+ pci_set_cfgdata8((pi),(off),(val)); \
+ } else if ((b) == 2) { \
+ pci_set_cfgdata16((pi),(off),(val)); \
+ } else { \
+ pci_set_cfgdata32((pi),(off),(val)); \
+ } \
+} while (0)
+
+#define MAXSLOTS (PCI_SLOTMAX + 1)
+#define MAXFUNCS (PCI_FUNCMAX + 1)
+
+static struct slotinfo {
+ char *si_name;
+ char *si_param;
+ struct pci_devinst *si_devi;
+ int si_legacy;
+} pci_slotinfo[MAXSLOTS][MAXFUNCS];
+
+/*
+ * Used to keep track of legacy interrupt owners/requestors
+ */
+#define NLIRQ 16
+
+static struct lirqinfo {
+ int li_generic;
+ int li_acount;
+ struct pci_devinst *li_owner; /* XXX should be a list */
+} lirq[NLIRQ];
+
+SET_DECLARE(pci_devemu_set, struct pci_devemu);
+
+static uint64_t pci_emul_iobase;
+static uint64_t pci_emul_membase32;
+static uint64_t pci_emul_membase64;
+
+#define PCI_EMUL_IOBASE 0x2000
+#define PCI_EMUL_IOLIMIT 0x10000
+
+#define PCI_EMUL_MEMBASE32 (lomem_sz)
+#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */
+
+#define PCI_EMUL_MEMBASE64 0xD000000000UL
+#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL
+
+static int pci_emul_devices;
+
+/*
+ * I/O access
+ */
+
+/*
+ * Slot options are in the form:
+ *
+ * <slot>[:<func>],<emul>[,<config>]
+ *
+ * slot is 0..31
+ * func is 0..7
+ * emul is a string describing the type of PCI device e.g. virtio-net
+ * config is an optional string, depending on the device, that can be
+ * used for configuration.
+ * Examples are:
+ * 1,virtio-net,tap0
+ * 3:0,dummy
+ */
+static void
+pci_parse_slot_usage(char *aopt)
+{
+ printf("Invalid PCI slot info field \"%s\"\n", aopt);
+ free(aopt);
+}
+
+void
+pci_parse_slot(char *opt, int legacy)
+{
+ char *slot, *func, *emul, *config;
+ char *str, *cpy;
+ int snum, fnum;
+
+ str = cpy = strdup(opt);
+
+ config = NULL;
+
+ if (strchr(str, ':') != NULL) {
+ slot = strsep(&str, ":");
+ func = strsep(&str, ",");
+ } else {
+ slot = strsep(&str, ",");
+ func = NULL;
+ }
+
+ emul = strsep(&str, ",");
+ if (str != NULL) {
+ config = strsep(&str, ",");
+ }
+
+ if (emul == NULL) {
+ pci_parse_slot_usage(cpy);
+ return;
+ }
+
+ snum = atoi(slot);
+ fnum = func ? atoi(func) : 0;
+ if (snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) {
+ pci_parse_slot_usage(cpy);
+ } else {
+ pci_slotinfo[snum][fnum].si_name = emul;
+ pci_slotinfo[snum][fnum].si_param = config;
+ pci_slotinfo[snum][fnum].si_legacy = legacy;
+ }
+}
+
+static int
+pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ struct pci_devinst *pdi = arg;
+ struct pci_devemu *pe = pdi->pi_d;
+ uint64_t offset;
+ int i;
+
+ for (i = 0; i <= PCI_BARMAX; i++) {
+ if (pdi->pi_bar[i].type == PCIBAR_IO &&
+ port >= pdi->pi_bar[i].addr &&
+ port + bytes <=
+ pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
+ offset = port - pdi->pi_bar[i].addr;
+ if (in)
+ *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i,
+ offset, bytes);
+ else
+ (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset,
+ bytes, *eax);
+ return (0);
+ }
+ }
+ return (-1);
+}
+
+static int
+pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int size, uint64_t *val, void *arg1, long arg2)
+{
+ struct pci_devinst *pdi = arg1;
+ struct pci_devemu *pe = pdi->pi_d;
+ uint64_t offset;
+ int bidx = (int) arg2;
+
+ assert(bidx <= PCI_BARMAX);
+ assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 ||
+ pdi->pi_bar[bidx].type == PCIBAR_MEM64);
+ assert(addr >= pdi->pi_bar[bidx].addr &&
+ addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size);
+
+ offset = addr - pdi->pi_bar[bidx].addr;
+
+ if (dir == MEM_F_WRITE)
+ (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, size, *val);
+ else
+ *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, size);
+
+ return (0);
+}
+
+
+static int
+pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
+ uint64_t *addr)
+{
+ uint64_t base;
+
+ assert((size & (size - 1)) == 0); /* must be a power of 2 */
+
+ base = roundup2(*baseptr, size);
+
+ if (base + size <= limit) {
+ *addr = base;
+ *baseptr = base + size;
+ return (0);
+ } else
+ return (-1);
+}
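+
+/*
+ * Worked example (illustrative): with *baseptr = 0x2100 and a request
+ * of size 0x1000, roundup2() aligns the base up to 0x3000; the BAR is
+ * placed there and *baseptr advances to 0x4000 for the next request.
+ */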
+
+int
+pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
+ uint64_t size)
+{
+
+ return (pci_emul_alloc_pbar(pdi, idx, 0, type, size));
+}
+
+int
+pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
+ enum pcibar_type type, uint64_t size)
+{
+ int i, error;
+ uint64_t *baseptr, limit, addr, mask, lobits, bar;
+ struct inout_port iop;
+ struct mem_range memp;
+
+ assert(idx >= 0 && idx <= PCI_BARMAX);
+
+ if ((size & (size - 1)) != 0)
+ size = 1UL << flsl(size); /* round up to a power of 2 */
+
+ switch (type) {
+ case PCIBAR_NONE:
+ baseptr = NULL;
+ addr = mask = lobits = 0;
+ break;
+ case PCIBAR_IO:
+ if (hostbase &&
+ pci_slotinfo[pdi->pi_slot][pdi->pi_func].si_legacy) {
+ assert(hostbase < PCI_EMUL_IOBASE);
+ baseptr = &hostbase;
+ } else {
+ baseptr = &pci_emul_iobase;
+ }
+ limit = PCI_EMUL_IOLIMIT;
+ mask = PCIM_BAR_IO_BASE;
+ lobits = PCIM_BAR_IO_SPACE;
+ break;
+ case PCIBAR_MEM64:
+ /*
+ * XXX
+ * Some drivers do not work well if the 64-bit BAR is allocated
+ * above 4GB. Allow for this by allocating small requests under
+		 * 4GB unless the allocation size is larger than some arbitrary
+ * number (32MB currently).
+ */
+ if (size > 32 * 1024 * 1024) {
+ /*
+ * XXX special case for device requiring peer-peer DMA
+ */
+ if (size == 0x100000000UL)
+ baseptr = &hostbase;
+ else
+ baseptr = &pci_emul_membase64;
+ limit = PCI_EMUL_MEMLIMIT64;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+ PCIM_BAR_MEM_PREFETCH;
+ break;
+ } else {
+ baseptr = &pci_emul_membase32;
+ limit = PCI_EMUL_MEMLIMIT32;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64;
+ }
+ break;
+ case PCIBAR_MEM32:
+ baseptr = &pci_emul_membase32;
+ limit = PCI_EMUL_MEMLIMIT32;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+ break;
+ default:
+		printf("pci_emul_alloc_pbar: invalid bar type %d\n", type);
+ assert(0);
+ }
+
+ if (baseptr != NULL) {
+ error = pci_emul_alloc_resource(baseptr, limit, size, &addr);
+ if (error != 0)
+ return (error);
+ }
+
+ pdi->pi_bar[idx].type = type;
+ pdi->pi_bar[idx].addr = addr;
+ pdi->pi_bar[idx].size = size;
+
+ /* Initialize the BAR register in config space */
+ bar = (addr & mask) | lobits;
+ pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar);
+
+ if (type == PCIBAR_MEM64) {
+ assert(idx + 1 <= PCI_BARMAX);
+ pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
+ pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
+ }
+
+ /* add a handler to intercept accesses to the I/O bar */
+ if (type == PCIBAR_IO) {
+ iop.name = pdi->pi_name;
+ iop.flags = IOPORT_F_INOUT;
+ iop.handler = pci_emul_io_handler;
+ iop.arg = pdi;
+
+ for (i = 0; i < size; i++) {
+ iop.port = addr + i;
+ register_inout(&iop);
+ }
+ } else if (type == PCIBAR_MEM32 || type == PCIBAR_MEM64) {
+ /* add memory bar intercept handler */
+ memp.name = pdi->pi_name;
+ memp.flags = MEM_F_RW;
+ memp.base = addr;
+ memp.size = size;
+ memp.handler = pci_emul_mem_handler;
+ memp.arg1 = pdi;
+ memp.arg2 = idx;
+
+ error = register_mem(&memp);
+ assert(error == 0);
+ }
+
+ return (0);
+}
+
+#define CAP_START_OFFSET 0x40
+static int
+pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen)
+{
+ int i, capoff, capid, reallen;
+ uint16_t sts;
+
+ static u_char endofcap[4] = {
+ PCIY_RESERVED, 0, 0, 0
+ };
+
+ assert(caplen > 0 && capdata[0] != PCIY_RESERVED);
+
+ reallen = roundup2(caplen, 4); /* dword aligned */
+
+ sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) {
+ capoff = CAP_START_OFFSET;
+ pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff);
+ pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT);
+ } else {
+ capoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
+ while (1) {
+ assert((capoff & 0x3) == 0);
+ capid = pci_get_cfgdata8(pi, capoff);
+ if (capid == PCIY_RESERVED)
+ break;
+ capoff = pci_get_cfgdata8(pi, capoff + 1);
+ }
+ }
+
+ /* Check if we have enough space */
+ if (capoff + reallen + sizeof(endofcap) > PCI_REGMAX + 1)
+ return (-1);
+
+ /* Copy the capability */
+ for (i = 0; i < caplen; i++)
+ pci_set_cfgdata8(pi, capoff + i, capdata[i]);
+
+ /* Set the next capability pointer */
+ pci_set_cfgdata8(pi, capoff + 1, capoff + reallen);
+
+ /* Copy of the reserved capability which serves as the end marker */
+ for (i = 0; i < sizeof(endofcap); i++)
+ pci_set_cfgdata8(pi, capoff + reallen + i, endofcap[i]);
+
+ return (0);
+}
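+
+/*
+ * Illustrative layout: after a single 14-byte MSI capability is added
+ * (rounded up to 16 bytes), config space holds a list rooted at
+ * PCIR_CAP_PTR:
+ *
+ *	0x34 (PCIR_CAP_PTR) -> 0x40: PCIY_MSI, next = 0x50, body...
+ *	                       0x50: PCIY_RESERVED end-of-list marker
+ */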
+
+static struct pci_devemu *
+pci_emul_finddev(char *name)
+{
+ struct pci_devemu **pdpp, *pdp;
+
+ SET_FOREACH(pdpp, pci_devemu_set) {
+ pdp = *pdpp;
+ if (!strcmp(pdp->pe_emu, name)) {
+ return (pdp);
+ }
+ }
+
+ return (NULL);
+}
+
+static void
+pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int slot, int func,
+ char *params)
+{
+ struct pci_devinst *pdi;
+ pdi = malloc(sizeof(struct pci_devinst));
+ bzero(pdi, sizeof(*pdi));
+
+ pdi->pi_vmctx = ctx;
+ pdi->pi_bus = 0;
+ pdi->pi_slot = slot;
+ pdi->pi_func = func;
+ pdi->pi_d = pde;
+ snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot);
+
+ /* Disable legacy interrupts */
+ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255);
+ pci_set_cfgdata8(pdi, PCIR_INTPIN, 0);
+
+ pci_set_cfgdata8(pdi, PCIR_COMMAND,
+ PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
+
+ if ((*pde->pe_init)(ctx, pdi, params) != 0) {
+ free(pdi);
+ } else {
+ pci_emul_devices++;
+ pci_slotinfo[slot][func].si_devi = pdi;
+ }
+}
+
+void
+pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr)
+{
+ int mmc;
+
+ CTASSERT(sizeof(struct msicap) == 14);
+
+ /* Number of msi messages must be a power of 2 between 1 and 32 */
+ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32);
+ mmc = ffs(msgnum) - 1;
+
+ bzero(msicap, sizeof(struct msicap));
+ msicap->capid = PCIY_MSI;
+ msicap->nextptr = nextptr;
+ msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1);
+}
+
+int
+pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
+{
+ struct msicap msicap;
+
+ pci_populate_msicap(&msicap, msgnum, 0);
+
+ return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
+}
+
+void
+msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val)
+{
+ uint16_t msgctrl, rwmask;
+ int off, table_bar;
+
+ off = offset - capoff;
+ table_bar = pi->pi_msix.table_bar;
+ /* Message Control Register */
+ if (off == 2 && bytes == 2) {
+ rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK;
+ msgctrl = pci_get_cfgdata16(pi, offset);
+ msgctrl &= ~rwmask;
+ msgctrl |= val & rwmask;
+ val = msgctrl;
+
+ pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
+ }
+
+ CFGWRITE(pi, offset, val, bytes);
+}
+
+void
+msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val)
+{
+ uint16_t msgctrl, rwmask, msgdata, mme;
+ uint32_t addrlo;
+
+ /*
+ * If guest is writing to the message control register make sure
+ * we do not overwrite read-only fields.
+ */
+ if ((offset - capoff) == 2 && bytes == 2) {
+ rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE;
+ msgctrl = pci_get_cfgdata16(pi, offset);
+ msgctrl &= ~rwmask;
+ msgctrl |= val & rwmask;
+ val = msgctrl;
+
+ addrlo = pci_get_cfgdata32(pi, capoff + 4);
+ if (msgctrl & PCIM_MSICTRL_64BIT)
+ msgdata = pci_get_cfgdata16(pi, capoff + 12);
+ else
+ msgdata = pci_get_cfgdata16(pi, capoff + 8);
+
+ /*
+ * XXX check delivery mode, destination mode etc
+ */
+ mme = msgctrl & PCIM_MSICTRL_MME_MASK;
+ pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0;
+ if (pi->pi_msi.enabled) {
+ pi->pi_msi.cpu = (addrlo >> 12) & 0xff;
+ pi->pi_msi.vector = msgdata & 0xff;
+ pi->pi_msi.msgnum = 1 << (mme >> 4);
+ } else {
+ pi->pi_msi.cpu = 0;
+ pi->pi_msi.vector = 0;
+ pi->pi_msi.msgnum = 0;
+ }
+ }
+
+ CFGWRITE(pi, offset, val, bytes);
+}
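+
+/*
+ * Worked example (illustrative): a guest that programs the MSI address
+ * register with 0xFEE01000 and the data register with 0x0041 is decoded
+ * above as cpu = (0xFEE01000 >> 12) & 0xff = 1 (the APIC destination
+ * ID) and vector = 0x41; pci_generate_msi() then injects that vector
+ * into vCPU 1's local APIC.
+ */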
+
+/*
+ * This function assumes that 'offset' is in the capabilities region of the
+ * config space.
+ */
+static void
+pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val)
+{
+ int capid;
+ uint8_t capoff, nextoff;
+
+ /* Do not allow un-aligned writes */
+ if ((offset & (bytes - 1)) != 0)
+ return;
+
+ /* Find the capability that we want to update */
+ capoff = CAP_START_OFFSET;
+ while (1) {
+ capid = pci_get_cfgdata8(pi, capoff);
+ if (capid == PCIY_RESERVED)
+ break;
+
+ nextoff = pci_get_cfgdata8(pi, capoff + 1);
+ if (offset >= capoff && offset < nextoff)
+ break;
+
+ capoff = nextoff;
+ }
+ assert(offset >= capoff);
+
+ /*
+ * Capability ID and Next Capability Pointer are readonly
+ */
+ if (offset == capoff || offset == capoff + 1)
+ return;
+
+ switch (capid) {
+ case PCIY_MSI:
+ msicap_cfgwrite(pi, capoff, offset, bytes, val);
+ break;
+ default:
+ break;
+ }
+}
+
+static int
+pci_emul_iscap(struct pci_devinst *pi, int offset)
+{
+ int found;
+ uint16_t sts;
+ uint8_t capid, lastoff;
+
+ found = 0;
+ sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+ if ((sts & PCIM_STATUS_CAPPRESENT) != 0) {
+ lastoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
+ while (1) {
+ assert((lastoff & 0x3) == 0);
+ capid = pci_get_cfgdata8(pi, lastoff);
+ if (capid == PCIY_RESERVED)
+ break;
+ lastoff = pci_get_cfgdata8(pi, lastoff + 1);
+ }
+ if (offset >= CAP_START_OFFSET && offset <= lastoff)
+ found = 1;
+ }
+ return (found);
+}
+
+void
+init_pci(struct vmctx *ctx)
+{
+ struct pci_devemu *pde;
+ struct slotinfo *si;
+ int slot, func;
+
+ pci_emul_iobase = PCI_EMUL_IOBASE;
+ pci_emul_membase32 = PCI_EMUL_MEMBASE32;
+ pci_emul_membase64 = PCI_EMUL_MEMBASE64;
+
+ for (slot = 0; slot < MAXSLOTS; slot++) {
+ for (func = 0; func < MAXFUNCS; func++) {
+ si = &pci_slotinfo[slot][func];
+ if (si->si_name != NULL) {
+ pde = pci_emul_finddev(si->si_name);
+ if (pde != NULL) {
+ pci_emul_init(ctx, pde, slot, func,
+ si->si_param);
+ }
+ }
+ }
+ }
+
+ /*
+ * Allow ISA IRQs 5,10,11,12, and 15 to be available for
+ * generic use
+ */
+ lirq[5].li_generic = 1;
+ lirq[10].li_generic = 1;
+ lirq[11].li_generic = 1;
+ lirq[12].li_generic = 1;
+ lirq[15].li_generic = 1;
+}
+
+int
+pci_msi_enabled(struct pci_devinst *pi)
+{
+ return (pi->pi_msi.enabled);
+}
+
+int
+pci_msi_msgnum(struct pci_devinst *pi)
+{
+ if (pi->pi_msi.enabled)
+ return (pi->pi_msi.msgnum);
+ else
+ return (0);
+}
+
+void
+pci_generate_msi(struct pci_devinst *pi, int msg)
+{
+
+ if (pci_msi_enabled(pi) && msg < pci_msi_msgnum(pi)) {
+ vm_lapic_irq(pi->pi_vmctx,
+ pi->pi_msi.cpu,
+ pi->pi_msi.vector + msg);
+ }
+}
+
+int
+pci_is_legacy(struct pci_devinst *pi)
+{
+
+ return (pci_slotinfo[pi->pi_slot][pi->pi_func].si_legacy);
+}
+
+static int
+pci_lintr_alloc(struct pci_devinst *pi, int vec)
+{
+ int i;
+
+ assert(vec < NLIRQ);
+
+ if (vec == -1) {
+ for (i = 0; i < NLIRQ; i++) {
+ if (lirq[i].li_generic &&
+ lirq[i].li_owner == NULL) {
+ vec = i;
+ break;
+ }
+ }
+ } else {
+ if (lirq[vec].li_owner != NULL) {
+ vec = -1;
+ }
+ }
+ assert(vec != -1);
+
+ lirq[vec].li_owner = pi;
+ pi->pi_lintr_pin = vec;
+
+ return (vec);
+}
+
+int
+pci_lintr_request(struct pci_devinst *pi, int vec)
+{
+
+ vec = pci_lintr_alloc(pi, vec);
+ pci_set_cfgdata8(pi, PCIR_INTLINE, vec);
+ pci_set_cfgdata8(pi, PCIR_INTPIN, 1);
+ return (0);
+}
+
+void
+pci_lintr_assert(struct pci_devinst *pi)
+{
+
+ assert(pi->pi_lintr_pin);
+ ioapic_assert_pin(pi->pi_vmctx, pi->pi_lintr_pin);
+}
+
+void
+pci_lintr_deassert(struct pci_devinst *pi)
+{
+
+ assert(pi->pi_lintr_pin);
+ ioapic_deassert_pin(pi->pi_vmctx, pi->pi_lintr_pin);
+}
+
+/*
+ * Return 1 if the emulated device in 'slot' is a multi-function device.
+ * Return 0 otherwise.
+ */
+static int
+pci_emul_is_mfdev(int slot)
+{
+ int f, numfuncs;
+
+ numfuncs = 0;
+ for (f = 0; f < MAXFUNCS; f++) {
+ if (pci_slotinfo[slot][f].si_devi != NULL) {
+ numfuncs++;
+ }
+ }
+ return (numfuncs > 1);
+}
+
+/*
+ * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on
+ * whether or not a multi-function device is being emulated in the pci 'slot'.
+ */
+static void
+pci_emul_hdrtype_fixup(int slot, int off, int bytes, uint32_t *rv)
+{
+ int mfdev;
+
+ if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) {
+ mfdev = pci_emul_is_mfdev(slot);
+ switch (bytes) {
+ case 1:
+ case 2:
+ *rv &= ~PCIM_MFDEV;
+ if (mfdev) {
+ *rv |= PCIM_MFDEV;
+ }
+ break;
+ case 4:
+ *rv &= ~(PCIM_MFDEV << 16);
+ if (mfdev) {
+ *rv |= (PCIM_MFDEV << 16);
+ }
+ break;
+ }
+ }
+}
+
+static int cfgbus, cfgslot, cfgfunc, cfgoff;
+
+static int
+pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ uint32_t x;
+
+ assert(!in);
+
+ if (bytes != 4)
+ return (-1);
+
+ x = *eax;
+ cfgoff = x & PCI_REGMAX;
+ cfgfunc = (x >> 8) & PCI_FUNCMAX;
+ cfgslot = (x >> 11) & PCI_SLOTMAX;
+ cfgbus = (x >> 16) & PCI_BUSMAX;
+
+ return (0);
+}
+INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_OUT, pci_emul_cfgaddr);
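+
+/*
+ * Worked example (illustrative): an OUT of 0x80001810 to port 0xcf8 is
+ * decoded above as cfgbus = 0, cfgslot = 3, cfgfunc = 0, cfgoff = 0x10,
+ * i.e. BAR0 of the device in slot 3.  A subsequent access to ports
+ * 0xcfc-0xcff then reads or writes that register via pci_emul_cfgdata().
+ */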
+
+static int
+pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ struct pci_devinst *pi;
+ struct pci_devemu *pe;
+ int coff, idx, needcfg;
+ uint64_t mask, bar;
+
+ assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+ if (cfgbus == 0)
+ pi = pci_slotinfo[cfgslot][cfgfunc].si_devi;
+ else
+ pi = NULL;
+
+ coff = cfgoff + (port - CONF1_DATA_PORT);
+
+#if 0
+ printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r",
+ in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc);
+#endif
+
+ /*
+ * Just return if there is no device at this cfgslot:cfgfunc or
+ * if the guest is doing an un-aligned access
+ */
+ if (pi == NULL || (coff & (bytes - 1)) != 0) {
+ if (in)
+ *eax = 0xffffffff;
+ return (0);
+ }
+
+ pe = pi->pi_d;
+
+ /*
+ * Config read
+ */
+ if (in) {
+ /* Let the device emulation override the default handler */
+ if (pe->pe_cfgread != NULL) {
+ needcfg = pe->pe_cfgread(ctx, vcpu, pi,
+ coff, bytes, eax);
+ } else {
+ needcfg = 1;
+ }
+
+ if (needcfg) {
+ if (bytes == 1)
+ *eax = pci_get_cfgdata8(pi, coff);
+ else if (bytes == 2)
+ *eax = pci_get_cfgdata16(pi, coff);
+ else
+ *eax = pci_get_cfgdata32(pi, coff);
+ }
+
+ pci_emul_hdrtype_fixup(cfgslot, coff, bytes, eax);
+ } else {
+ /* Let the device emulation override the default handler */
+ if (pe->pe_cfgwrite != NULL &&
+ (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
+ return (0);
+
+ /*
+ * Special handling for write to BAR registers
+ */
+ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
+ /*
+ * Ignore writes to BAR registers that are not
+ * 4-byte aligned.
+ */
+ if (bytes != 4 || (coff & 0x3) != 0)
+ return (0);
+ idx = (coff - PCIR_BAR(0)) / 4;
+ switch (pi->pi_bar[idx].type) {
+ case PCIBAR_NONE:
+ bar = 0;
+ break;
+ case PCIBAR_IO:
+ mask = ~(pi->pi_bar[idx].size - 1);
+ mask &= PCIM_BAR_IO_BASE;
+ bar = (*eax & mask) | PCIM_BAR_IO_SPACE;
+ break;
+ case PCIBAR_MEM32:
+ mask = ~(pi->pi_bar[idx].size - 1);
+ mask &= PCIM_BAR_MEM_BASE;
+ bar = *eax & mask;
+ bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+ break;
+ case PCIBAR_MEM64:
+ mask = ~(pi->pi_bar[idx].size - 1);
+ mask &= PCIM_BAR_MEM_BASE;
+ bar = *eax & mask;
+ bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+ PCIM_BAR_MEM_PREFETCH;
+ break;
+ case PCIBAR_MEMHI64:
+ mask = ~(pi->pi_bar[idx - 1].size - 1);
+ mask &= PCIM_BAR_MEM_BASE;
+ bar = ((uint64_t)*eax << 32) & mask;
+ bar = bar >> 32;
+ break;
+ default:
+ assert(0);
+ }
+ pci_set_cfgdata32(pi, coff, bar);
+
+ } else if (pci_emul_iscap(pi, coff)) {
+ pci_emul_capwrite(pi, coff, bytes, *eax);
+ } else {
+ CFGWRITE(pi, coff, *eax, bytes);
+ }
+ }
+
+ return (0);
+}
+
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
+
+/*
+ * I/O ports to configure PCI IRQ routing. We ignore all writes to it.
+ */
+static int
+pci_irq_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 0);
+ return (0);
+}
+INOUT_PORT(pci_irq, 0xC00, IOPORT_F_OUT, pci_irq_port_handler);
+INOUT_PORT(pci_irq, 0xC01, IOPORT_F_OUT, pci_irq_port_handler);
+
+#define PCI_EMUL_TEST
+#ifdef PCI_EMUL_TEST
+/*
+ * Define a dummy test device
+ */
+#define DIOSZ 20
+#define DMEMSZ 4096
+struct pci_emul_dsoftc {
+ uint8_t ioregs[DIOSZ];
+ uint8_t memregs[DMEMSZ];
+};
+
+#define PCI_EMUL_MSI_MSGS 4
+#define PCI_EMUL_MSIX_MSGS 16
+
+static int
+pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ int error;
+ struct pci_emul_dsoftc *sc;
+
+ sc = malloc(sizeof(struct pci_emul_dsoftc));
+ memset(sc, 0, sizeof(struct pci_emul_dsoftc));
+
+ pi->pi_arg = sc;
+
+ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
+ pci_set_cfgdata8(pi, PCIR_CLASS, 0x02);
+
+ error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS);
+ assert(error == 0);
+
+ error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ);
+ assert(error == 0);
+
+ error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ);
+ assert(error == 0);
+
+ return (0);
+}
+
+static void
+pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size, uint64_t value)
+{
+ int i;
+ struct pci_emul_dsoftc *sc = pi->pi_arg;
+
+ if (baridx == 0) {
+ if (offset + size > DIOSZ) {
+ printf("diow: iow too large, offset %ld size %d\n",
+ offset, size);
+ return;
+ }
+
+ if (size == 1) {
+ sc->ioregs[offset] = value & 0xff;
+ } else if (size == 2) {
+ *(uint16_t *)&sc->ioregs[offset] = value & 0xffff;
+ } else if (size == 4) {
+ *(uint32_t *)&sc->ioregs[offset] = value;
+ } else {
+ printf("diow: iow unknown size %d\n", size);
+ }
+
+ /*
+ * Special magic value to generate an interrupt
+ */
+ if (offset == 4 && size == 4 && pci_msi_enabled(pi))
+ pci_generate_msi(pi, value % pci_msi_msgnum(pi));
+
+ if (value == 0xabcdef) {
+ for (i = 0; i < pci_msi_msgnum(pi); i++)
+ pci_generate_msi(pi, i);
+ }
+ }
+
+ if (baridx == 1) {
+ if (offset + size > DMEMSZ) {
+ printf("diow: memw too large, offset %ld size %d\n",
+ offset, size);
+ return;
+ }
+
+ if (size == 1) {
+ sc->memregs[offset] = value;
+ } else if (size == 2) {
+ *(uint16_t *)&sc->memregs[offset] = value;
+ } else if (size == 4) {
+ *(uint32_t *)&sc->memregs[offset] = value;
+ } else if (size == 8) {
+ *(uint64_t *)&sc->memregs[offset] = value;
+ } else {
+ printf("diow: memw unknown size %d\n", size);
+ }
+
+ /*
+ * magic interrupt ??
+ */
+ }
+
+ if (baridx > 1) {
+ printf("diow: unknown bar idx %d\n", baridx);
+ }
+}
+
+static uint64_t
+pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size)
+{
+ struct pci_emul_dsoftc *sc = pi->pi_arg;
+ uint32_t value;
+
+ if (baridx == 0) {
+ if (offset + size > DIOSZ) {
+ printf("dior: ior too large, offset %ld size %d\n",
+ offset, size);
+ return (0);
+ }
+
+ if (size == 1) {
+ value = sc->ioregs[offset];
+ } else if (size == 2) {
+ value = *(uint16_t *) &sc->ioregs[offset];
+ } else if (size == 4) {
+ value = *(uint32_t *) &sc->ioregs[offset];
+		} else {
+			printf("dior: ior unknown size %d\n", size);
+			value = 0;
+		}
+ }
+
+ if (baridx == 1) {
+ if (offset + size > DMEMSZ) {
+ printf("dior: memr too large, offset %ld size %d\n",
+ offset, size);
+ return (0);
+ }
+
+ if (size == 1) {
+ value = sc->memregs[offset];
+ } else if (size == 2) {
+ value = *(uint16_t *) &sc->memregs[offset];
+ } else if (size == 4) {
+ value = *(uint32_t *) &sc->memregs[offset];
+ } else if (size == 8) {
+ value = *(uint64_t *) &sc->memregs[offset];
+ } else {
+			printf("dior: memr unknown size %d\n", size);
+ }
+ }
+
+ if (baridx > 1) {
+ printf("dior: unknown bar idx %d\n", baridx);
+ return (0);
+ }
+
+ return (value);
+}
+
+struct pci_devemu pci_dummy = {
+ .pe_emu = "dummy",
+ .pe_init = pci_emul_dinit,
+ .pe_barwrite = pci_emul_diow,
+ .pe_barread = pci_emul_dior
+};
+PCI_EMUL_SET(pci_dummy);
+
+#endif /* PCI_EMUL_TEST */
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
new file mode 100644
index 0000000..e924475
--- /dev/null
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -0,0 +1,216 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PCI_EMUL_H_
+#define _PCI_EMUL_H_
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/kernel.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <assert.h>
+
+#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */
+#define PCIY_RESERVED 0x00
+
+struct vmctx;
+struct pci_devinst;
+struct memory_region;
+
+struct pci_devemu {
+ char *pe_emu; /* Name of device emulation */
+
+ /* instance creation */
+ int (*pe_init)(struct vmctx *, struct pci_devinst *,
+ char *opts);
+
+ /* config space read/write callbacks */
+ int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int offset,
+ int bytes, uint32_t val);
+ int (*pe_cfgread)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int offset,
+ int bytes, uint32_t *retval);
+
+ /* BAR read/write callbacks */
+ void (*pe_barwrite)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size, uint64_t value);
+ uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size);
+};
+#define PCI_EMUL_SET(x)		DATA_SET(pci_devemu_set, x)
+
+enum pcibar_type {
+ PCIBAR_NONE,
+ PCIBAR_IO,
+ PCIBAR_MEM32,
+ PCIBAR_MEM64,
+ PCIBAR_MEMHI64
+};
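+
+/*
+ * Note that a 64-bit memory BAR occupies two consecutive BAR
+ * registers: the low half is tracked as PCIBAR_MEM64 and the high
+ * half as PCIBAR_MEMHI64 (see the 64-bit handling in cfginitbar()
+ * in pci_passthru.c).
+ */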
+
+struct pcibar {
+ enum pcibar_type type; /* io or memory */
+ uint64_t size;
+ uint64_t addr;
+};
+
+#define PI_NAMESZ 40
+
+struct msix_table_entry {
+ uint64_t addr;
+ uint32_t msg_data;
+ uint32_t vector_control;
+} __packed;
+
+/*
+ * In case the structure is modified to hold extra information, use a define
+ * for the size that should be emulated.
+ */
+#define MSIX_TABLE_ENTRY_SIZE 16
+#define MAX_MSIX_TABLE_SIZE 2048
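+
+/*
+ * Worked example: a 4-byte access at table offset 0x38 decodes as
+ * index = 0x38 / 16 = 3 and entry_offset = 0x38 % 16 = 8, i.e. the
+ * 'msg_data' field of the fourth struct msix_table_entry.
+ */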
+
+struct pci_devinst {
+ struct pci_devemu *pi_d;
+ struct vmctx *pi_vmctx;
+ uint8_t pi_bus, pi_slot, pi_func;
+ uint8_t pi_lintr_pin;
+ char pi_name[PI_NAMESZ];
+ uint16_t pi_iobase;
+ int pi_bar_getsize;
+
+ struct {
+ int enabled;
+ int cpu;
+ int vector;
+ int msgnum;
+ } pi_msi;
+
+ struct {
+ int enabled;
+ int table_bar;
+ int pba_bar;
+ size_t table_offset;
+ size_t table_size;
+ int table_count;
+ size_t pba_offset;
+ struct msix_table_entry table[MAX_MSIX_TABLE_SIZE];
+ } pi_msix;
+
+ void *pi_arg; /* devemu-private data */
+
+ u_char pi_cfgdata[PCI_REGMAX + 1];
+ struct pcibar pi_bar[PCI_BARMAX + 1];
+};
+
+struct msicap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t msgctrl;
+ uint32_t addrlo;
+ uint32_t addrhi;
+ uint16_t msgdata;
+} __packed;
+
+struct msixcap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t msgctrl;
+ uint32_t table_offset;
+ uint32_t pba_offset;
+} __packed;
+
+void init_pci(struct vmctx *ctx);
+void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val);
+void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val);
+void pci_callback(void);
+int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
+ enum pcibar_type type, uint64_t size);
+int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx,
+ uint64_t hostbase, enum pcibar_type type, uint64_t size);
+int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
+int pci_is_legacy(struct pci_devinst *pi);
+void pci_generate_msi(struct pci_devinst *pi, int msgnum);
+void pci_generate_msix(struct pci_devinst *pi, int msgnum);
+void pci_lintr_assert(struct pci_devinst *pi);
+void pci_lintr_deassert(struct pci_devinst *pi);
+int pci_lintr_request(struct pci_devinst *pi, int ivec);
+int pci_msi_enabled(struct pci_devinst *pi);
+int pci_msix_enabled(struct pci_devinst *pi);
+int pci_msi_msgnum(struct pci_devinst *pi);
+void pci_parse_slot(char *opt, int legacy);
+void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
+
+static __inline void
+pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
+{
+ assert(offset <= PCI_REGMAX);
+ *(uint8_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline void
+pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
+{
+ assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+ *(uint16_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline void
+pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
+{
+ assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+ *(uint32_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline uint8_t
+pci_get_cfgdata8(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= PCI_REGMAX);
+ return (*(uint8_t *)(pi->pi_cfgdata + offset));
+}
+
+static __inline uint16_t
+pci_get_cfgdata16(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+ return (*(uint16_t *)(pi->pi_cfgdata + offset));
+}
+
+static __inline uint32_t
+pci_get_cfgdata32(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+ return (*(uint32_t *)(pi->pi_cfgdata + offset));
+}
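+
+/*
+ * Typical usage from a device model's init routine (as done by the
+ * dummy test device in pci_emul.c):
+ *
+ *	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
+ *	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
+ */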
+
+#endif /* _PCI_EMUL_H_ */
diff --git a/usr.sbin/bhyve/pci_hostbridge.c b/usr.sbin/bhyve/pci_hostbridge.c
new file mode 100644
index 0000000..c77762d
--- /dev/null
+++ b/usr.sbin/bhyve/pci_hostbridge.c
@@ -0,0 +1,52 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "pci_emul.h"
+
+static int
+pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+
+ /* config space */
+ pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */
+ pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_BRIDGE);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
+ pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
+
+ return (0);
+}
+
+struct pci_devemu pci_de_hostbridge = {
+ .pe_emu = "hostbridge",
+ .pe_init = pci_hostbridge_init,
+};
+PCI_EMUL_SET(pci_de_hostbridge);
diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c
new file mode 100644
index 0000000..28abb6b
--- /dev/null
+++ b/usr.sbin/bhyve/pci_passthru.c
@@ -0,0 +1,724 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/pciio.h>
+#include <sys/ioctl.h>
+
+#include <dev/io/iodev.h>
+#include <machine/iodev.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+#include "pci_emul.h"
+#include "mem.h"
+
+#ifndef _PATH_DEVPCI
+#define _PATH_DEVPCI "/dev/pci"
+#endif
+
+#ifndef _PATH_DEVIO
+#define _PATH_DEVIO "/dev/io"
+#endif
+
+#define LEGACY_SUPPORT 1
+
+#define MSIX_TABLE_BIR_MASK 7
+#define MSIX_TABLE_OFFSET_MASK	(~MSIX_TABLE_BIR_MASK)
+#define MSIX_TABLE_COUNT(x) (((x) & 0x7FF) + 1)
+#define MSIX_CAPLEN 12
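+
+/*
+ * The table-size field (bits 10:0 of the MSI-X message control word)
+ * is encoded as N-1: e.g. a msgctrl value of 0x0007 describes an
+ * 8-entry table.
+ */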
+
+static int pcifd = -1;
+static int iofd = -1;
+
+struct passthru_softc {
+ struct pci_devinst *psc_pi;
+ struct pcibar psc_bar[PCI_BARMAX + 1];
+ struct {
+ int capoff;
+ int msgctrl;
+ int emulated;
+ } psc_msi;
+ struct {
+ int capoff;
+ } psc_msix;
+ struct pcisel psc_sel;
+};
+
+static int
+msi_caplen(int msgctrl)
+{
+ int len;
+
+ len = 10; /* minimum length of msi capability */
+
+ if (msgctrl & PCIM_MSICTRL_64BIT)
+ len += 4;
+
+#if 0
+ /*
+ * Ignore the 'mask' and 'pending' bits in the MSI capability.
+ * We'll let the guest manipulate them directly.
+ */
+ if (msgctrl & PCIM_MSICTRL_VECTOR)
+ len += 10;
+#endif
+
+ return (len);
+}
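+
+/*
+ * For example, a device advertising 64-bit message addresses
+ * (PCIM_MSICTRL_64BIT) yields a capability length of 10 + 4 = 14.
+ */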
+
+static uint32_t
+read_config(const struct pcisel *sel, long reg, int width)
+{
+ struct pci_io pi;
+
+ bzero(&pi, sizeof(pi));
+ pi.pi_sel = *sel;
+ pi.pi_reg = reg;
+ pi.pi_width = width;
+
+ if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
+ return (0); /* XXX */
+ else
+ return (pi.pi_data);
+}
+
+static void
+write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
+{
+ struct pci_io pi;
+
+ bzero(&pi, sizeof(pi));
+ pi.pi_sel = *sel;
+ pi.pi_reg = reg;
+ pi.pi_width = width;
+ pi.pi_data = data;
+
+ (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */
+}
+
+#ifdef LEGACY_SUPPORT
+static int
+passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
+{
+ int capoff, i;
+ struct msicap msicap;
+ u_char *capdata;
+
+ pci_populate_msicap(&msicap, msgnum, nextptr);
+
+ /*
+ * XXX
+ * Copy the msi capability structure in the last 16 bytes of the
+ * config space. This is wrong because it could shadow something
+ * useful to the device.
+ */
+ capoff = 256 - roundup(sizeof(msicap), 4);
+ capdata = (u_char *)&msicap;
+ for (i = 0; i < sizeof(msicap); i++)
+ pci_set_cfgdata8(pi, capoff + i, capdata[i]);
+
+ return (capoff);
+}
+#endif /* LEGACY_SUPPORT */
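+
+/*
+ * With sizeof(struct msicap) == 14, the capability above lands at
+ * config offset 256 - roundup(14, 4) = 0xf0.
+ */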
+
+static int
+cfginitmsi(struct passthru_softc *sc)
+{
+ int ptr, capptr, cap, sts, caplen;
+ uint32_t u32;
+ struct pcisel sel;
+ struct pci_devinst *pi;
+ struct msixcap msixcap;
+ uint32_t *msixcap_ptr;
+
+ pi = sc->psc_pi;
+ sel = sc->psc_sel;
+
+ /*
+ * Parse the capabilities and cache the location of the MSI
+ * and MSI-X capabilities.
+ */
+ sts = read_config(&sel, PCIR_STATUS, 2);
+ if (sts & PCIM_STATUS_CAPPRESENT) {
+ ptr = read_config(&sel, PCIR_CAP_PTR, 1);
+ while (ptr != 0 && ptr != 0xff) {
+ cap = read_config(&sel, ptr + PCICAP_ID, 1);
+ if (cap == PCIY_MSI) {
+ /*
+ * Copy the MSI capability into the config
+ * space of the emulated pci device
+ */
+ sc->psc_msi.capoff = ptr;
+ sc->psc_msi.msgctrl = read_config(&sel,
+ ptr + 2, 2);
+ sc->psc_msi.emulated = 0;
+ caplen = msi_caplen(sc->psc_msi.msgctrl);
+ capptr = ptr;
+ while (caplen > 0) {
+ u32 = read_config(&sel, capptr, 4);
+ pci_set_cfgdata32(pi, capptr, u32);
+ caplen -= 4;
+ capptr += 4;
+ }
+ } else if (cap == PCIY_MSIX) {
+ /*
+ * Copy the MSI-X capability
+ */
+ sc->psc_msix.capoff = ptr;
+				caplen = MSIX_CAPLEN;
+ msixcap_ptr = (uint32_t*) &msixcap;
+ capptr = ptr;
+ while (caplen > 0) {
+ u32 = read_config(&sel, capptr, 4);
+ *msixcap_ptr = u32;
+ pci_set_cfgdata32(pi, capptr, u32);
+ caplen -= 4;
+ capptr += 4;
+ msixcap_ptr++;
+ }
+ }
+ ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
+ }
+ }
+
+ if (sc->psc_msix.capoff != 0) {
+ pi->pi_msix.pba_bar =
+ msixcap.pba_offset & MSIX_TABLE_BIR_MASK;
+ pi->pi_msix.pba_offset =
+ msixcap.pba_offset & MSIX_TABLE_OFFSET_MASK;
+ pi->pi_msix.table_bar =
+ msixcap.table_offset & MSIX_TABLE_BIR_MASK;
+ pi->pi_msix.table_offset =
+ msixcap.table_offset & MSIX_TABLE_OFFSET_MASK;
+ pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
+ }
+
+#ifdef LEGACY_SUPPORT
+ /*
+ * If the passthrough device does not support MSI then craft a
+ * MSI capability for it. We link the new MSI capability at the
+ * head of the list of capabilities.
+ */
+ if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
+ int origptr, msiptr;
+ origptr = read_config(&sel, PCIR_CAP_PTR, 1);
+ msiptr = passthru_add_msicap(pi, 1, origptr);
+ sc->psc_msi.capoff = msiptr;
+ sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
+ sc->psc_msi.emulated = 1;
+ pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
+ }
+#endif
+
+ /* Make sure one of the capabilities is present */
+ if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
+ return (-1);
+ else
+ return (0);
+}
+
+static uint64_t
+msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
+{
+ struct pci_devinst *pi;
+ struct msix_table_entry *entry;
+ uint8_t *src8;
+ uint16_t *src16;
+ uint32_t *src32;
+ uint64_t *src64;
+ uint64_t data;
+ size_t entry_offset;
+ int index;
+
+ pi = sc->psc_pi;
+ entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+ index = offset / MSIX_TABLE_ENTRY_SIZE;
+ entry = &pi->pi_msix.table[index];
+
+ switch(size) {
+ case 1:
+ src8 = (uint8_t *)((void *)entry + entry_offset);
+ data = *src8;
+ break;
+ case 2:
+ src16 = (uint16_t *)((void *)entry + entry_offset);
+ data = *src16;
+ break;
+ case 4:
+ src32 = (uint32_t *)((void *)entry + entry_offset);
+ data = *src32;
+ break;
+ case 8:
+ src64 = (uint64_t *)((void *)entry + entry_offset);
+ data = *src64;
+ break;
+ default:
+ return (-1);
+ }
+
+ return (data);
+}
+
+static void
+msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
+ uint64_t offset, int size, uint64_t data)
+{
+ struct pci_devinst *pi;
+ struct msix_table_entry *entry;
+ uint32_t *dest;
+ size_t entry_offset;
+ uint32_t vector_control;
+ int error, index;
+
+ pi = sc->psc_pi;
+ entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+ index = offset / MSIX_TABLE_ENTRY_SIZE;
+ entry = &pi->pi_msix.table[index];
+
+ /* Only 4 byte naturally-aligned writes are supported */
+ assert(size == 4);
+ assert(entry_offset % 4 == 0);
+
+ vector_control = entry->vector_control;
+ dest = (uint32_t *)((void *)entry + entry_offset);
+ *dest = data;
+ /* If MSI-X hasn't been enabled, do nothing */
+ if (pi->pi_msix.enabled) {
+ /* If the entry is masked, don't set it up */
+ if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
+ (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+ error = vm_setup_msix(ctx, vcpu, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev,
+ sc->psc_sel.pc_func,
+ index, entry->msg_data,
+ entry->vector_control,
+ entry->addr);
+ }
+ }
+}
+
+static int
+init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
+{
+ int idx;
+ size_t table_size;
+ vm_paddr_t start;
+ size_t len;
+ struct pci_devinst *pi = sc->psc_pi;
+
+ /*
+ * If the MSI-X table BAR maps memory intended for
+ * other uses, it is at least assured that the table
+ * either resides in its own page within the region,
+ * or it resides in a page shared with only the PBA.
+ */
+ if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar &&
+ ((pi->pi_msix.pba_offset - pi->pi_msix.table_offset) < 4096)) {
+ /* Need to also emulate the PBA, not supported yet */
+ printf("Unsupported MSI-X table and PBA in same page\n");
+ return (-1);
+ }
+
+ /*
+ * May need to split the BAR into 3 regions:
+ * Before the MSI-X table, the MSI-X table, and after it
+ * XXX for now, assume that the table is not in the middle
+ */
+ table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
+ pi->pi_msix.table_size = table_size;
+ idx = pi->pi_msix.table_bar;
+
+ /* Round up to page size */
+	table_size = (table_size + 0xFFF) & ~0xFFF;
+ if (pi->pi_msix.table_offset == 0) {
+ /* Map everything after the MSI-X table */
+ start = pi->pi_bar[idx].addr + table_size;
+ len = pi->pi_bar[idx].size - table_size;
+ } else {
+ /* Map everything before the MSI-X table */
+ start = pi->pi_bar[idx].addr;
+ len = pi->pi_msix.table_offset;
+ }
+ return (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
+ start, len, base + table_size));
+}
+
+static int
+cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
+{
+ int i, error;
+ struct pci_devinst *pi;
+ struct pci_bar_io bar;
+ enum pcibar_type bartype;
+ uint64_t base;
+
+ pi = sc->psc_pi;
+
+ /*
+ * Initialize BAR registers
+ */
+ for (i = 0; i <= PCI_BARMAX; i++) {
+ bzero(&bar, sizeof(bar));
+ bar.pbi_sel = sc->psc_sel;
+ bar.pbi_reg = PCIR_BAR(i);
+
+ if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
+ continue;
+
+ if (PCI_BAR_IO(bar.pbi_base)) {
+ bartype = PCIBAR_IO;
+ base = bar.pbi_base & PCIM_BAR_IO_BASE;
+ } else {
+ switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
+ case PCIM_BAR_MEM_64:
+ bartype = PCIBAR_MEM64;
+ break;
+ default:
+ bartype = PCIBAR_MEM32;
+ break;
+ }
+ base = bar.pbi_base & PCIM_BAR_MEM_BASE;
+ }
+
+ /* Cache information about the "real" BAR */
+ sc->psc_bar[i].type = bartype;
+ sc->psc_bar[i].size = bar.pbi_length;
+ sc->psc_bar[i].addr = base;
+
+ /* Allocate the BAR in the guest I/O or MMIO space */
+ error = pci_emul_alloc_pbar(pi, i, base, bartype,
+ bar.pbi_length);
+ if (error)
+ return (-1);
+
+ /* The MSI-X table needs special handling */
+ if (i == pi->pi_msix.table_bar) {
+ error = init_msix_table(ctx, sc, base);
+ if (error)
+ return (-1);
+ } else if (bartype != PCIBAR_IO) {
+ /* Map the physical MMIO space in the guest MMIO space */
+ error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
+ pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
+ if (error)
+ return (-1);
+ }
+
+ /*
+		 * A 64-bit BAR takes up two BAR slots, so skip the next one.
+ */
+ if (bartype == PCIBAR_MEM64) {
+ i++;
+ assert(i <= PCI_BARMAX);
+ sc->psc_bar[i].type = PCIBAR_MEMHI64;
+ }
+ }
+ return (0);
+}
+
+static int
+cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
+{
+ int error;
+ struct passthru_softc *sc;
+
+ error = 1;
+ sc = pi->pi_arg;
+
+ bzero(&sc->psc_sel, sizeof(struct pcisel));
+ sc->psc_sel.pc_bus = bus;
+ sc->psc_sel.pc_dev = slot;
+ sc->psc_sel.pc_func = func;
+
+ if (cfginitmsi(sc) != 0)
+ goto done;
+
+ if (cfginitbar(ctx, sc) != 0)
+ goto done;
+
+ error = 0; /* success */
+done:
+ return (error);
+}
+
+static int
+passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ int bus, slot, func, error;
+ struct passthru_softc *sc;
+
+ sc = NULL;
+ error = 1;
+
+ if (pcifd < 0) {
+ pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
+ if (pcifd < 0)
+ goto done;
+ }
+
+ if (iofd < 0) {
+ iofd = open(_PATH_DEVIO, O_RDWR, 0);
+ if (iofd < 0)
+ goto done;
+ }
+
+ if (opts == NULL ||
+ sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3)
+ goto done;
+
+ if (vm_assign_pptdev(ctx, bus, slot, func) != 0)
+ goto done;
+
+ sc = malloc(sizeof(struct passthru_softc));
+ memset(sc, 0, sizeof(struct passthru_softc));
+
+ pi->pi_arg = sc;
+ sc->psc_pi = pi;
+
+ /* initialize config space */
+ if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
+ goto done;
+
+ error = 0; /* success */
+done:
+ if (error) {
+ free(sc);
+ vm_unassign_pptdev(ctx, bus, slot, func);
+ }
+ return (error);
+}
+
+static int
+bar_access(int coff)
+{
+ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
+ return (1);
+ else
+ return (0);
+}
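+
+/*
+ * With PCI_BARMAX == 5 this covers config offsets 0x10 through 0x27,
+ * i.e. the six BAR registers of a type 0 header.
+ */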
+
+static int
+msicap_access(struct passthru_softc *sc, int coff)
+{
+ int caplen;
+
+ if (sc->psc_msi.capoff == 0)
+ return (0);
+
+ caplen = msi_caplen(sc->psc_msi.msgctrl);
+
+ if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
+ return (1);
+ else
+ return (0);
+}
+
+static int
+msixcap_access(struct passthru_softc *sc, int coff)
+{
+ if (sc->psc_msix.capoff == 0)
+ return (0);
+
+ return (coff >= sc->psc_msix.capoff &&
+ coff < sc->psc_msix.capoff + MSIX_CAPLEN);
+}
+
+static int
+passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int coff, int bytes, uint32_t *rv)
+{
+ struct passthru_softc *sc;
+
+ sc = pi->pi_arg;
+
+ /*
+	 * PCI BARs and the MSI capability are emulated.
+ */
+ if (bar_access(coff) || msicap_access(sc, coff))
+ return (-1);
+
+#ifdef LEGACY_SUPPORT
+ /*
+ * Emulate PCIR_CAP_PTR if this device does not support MSI capability
+ * natively.
+ */
+ if (sc->psc_msi.emulated) {
+ if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
+ return (-1);
+ }
+#endif
+
+ /* Everything else just read from the device's config space */
+ *rv = read_config(&sc->psc_sel, coff, bytes);
+
+ return (0);
+}
+
+static int
+passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int coff, int bytes, uint32_t val)
+{
+ int error, msix_table_entries, i;
+ struct passthru_softc *sc;
+
+ sc = pi->pi_arg;
+
+ /*
+ * PCI BARs are emulated
+ */
+ if (bar_access(coff))
+ return (-1);
+
+ /*
+ * MSI capability is emulated
+ */
+ if (msicap_access(sc, coff)) {
+ msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
+
+ error = vm_setup_msi(ctx, vcpu, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.cpu,
+ pi->pi_msi.vector, pi->pi_msi.msgnum);
+ if (error != 0) {
+ printf("vm_setup_msi returned error %d\r\n", errno);
+ exit(1);
+ }
+ return (0);
+ }
+
+ if (msixcap_access(sc, coff)) {
+ msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
+ if (pi->pi_msix.enabled) {
+ msix_table_entries = pi->pi_msix.table_count;
+ for (i = 0; i < msix_table_entries; i++) {
+ error = vm_setup_msix(ctx, vcpu, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev,
+ sc->psc_sel.pc_func, i,
+ pi->pi_msix.table[i].msg_data,
+ pi->pi_msix.table[i].vector_control,
+ pi->pi_msix.table[i].addr);
+
+ if (error) {
+ printf("vm_setup_msix returned error %d\r\n", errno);
+ exit(1);
+ }
+ }
+ }
+ return (0);
+ }
+
+#ifdef LEGACY_SUPPORT
+ /*
+ * If this device does not support MSI natively then we cannot let
+ * the guest disable legacy interrupts from the device. It is the
+ * legacy interrupt that is triggering the virtual MSI to the guest.
+ */
+ if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
+ if (coff == PCIR_COMMAND && bytes == 2)
+ val &= ~PCIM_CMD_INTxDIS;
+ }
+#endif
+
+ write_config(&sc->psc_sel, coff, bytes, val);
+
+ return (0);
+}
+
+static void
+passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size, uint64_t value)
+{
+ struct passthru_softc *sc;
+ struct iodev_pio_req pio;
+
+ sc = pi->pi_arg;
+
+ if (pi->pi_msix.table_bar == baridx) {
+ msix_table_write(ctx, vcpu, sc, offset, size, value);
+ } else {
+ assert(pi->pi_bar[baridx].type == PCIBAR_IO);
+ bzero(&pio, sizeof(struct iodev_pio_req));
+ pio.access = IODEV_PIO_WRITE;
+ pio.port = sc->psc_bar[baridx].addr + offset;
+ pio.width = size;
+ pio.val = value;
+
+ (void)ioctl(iofd, IODEV_PIO, &pio);
+ }
+}
+
+static uint64_t
+passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size)
+{
+ struct passthru_softc *sc;
+ struct iodev_pio_req pio;
+ uint64_t val;
+
+ sc = pi->pi_arg;
+
+ if (pi->pi_msix.table_bar == baridx) {
+ val = msix_table_read(sc, offset, size);
+ } else {
+ assert(pi->pi_bar[baridx].type == PCIBAR_IO);
+ bzero(&pio, sizeof(struct iodev_pio_req));
+ pio.access = IODEV_PIO_READ;
+ pio.port = sc->psc_bar[baridx].addr + offset;
+ pio.width = size;
+ pio.val = 0;
+
+ (void)ioctl(iofd, IODEV_PIO, &pio);
+
+ val = pio.val;
+ }
+
+ return (val);
+}
+
+struct pci_devemu passthru = {
+ .pe_emu = "passthru",
+ .pe_init = passthru_init,
+ .pe_cfgwrite = passthru_cfgwrite,
+ .pe_cfgread = passthru_cfgread,
+ .pe_barwrite = passthru_write,
+ .pe_barread = passthru_read,
+};
+PCI_EMUL_SET(passthru);
diff --git a/usr.sbin/bhyve/pci_uart.c b/usr.sbin/bhyve/pci_uart.c
new file mode 100644
index 0000000..dd30551
--- /dev/null
+++ b/usr.sbin/bhyve/pci_uart.c
@@ -0,0 +1,626 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/select.h>
+#include <dev/ic/ns16550.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <termios.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "mevent.h"
+
+#define COM1_BASE 0x3F8
+#define COM1_IRQ 4
+#define COM2_BASE 0x2F8
+#define COM2_IRQ 3
+
+#define DEFAULT_RCLK 1843200
+#define DEFAULT_BAUD 9600
+
+#define FCR_RX_MASK 0xC0
+
+#define MCR_OUT1 0x04
+#define MCR_OUT2 0x08
+
+#define MSR_DELTA_MASK 0x0f
+
+#ifndef REG_SCR
+#define REG_SCR com_scr
+#endif
+
+#define FIFOSZ 16
+
+/*
+ * Pick the PCI vid/did of a chip with a single uart at BAR0
+ * that most versions of FreeBSD can understand: the Siig
+ * CyberSerial 1-port.
+ */
+#define COM_VENDOR 0x131f
+#define COM_DEV 0x2000
+
+static int pci_uart_stdio; /* stdio in use for i/o */
+
+static int pci_uart_nldevs; /* number of legacy devices - 2 max */
+
+static struct {
+ uint64_t baddr;
+ int vector;
+} pci_uart_lres[] = {
+ { COM1_BASE, COM1_IRQ},
+ { COM2_BASE, COM2_IRQ},
+ { 0, 0 }
+};
+
+struct fifo {
+ uint8_t buf[FIFOSZ];
+ int rindex; /* index to read from */
+ int windex; /* index to write to */
+ int num; /* number of characters in the fifo */
+ int size; /* size of the fifo */
+};
+
+struct pci_uart_softc {
+ struct pci_devinst *pi;
+ pthread_mutex_t mtx; /* protects all softc elements */
+ uint8_t data; /* Data register (R/W) */
+ uint8_t ier; /* Interrupt enable register (R/W) */
+ uint8_t lcr; /* Line control register (R/W) */
+ uint8_t mcr; /* Modem control register (R/W) */
+ uint8_t lsr; /* Line status register (R/W) */
+ uint8_t msr; /* Modem status register (R/W) */
+ uint8_t fcr; /* FIFO control register (W) */
+ uint8_t scr; /* Scratch register (R/W) */
+
+ uint8_t dll; /* Baudrate divisor latch LSB */
+ uint8_t dlh; /* Baudrate divisor latch MSB */
+
+ struct fifo rxfifo;
+
+ int opened;
+ int stdio;
+ bool thre_int_pending; /* THRE interrupt pending */
+};
+
+static void pci_uart_drain(int fd, enum ev_type ev, void *arg);
+
+static struct termios tio_orig, tio_new; /* I/O Terminals */
+
+static void
+ttyclose(void)
+{
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
+}
+
+static void
+ttyopen(void)
+{
+ tcgetattr(STDIN_FILENO, &tio_orig);
+
+ cfmakeraw(&tio_new);
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
+
+ atexit(ttyclose);
+}
+
+static bool
+tty_char_available(void)
+{
+ fd_set rfds;
+ struct timeval tv;
+
+ FD_ZERO(&rfds);
+ FD_SET(STDIN_FILENO, &rfds);
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+	if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
+ return (true);
+ } else {
+ return (false);
+ }
+}
+
+static int
+ttyread(void)
+{
+ char rb;
+
+ if (tty_char_available()) {
+ read(STDIN_FILENO, &rb, 1);
+ return (rb & 0xff);
+ } else {
+ return (-1);
+ }
+}
+
+static void
+ttywrite(unsigned char wb)
+{
+ (void) write(STDIN_FILENO, &wb, 1);
+}
+
+static void
+fifo_reset(struct fifo *fifo, int size)
+{
+ bzero(fifo, sizeof(struct fifo));
+ fifo->size = size;
+}
+
+static int
+fifo_putchar(struct fifo *fifo, uint8_t ch)
+{
+
+ if (fifo->num < fifo->size) {
+ fifo->buf[fifo->windex] = ch;
+ fifo->windex = (fifo->windex + 1) % fifo->size;
+ fifo->num++;
+ return (0);
+ } else
+ return (-1);
+}
+
+static int
+fifo_getchar(struct fifo *fifo)
+{
+ int c;
+
+ if (fifo->num > 0) {
+ c = fifo->buf[fifo->rindex];
+ fifo->rindex = (fifo->rindex + 1) % fifo->size;
+ fifo->num--;
+ return (c);
+ } else
+ return (-1);
+}
+
+static int
+fifo_numchars(struct fifo *fifo)
+{
+
+ return (fifo->num);
+}
+
+static int
+fifo_available(struct fifo *fifo)
+{
+
+ return (fifo->num < fifo->size);
+}
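+
+/*
+ * The fifo is a ring buffer: e.g. after three putchars into an empty
+ * 16-entry fifo, rindex == 0, windex == 3 and num == 3; a getchar
+ * then advances rindex to 1 and drops num to 2.
+ */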
+
+static void
+pci_uart_opentty(struct pci_uart_softc *sc)
+{
+ struct mevent *mev;
+
+ assert(sc->opened == 0);
+ assert(sc->stdio);
+
+ ttyopen();
+ mev = mevent_add(STDIN_FILENO, EVF_READ, pci_uart_drain, sc);
+ assert(mev);
+}
+
+static void
+pci_uart_legacy_res(uint64_t *bar, int *ivec)
+{
+ if (pci_uart_lres[pci_uart_nldevs].baddr != 0) {
+ *bar = pci_uart_lres[pci_uart_nldevs].baddr;
+ *ivec = pci_uart_lres[pci_uart_nldevs].vector;
+ pci_uart_nldevs++;
+ } else {
+ /* TODO: print warning ? */
+ *bar = 0;
+		*ivec = -1;
+ }
+}
+
+/*
+ * The IIR returns a prioritized interrupt reason:
+ * - receive data available
+ * - transmit holding register empty
+ * - modem status change
+ *
+ * Return an interrupt reason if one is available.
+ */
+static int
+pci_uart_intr_reason(struct pci_uart_softc *sc)
+{
+
+ if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0)
+ return (IIR_RLS);
+ else if (fifo_numchars(&sc->rxfifo) > 0 && (sc->ier & IER_ERXRDY) != 0)
+ return (IIR_RXTOUT);
+ else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0)
+ return (IIR_TXRDY);
+ else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0)
+ return (IIR_MLSC);
+ else
+ return (IIR_NOPEND);
+}
+
+static void
+pci_uart_reset(struct pci_uart_softc *sc)
+{
+ uint16_t divisor;
+
+ divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16;
+ sc->dll = divisor;
+	sc->dlh = divisor >> 8;
+
+ fifo_reset(&sc->rxfifo, 1); /* no fifo until enabled by software */
+}
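+
+/*
+ * With the defaults above, divisor = 1843200 / 9600 / 16 = 12, so
+ * the reset state of the divisor latch is dll = 12, dlh = 0.
+ */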
+
+/*
+ * Toggle the COM port's intr pin depending on whether or not we have an
+ * interrupt condition to report to the processor.
+ */
+static void
+pci_uart_toggle_intr(struct pci_uart_softc *sc)
+{
+ uint8_t intr_reason;
+
+ intr_reason = pci_uart_intr_reason(sc);
+
+ if (intr_reason == IIR_NOPEND)
+ pci_lintr_deassert(sc->pi);
+ else
+ pci_lintr_assert(sc->pi);
+}
+
+static void
+pci_uart_drain(int fd, enum ev_type ev, void *arg)
+{
+ struct pci_uart_softc *sc;
+ int ch;
+
+ sc = arg;
+
+ assert(fd == STDIN_FILENO);
+ assert(ev == EVF_READ);
+
+ /*
+ * This routine is called in the context of the mevent thread
+ * to take out the softc lock to protect against concurrent
+ * access from a vCPU i/o exit
+ */
+ pthread_mutex_lock(&sc->mtx);
+
+ if ((sc->mcr & MCR_LOOPBACK) != 0) {
+ (void) ttyread();
+ } else {
+ while (fifo_available(&sc->rxfifo) &&
+ ((ch = ttyread()) != -1)) {
+ fifo_putchar(&sc->rxfifo, ch);
+ }
+ pci_uart_toggle_intr(sc);
+ }
+
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+static void
+pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+ struct pci_uart_softc *sc;
+ int fifosz;
+ uint8_t msr;
+
+ sc = pi->pi_arg;
+
+ assert(baridx == 0);
+ assert(size == 1);
+
+ /* Open terminal */
+ if (!sc->opened && sc->stdio) {
+ pci_uart_opentty(sc);
+ sc->opened = 1;
+ }
+
+ pthread_mutex_lock(&sc->mtx);
+
+ /*
+ * Take care of the special case DLAB accesses first
+ */
+ if ((sc->lcr & LCR_DLAB) != 0) {
+ if (offset == REG_DLL) {
+ sc->dll = value;
+ goto done;
+ }
+
+ if (offset == REG_DLH) {
+ sc->dlh = value;
+ goto done;
+ }
+ }
+
+ switch (offset) {
+ case REG_DATA:
+ if (sc->mcr & MCR_LOOPBACK) {
+ if (fifo_putchar(&sc->rxfifo, value) != 0)
+ sc->lsr |= LSR_OE;
+ } else if (sc->stdio) {
+ ttywrite(value);
+ } /* else drop on floor */
+ sc->thre_int_pending = true;
+ break;
+ case REG_IER:
+ /*
+		 * Only bits 0-3 of the IER are writable; mask off
+		 * bits 4-7.
+ */
+ sc->ier = value & 0x0F;
+ break;
+ case REG_FCR:
+ /*
+		 * When switching between FIFO and 16450 mode,
+ * the FIFO contents are reset.
+ */
+ if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) {
+ fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1;
+ fifo_reset(&sc->rxfifo, fifosz);
+ }
+
+ /*
+ * The FCR_ENABLE bit must be '1' for the programming
+ * of other FCR bits to be effective.
+ */
+ if ((value & FCR_ENABLE) == 0) {
+ sc->fcr = 0;
+ } else {
+ if ((value & FCR_RCV_RST) != 0)
+ fifo_reset(&sc->rxfifo, FIFOSZ);
+
+ sc->fcr = value &
+ (FCR_ENABLE | FCR_DMA | FCR_RX_MASK);
+ }
+ break;
+ case REG_LCR:
+ sc->lcr = value;
+ break;
+ case REG_MCR:
+ /* Apply mask so that bits 5-7 are 0 */
+ sc->mcr = value & 0x1F;
+
+ msr = 0;
+ if (sc->mcr & MCR_LOOPBACK) {
+ /*
+ * In the loopback mode certain bits from the
+ * MCR are reflected back into MSR
+ */
+ if (sc->mcr & MCR_RTS)
+ msr |= MSR_CTS;
+ if (sc->mcr & MCR_DTR)
+ msr |= MSR_DSR;
+ if (sc->mcr & MCR_OUT1)
+ msr |= MSR_RI;
+ if (sc->mcr & MCR_OUT2)
+ msr |= MSR_DCD;
+ }
+
+ /*
+ * Detect if there has been any change between the
+ * previous and the new value of MSR. If there is
+ * then assert the appropriate MSR delta bit.
+ */
+ if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS))
+ sc->msr |= MSR_DCTS;
+ if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR))
+ sc->msr |= MSR_DDSR;
+ if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD))
+ sc->msr |= MSR_DDCD;
+ if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0)
+ sc->msr |= MSR_TERI;
+
+ /*
+ * Update the value of MSR while retaining the delta
+ * bits.
+ */
+ sc->msr &= MSR_DELTA_MASK;
+ sc->msr |= msr;
+ break;
+ case REG_LSR:
+ /*
+ * Line status register is not meant to be written to
+ * during normal operation.
+ */
+ break;
+ case REG_MSR:
+ /*
+		 * The MSR is a read-only register; ignore writes to it.
+ */
+ break;
+ case REG_SCR:
+ sc->scr = value;
+ break;
+ default:
+ break;
+ }
+
+done:
+ pci_uart_toggle_intr(sc);
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+uint64_t
+pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ struct pci_uart_softc *sc;
+ uint8_t iir, intr_reason;
+ uint64_t reg;
+
+ sc = pi->pi_arg;
+
+ assert(baridx == 0);
+ assert(size == 1);
+
+ /* Open terminal */
+ if (!sc->opened && sc->stdio) {
+ pci_uart_opentty(sc);
+ sc->opened = 1;
+ }
+
+ pthread_mutex_lock(&sc->mtx);
+
+ /*
+ * Take care of the special case DLAB accesses first
+ */
+ if ((sc->lcr & LCR_DLAB) != 0) {
+ if (offset == REG_DLL) {
+ reg = sc->dll;
+ goto done;
+ }
+
+ if (offset == REG_DLH) {
+ reg = sc->dlh;
+ goto done;
+ }
+ }
+
+ switch (offset) {
+ case REG_DATA:
+ reg = fifo_getchar(&sc->rxfifo);
+ break;
+ case REG_IER:
+ reg = sc->ier;
+ break;
+ case REG_IIR:
+ iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0;
+
+ intr_reason = pci_uart_intr_reason(sc);
+
+ /*
+ * Deal with side effects of reading the IIR register
+ */
+ if (intr_reason == IIR_TXRDY)
+ sc->thre_int_pending = false;
+
+ iir |= intr_reason;
+
+ reg = iir;
+ break;
+ case REG_LCR:
+ reg = sc->lcr;
+ break;
+ case REG_MCR:
+ reg = sc->mcr;
+ break;
+ case REG_LSR:
+ /* Transmitter is always ready for more data */
+ sc->lsr |= LSR_TEMT | LSR_THRE;
+
+ /* Check for new receive data */
+ if (fifo_numchars(&sc->rxfifo) > 0)
+ sc->lsr |= LSR_RXRDY;
+ else
+ sc->lsr &= ~LSR_RXRDY;
+
+ reg = sc->lsr;
+
+ /* The LSR_OE bit is cleared on LSR read */
+ sc->lsr &= ~LSR_OE;
+ break;
+ case REG_MSR:
+ /*
+ * MSR delta bits are cleared on read
+ */
+ reg = sc->msr;
+ sc->msr &= ~MSR_DELTA_MASK;
+ break;
+ case REG_SCR:
+ reg = sc->scr;
+ break;
+ default:
+ reg = 0xFF;
+ break;
+ }
+
+done:
+ pci_uart_toggle_intr(sc);
+ pthread_mutex_unlock(&sc->mtx);
+
+ return (reg);
+}
+
+static int
+pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ struct pci_uart_softc *sc;
+ uint64_t bar;
+ int ivec;
+
+ sc = malloc(sizeof(struct pci_uart_softc));
+ memset(sc, 0, sizeof(struct pci_uart_softc));
+
+ pi->pi_arg = sc;
+ sc->pi = pi;
+
+ pthread_mutex_init(&sc->mtx, NULL);
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM);
+ if (pci_is_legacy(pi)) {
+ pci_uart_legacy_res(&bar, &ivec);
+ pci_emul_alloc_pbar(pi, 0, bar, PCIBAR_IO, 8);
+ } else {
+ ivec = -1;
+ pci_emul_alloc_bar(pi, 0, PCIBAR_IO, 8);
+ }
+ pci_lintr_request(pi, ivec);
+
+ if (opts != NULL && !strcmp("stdio", opts) && !pci_uart_stdio) {
+ pci_uart_stdio = 1;
+ sc->stdio = 1;
+ }
+
+ pci_uart_reset(sc);
+
+ return (0);
+}
+
+struct pci_devemu pci_de_com = {
+ .pe_emu = "uart",
+ .pe_init = pci_uart_init,
+ .pe_barwrite = pci_uart_write,
+ .pe_barread = pci_uart_read
+};
+PCI_EMUL_SET(pci_de_com);
diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c
new file mode 100644
index 0000000..3382097
--- /dev/null
+++ b/usr.sbin/bhyve/pci_virtio_block.c
@@ -0,0 +1,534 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+
+#define VTBLK_RINGSZ 64
+
+#define VTBLK_CFGSZ 28
+
+#define VTBLK_R_CFG VTCFG_R_CFG0
+#define VTBLK_R_CFG_END		(VTBLK_R_CFG + VTBLK_CFGSZ - 1)
+#define VTBLK_R_MAX VTBLK_R_CFG_END
+
+#define VTBLK_REGSZ		(VTBLK_R_MAX + 1)
+
+#define VTBLK_MAXSEGS 32
+
+#define VTBLK_S_OK 0
+#define VTBLK_S_IOERR 1
+
+/*
+ * Host capabilities
+ */
+#define VTBLK_S_HOSTCAPS \
+ ( 0x00000004 | /* host maximum request segments */ \
+ 0x10000000 ) /* supports indirect descriptors */
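+
+/*
+ * These correspond to the standard virtio feature bits
+ * VIRTIO_BLK_F_SEG_MAX (bit 2) and VIRTIO_RING_F_INDIRECT_DESC
+ * (bit 28).
+ */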
+
+struct vring_hqueue {
+ /* Internal state */
+ uint16_t hq_size;
+ uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
+
+ /* Host-context pointers to the queue */
+ struct virtio_desc *hq_dtable;
+ uint16_t *hq_avail_flags;
+ uint16_t *hq_avail_idx; /* monotonically increasing */
+ uint16_t *hq_avail_ring;
+
+ uint16_t *hq_used_flags;
+ uint16_t *hq_used_idx; /* monotonically increasing */
+ struct virtio_used *hq_used_ring;
+};
+
+/*
+ * Config space
+ */
+struct vtblk_config {
+ uint64_t vbc_capacity;
+ uint32_t vbc_size_max;
+ uint32_t vbc_seg_max;
+ uint16_t vbc_geom_c;
+ uint8_t vbc_geom_h;
+ uint8_t vbc_geom_s;
+ uint32_t vbc_blk_size;
+ uint32_t vbc_sectors_max;
+} __packed;
+CTASSERT(sizeof(struct vtblk_config) == VTBLK_CFGSZ);
+
+/*
+ * Fixed-size block header
+ */
+struct virtio_blk_hdr {
+#define VBH_OP_READ 0
+#define VBH_OP_WRITE 1
+ uint32_t vbh_type;
+ uint32_t vbh_ioprio;
+ uint64_t vbh_sector;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtblk_debug;
+#define DPRINTF(params) if (pci_vtblk_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtblk_softc {
+ struct pci_devinst *vbsc_pi;
+ int vbsc_fd;
+ int vbsc_status;
+ int vbsc_isr;
+ int vbsc_lastq;
+ uint32_t vbsc_features;
+ uint64_t vbsc_pfn;
+ struct vring_hqueue vbsc_q;
+ struct vtblk_config vbsc_cfg;
+};
+
+/*
+ * Return the number of available descriptors in the vring taking care
+ * of the 16-bit index wraparound.
+ */
+static int
+hq_num_avail(struct vring_hqueue *hq)
+{
+ int ndesc;
+
+ if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
+ ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
+ else
+ ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;
+
+ assert(ndesc >= 0 && ndesc <= hq->hq_size);
+
+ return (ndesc);
+}
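+
+/*
+ * Wraparound example: with hq_cur_aidx == 65534 and *hq_avail_idx
+ * == 2, ndesc = 65535 - 65534 + 2 + 1 = 4, covering avail indices
+ * 65534, 65535, 0 and 1.
+ */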
+
+static void
+pci_vtblk_update_status(struct pci_vtblk_softc *sc, uint32_t value)
+{
+ if (value == 0) {
+		DPRINTF(("vtblk: device reset requested!\n"));
+ }
+
+ sc->vbsc_status = value;
+}
+
+static void
+pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq)
+{
+ struct iovec iov[VTBLK_MAXSEGS];
+ struct virtio_blk_hdr *vbh;
+ struct virtio_desc *vd, *vid;
+ struct virtio_used *vu;
+ uint8_t *status;
+ int i;
+ int err;
+ int iolen;
+ int nsegs;
+ int uidx, aidx, didx;
+ int writeop;
+ off_t offset;
+
+ uidx = *hq->hq_used_idx;
+ aidx = hq->hq_cur_aidx;
+ didx = hq->hq_avail_ring[aidx % hq->hq_size];
+ assert(didx >= 0 && didx < hq->hq_size);
+
+ vd = &hq->hq_dtable[didx];
+
+ /*
+ * Verify that the descriptor is indirect, and obtain
+ * the pointer to the indirect descriptor.
+ * There has to be space for at least 3 descriptors
+ * in the indirect descriptor array: the block header,
+ * 1 or more data descriptors, and a status byte.
+ */
+ assert(vd->vd_flags & VRING_DESC_F_INDIRECT);
+
+ nsegs = vd->vd_len / sizeof(struct virtio_desc);
+ assert(nsegs >= 3);
+ assert(nsegs < VTBLK_MAXSEGS + 2);
+
+ vid = paddr_guest2host(vd->vd_addr);
+ assert((vid->vd_flags & VRING_DESC_F_INDIRECT) == 0);
+
+ /*
+ * The first descriptor will be the read-only fixed header
+ */
+ vbh = paddr_guest2host(vid[0].vd_addr);
+ assert(vid[0].vd_len == sizeof(struct virtio_blk_hdr));
+ assert(vid[0].vd_flags & VRING_DESC_F_NEXT);
+ assert((vid[0].vd_flags & VRING_DESC_F_WRITE) == 0);
+
+ writeop = (vbh->vbh_type == VBH_OP_WRITE);
+
+ offset = vbh->vbh_sector * DEV_BSIZE;
+
+ /*
+ * Build up the iovec based on the guest's data descriptors
+ */
+ for (i = 1, iolen = 0; i < nsegs - 1; i++) {
+ iov[i-1].iov_base = paddr_guest2host(vid[i].vd_addr);
+ iov[i-1].iov_len = vid[i].vd_len;
+ iolen += vid[i].vd_len;
+
+ assert(vid[i].vd_flags & VRING_DESC_F_NEXT);
+ assert((vid[i].vd_flags & VRING_DESC_F_INDIRECT) == 0);
+
+ /*
+ * - write op implies read-only descriptor,
+ * - read op implies write-only descriptor,
+ * therefore test the inverse of the descriptor bit
+ * to the op.
+ */
+ assert(((vid[i].vd_flags & VRING_DESC_F_WRITE) == 0) ==
+ writeop);
+ }
+
+ /* Lastly, get the address of the status byte */
+ status = paddr_guest2host(vid[nsegs - 1].vd_addr);
+ assert(vid[nsegs - 1].vd_len == 1);
+ assert((vid[nsegs - 1].vd_flags & VRING_DESC_F_NEXT) == 0);
+ assert(vid[nsegs - 1].vd_flags & VRING_DESC_F_WRITE);
+
+ DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r",
+ writeop ? "write" : "read", iolen, nsegs - 2, offset));
+
+ if (writeop){
+ err = pwritev(sc->vbsc_fd, iov, nsegs - 2, offset);
+ } else {
+ err = preadv(sc->vbsc_fd, iov, nsegs - 2, offset);
+ }
+
+ *status = err < 0 ? VTBLK_S_IOERR : VTBLK_S_OK;
+
+ /*
+ * Return the single indirect descriptor back to the host
+ */
+ vu = &hq->hq_used_ring[uidx % hq->hq_size];
+ vu->vu_idx = didx;
+ vu->vu_tlen = 1;
+ hq->hq_cur_aidx++;
+ *hq->hq_used_idx += 1;
+}
+
+static void
+pci_vtblk_qnotify(struct pci_vtblk_softc *sc)
+{
+ struct vring_hqueue *hq = &sc->vbsc_q;
+ int i;
+ int ndescs;
+
+ /*
+ * Calculate number of ring entries to process
+ */
+ ndescs = hq_num_avail(hq);
+
+ if (ndescs == 0)
+ return;
+
+ /*
+	 * Run through all the queued entries, processing each
+	 * block request in turn.
+ */
+ for (i = 0; i < ndescs; i++)
+ pci_vtblk_proc(sc, hq);
+
+ /*
+ * Generate an interrupt if able
+ */
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0 &&
+ sc->vbsc_isr == 0) {
+ sc->vbsc_isr = 1;
+ pci_generate_msi(sc->vbsc_pi, 0);
+ }
+
+}
+
+static void
+pci_vtblk_ring_init(struct pci_vtblk_softc *sc, uint64_t pfn)
+{
+ struct vring_hqueue *hq;
+
+ sc->vbsc_pfn = pfn << VRING_PFN;
+
+ /*
+ * Set up host pointers to the various parts of the
+ * queue
+ */
+ hq = &sc->vbsc_q;
+ hq->hq_size = VTBLK_RINGSZ;
+
+ hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
+ hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
+ hq->hq_avail_idx = hq->hq_avail_flags + 1;
+ hq->hq_avail_ring = hq->hq_avail_flags + 2;
+ hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
+ VRING_ALIGN);
+ hq->hq_used_idx = hq->hq_used_flags + 1;
+ hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
+
+ /*
+ * Initialize queue indexes
+ */
+ hq->hq_cur_aidx = 0;
+}
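+
+/*
+ * Assuming a 16-byte struct virtio_desc, the resulting layout for
+ * VTBLK_RINGSZ == 64 is: a 1024-byte descriptor table at the start
+ * of the guest page, followed by the avail ring (flags, idx and 64
+ * ring entries), with the used ring starting at the next VRING_ALIGN
+ * boundary.
+ */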
+
+static int
+pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ struct stat sbuf;
+ struct pci_vtblk_softc *sc;
+ off_t size;
+ int fd;
+ int sectsz;
+
+ if (opts == NULL) {
+ printf("virtio-block: backing device required\n");
+ return (1);
+ }
+
+ /*
+ * Access to guest memory is required. Fail if
+ * memory not mapped
+ */
+ if (paddr_guest2host(0) == NULL)
+ return (1);
+
+ /*
+ * The supplied backing file has to exist
+ */
+ fd = open(opts, O_RDWR);
+ if (fd < 0) {
+ perror("Could not open backing file");
+ return (1);
+ }
+
+ if (fstat(fd, &sbuf) < 0) {
+ perror("Could not stat backing file");
+ close(fd);
+ return (1);
+ }
+
+ /*
+ * Deal with raw devices
+ */
+ size = sbuf.st_size;
+ sectsz = DEV_BSIZE;
+ if (S_ISCHR(sbuf.st_mode)) {
+ if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
+ ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
+ perror("Could not fetch dev blk/sector size");
+ close(fd);
+ return (1);
+ }
+ assert(size != 0);
+ assert(sectsz != 0);
+ }
+
+ sc = malloc(sizeof(struct pci_vtblk_softc));
+ memset(sc, 0, sizeof(struct pci_vtblk_softc));
+
+ pi->pi_arg = sc;
+ sc->vbsc_pi = pi;
+ sc->vbsc_fd = fd;
+
+ /* setup virtio block config space */
+ sc->vbsc_cfg.vbc_capacity = size / sectsz;
+ sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS;
+ sc->vbsc_cfg.vbc_blk_size = sectsz;
+ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */
+ sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */
+ sc->vbsc_cfg.vbc_geom_h = 0;
+ sc->vbsc_cfg.vbc_geom_s = 0;
+ sc->vbsc_cfg.vbc_sectors_max = 0;
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+ pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
+ pci_emul_add_msicap(pi, 1);
+ pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTBLK_REGSZ);
+
+ return (0);
+}
+
+static void
+pci_vtblk_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+ struct pci_vtblk_softc *sc = pi->pi_arg;
+
+ assert(baridx == 0);
+
+ if (offset + size > VTBLK_REGSZ) {
+ DPRINTF(("vtblk_write: 2big, offset %ld size %d\n",
+ offset, size));
+ return;
+ }
+
+ switch (offset) {
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ sc->vbsc_features = value & VTBLK_S_HOSTCAPS;
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ pci_vtblk_ring_init(sc, value);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ sc->vbsc_lastq = value;
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ assert(value == 0);
+ pci_vtblk_qnotify(sc);
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ pci_vtblk_update_status(sc, value);
+ break;
+ case VTCFG_R_HOSTCAP:
+ case VTCFG_R_QNUM:
+ case VTCFG_R_ISR:
+ case VTBLK_R_CFG ... VTBLK_R_CFG_END:
+ DPRINTF(("vtblk: write to readonly reg %ld\n\r", offset));
+ break;
+ default:
+ DPRINTF(("vtblk: unknown i/o write offset %ld\n\r", offset));
+ value = 0;
+ break;
+ }
+}
+
+uint64_t
+pci_vtblk_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ struct pci_vtblk_softc *sc = pi->pi_arg;
+ void *ptr;
+ uint32_t value;
+
+ assert(baridx == 0);
+
+ if (offset + size > VTBLK_REGSZ) {
+ DPRINTF(("vtblk_read: 2big, offset %ld size %d\n",
+ offset, size));
+ return (0);
+ }
+
+ switch (offset) {
+ case VTCFG_R_HOSTCAP:
+ assert(size == 4);
+ value = VTBLK_S_HOSTCAPS;
+ break;
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ value = sc->vbsc_features; /* XXX never read ? */
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ value = sc->vbsc_pfn >> VRING_PFN;
+ break;
+ case VTCFG_R_QNUM:
+		value = (sc->vbsc_lastq == 0) ? VTBLK_RINGSZ : 0;
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ value = sc->vbsc_lastq; /* XXX never read ? */
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ value = 0; /* XXX never read ? */
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ value = sc->vbsc_status;
+ break;
+ case VTCFG_R_ISR:
+ assert(size == 1);
+ value = sc->vbsc_isr;
+ sc->vbsc_isr = 0; /* a read clears this flag */
+ break;
+ case VTBLK_R_CFG ... VTBLK_R_CFG_END:
+ assert(size + offset <= (VTBLK_R_CFG_END + 1));
+ ptr = (uint8_t *)&sc->vbsc_cfg + offset - VTBLK_R_CFG;
+ if (size == 1) {
+ value = *(uint8_t *) ptr;
+ } else if (size == 2) {
+ value = *(uint16_t *) ptr;
+ } else {
+ value = *(uint32_t *) ptr;
+ }
+ break;
+ default:
+ DPRINTF(("vtblk: unknown i/o read offset %ld\n\r", offset));
+ value = 0;
+ break;
+ }
+
+ return (value);
+}
+
+struct pci_devemu pci_de_vblk = {
+ .pe_emu = "virtio-blk",
+ .pe_init = pci_vtblk_init,
+ .pe_barwrite = pci_vtblk_write,
+ .pe_barread = pci_vtblk_read
+};
+PCI_EMUL_SET(pci_de_vblk);
diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c
new file mode 100644
index 0000000..3f6f88a
--- /dev/null
+++ b/usr.sbin/bhyve/pci_virtio_net.c
@@ -0,0 +1,781 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/select.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <md5.h>
+#include <pthread.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "mevent.h"
+#include "virtio.h"
+
+#define VTNET_RINGSZ 256
+
+#define VTNET_MAXSEGS 32
+
+/*
+ * PCI config-space register offsets
+ */
+#define VTNET_R_CFG0 20
+#define VTNET_R_CFG1 21
+#define VTNET_R_CFG2 22
+#define VTNET_R_CFG3 23
+#define VTNET_R_CFG4 24
+#define VTNET_R_CFG5 25
+#define VTNET_R_CFG6 26
+#define VTNET_R_CFG7 27
+#define VTNET_R_MAX 27
+
+#define VTNET_REGSZ		(VTNET_R_MAX + 1)
+
+/*
+ * Host capabilities
+ */
+#define VTNET_S_HOSTCAPS \
+ ( 0x00000020 | /* host supplies MAC */ \
+ 0x00008000 | /* host can merge Rx buffers */ \
+ 0x00010000 ) /* config status available */
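+
+/*
+ * These correspond to VIRTIO_NET_F_MAC (bit 5), VIRTIO_NET_F_MRG_RXBUF
+ * (bit 15) and VIRTIO_NET_F_STATUS (bit 16).
+ */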
+
+/*
+ * Queue definitions.
+ */
+#define VTNET_RXQ 0
+#define VTNET_TXQ 1
+#define VTNET_CTLQ 2
+
+#define VTNET_MAXQ 3
+
+struct vring_hqueue {
+ /* Internal state */
+ uint16_t hq_size;
+ uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
+
+ /* Host-context pointers to the queue */
+ struct virtio_desc *hq_dtable;
+ uint16_t *hq_avail_flags;
+ uint16_t *hq_avail_idx; /* monotonically increasing */
+ uint16_t *hq_avail_ring;
+
+ uint16_t *hq_used_flags;
+ uint16_t *hq_used_idx; /* monotonically increasing */
+ struct virtio_used *hq_used_ring;
+};
+
+/*
+ * Fixed-size receive header prepended to each packet. The trailing
+ * 'number of buffers' field is present because mergeable rx buffers
+ * are offered in the host capabilities.
+ */
+struct virtio_net_rxhdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+ uint16_t vrh_bufs;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtnet_debug;
+#define DPRINTF(params) if (pci_vtnet_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtnet_softc {
+ struct pci_devinst *vsc_pi;
+ pthread_mutex_t vsc_mtx;
+ struct mevent *vsc_mevp;
+
+ int vsc_curq;
+ int vsc_status;
+ int vsc_isr;
+ int vsc_tapfd;
+ int vsc_rx_ready;
+ int vsc_rxpend;
+
+ uint32_t vsc_features;
+ uint8_t vsc_macaddr[6];
+
+ uint64_t vsc_pfn[VTNET_MAXQ];
+ struct vring_hqueue vsc_hq[VTNET_MAXQ];
+};
+
+/*
+ * Return the number of available descriptors in the vring taking care
+ * of the 16-bit index wraparound.
+ */
+static int
+hq_num_avail(struct vring_hqueue *hq)
+{
+ int ndesc;
+
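+	/*
+	 * e.g. if the guest's avail_idx has wrapped around to 2 while
+	 * hq_cur_aidx is still 65534, the else branch below yields
+	 * 65535 - 65534 + 2 + 1 = 4 available descriptors.
+	 */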
+ if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
+ ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
+ else
+ ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;
+
+ assert(ndesc >= 0 && ndesc <= hq->hq_size);
+
+ return (ndesc);
+}
+
+static uint16_t
+pci_vtnet_qsize(int qnum)
+{
+ /* XXX no ctl queue currently */
+ if (qnum == VTNET_CTLQ) {
+ return (0);
+ }
+
+ /* XXX fixed currently. Maybe different for tx/rx/ctl */
+ return (VTNET_RINGSZ);
+}
+
+static void
+pci_vtnet_ring_reset(struct pci_vtnet_softc *sc, int ring)
+{
+ struct vring_hqueue *hq;
+
+ assert(ring < VTNET_MAXQ);
+
+ hq = &sc->vsc_hq[ring];
+
+ /*
+ * Reset all soft state
+ */
+ hq->hq_cur_aidx = 0;
+}
+
+static void
+pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
+{
+
+ if (value == 0) {
+		DPRINTF(("vtnet: device reset requested!\n\r"));
+ pci_vtnet_ring_reset(sc, VTNET_RXQ);
+ pci_vtnet_ring_reset(sc, VTNET_TXQ);
+ sc->vsc_rx_ready = 0;
+ }
+
+ sc->vsc_status = value;
+}
+
+/*
+ * Called to send a buffer chain out to the tap device
+ */
+static void
+pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
+ int len)
+{
+ char pad[60];
+
+ if (sc->vsc_tapfd == -1)
+ return;
+
+	/*
+	 * If the length is < 60 (the minimum Ethernet frame size,
+	 * not counting the FCS), pad out to that and add the extra
+	 * zero'd segment to the iov. It is guaranteed that there is
+	 * always an extra iov available by the caller.
+	 */
+ if (len < 60) {
+ memset(pad, 0, 60 - len);
+ iov[iovcnt].iov_base = pad;
+ iov[iovcnt].iov_len = 60 - len;
+ iovcnt++;
+ }
+ (void) writev(sc->vsc_tapfd, iov, iovcnt);
+}
+
+/*
+ * Called when there is read activity on the tap file descriptor.
+ * Each buffer posted by the guest is assumed to be able to contain
+ * an entire ethernet frame + rx header.
+ * MP note: the dummybuf is only used for discarding frames, so there
+ * is no need for it to be per-vtnet or locked.
+ */
+static uint8_t dummybuf[2048];
+
+static void
+pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
+{
+ struct virtio_desc *vd;
+ struct virtio_used *vu;
+ struct vring_hqueue *hq;
+ struct virtio_net_rxhdr *vrx;
+ uint8_t *buf;
+ int i;
+ int len;
+ int ndescs;
+ int didx, uidx, aidx; /* descriptor, avail and used index */
+
+ /*
+ * Should never be called without a valid tap fd
+ */
+ assert(sc->vsc_tapfd != -1);
+
+	/*
+	 * But it may be called before the rx ring has
+	 * been set up.
+	 */
+ if (sc->vsc_rx_ready == 0) {
+ /*
+ * Drop the packet and try later.
+ */
+ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ return;
+ }
+
+ /*
+ * Calculate the number of available rx buffers
+ */
+ hq = &sc->vsc_hq[VTNET_RXQ];
+
+ ndescs = hq_num_avail(hq);
+
+ if (ndescs == 0) {
+ /*
+ * Need to wait for host notification to read
+ */
+ if (sc->vsc_rxpend == 0) {
+ WPRINTF(("vtnet: no rx descriptors !\n"));
+ sc->vsc_rxpend = 1;
+ }
+
+ /*
+ * Drop the packet and try later
+ */
+ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ return;
+ }
+
+ aidx = hq->hq_cur_aidx;
+ uidx = *hq->hq_used_idx;
+ for (i = 0; i < ndescs; i++) {
+ /*
+		 * 'aidx' indexes into an array of descriptor indexes
+ */
+ didx = hq->hq_avail_ring[aidx % hq->hq_size];
+ assert(didx >= 0 && didx < hq->hq_size);
+
+ vd = &hq->hq_dtable[didx];
+
+ /*
+ * Get a pointer to the rx header, and use the
+ * data immediately following it for the packet buffer.
+ */
+ vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr);
+ buf = (uint8_t *)(vrx + 1);
+
+ len = read(sc->vsc_tapfd, buf,
+ vd->vd_len - sizeof(struct virtio_net_rxhdr));
+
+ if (len < 0 && errno == EWOULDBLOCK) {
+ break;
+ }
+
+ /*
+ * The only valid field in the rx packet header is the
+ * number of buffers, which is always 1 without TSO
+ * support.
+ */
+ memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
+ vrx->vrh_bufs = 1;
+
+ /*
+ * Write this descriptor into the used ring
+ */
+ vu = &hq->hq_used_ring[uidx % hq->hq_size];
+ vu->vu_idx = didx;
+ vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
+ uidx++;
+ aidx++;
+ }
+
+ /*
+ * Update the used pointer, and signal an interrupt if allowed
+ */
+ *hq->hq_used_idx = uidx;
+ hq->hq_cur_aidx = aidx;
+
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ sc->vsc_isr |= 1;
+ pci_generate_msi(sc->vsc_pi, 0);
+ }
+}
+
+static void
+pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
+{
+ struct pci_vtnet_softc *sc = param;
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+ pci_vtnet_tap_rx(sc);
+ pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+static void
+pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
+{
+ /*
+ * A qnotify means that the rx process can now begin
+ */
+ if (sc->vsc_rx_ready == 0) {
+ sc->vsc_rx_ready = 1;
+ }
+
+ /*
+ * If the rx queue was empty, attempt to receive a
+ * packet that was previously blocked due to no rx bufs
+ * available
+ */
+ if (sc->vsc_rxpend) {
+ WPRINTF(("vtnet: rx resumed\n\r"));
+ sc->vsc_rxpend = 0;
+ pci_vtnet_tap_rx(sc);
+ }
+}
+
+static void
+pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
+{
+ struct iovec iov[VTNET_MAXSEGS + 1];
+ struct virtio_desc *vd;
+ struct virtio_used *vu;
+ int i;
+ int plen;
+ int tlen;
+ int uidx, aidx, didx;
+
+ uidx = *hq->hq_used_idx;
+ aidx = hq->hq_cur_aidx;
+ didx = hq->hq_avail_ring[aidx % hq->hq_size];
+ assert(didx >= 0 && didx < hq->hq_size);
+
+ vd = &hq->hq_dtable[didx];
+
+ /*
+ * Run through the chain of descriptors, ignoring the
+ * first header descriptor. However, include the header
+ * length in the total length that will be put into the
+ * used queue.
+ */
+ tlen = vd->vd_len;
+ vd = &hq->hq_dtable[vd->vd_next];
+
+ for (i = 0, plen = 0;
+ i < VTNET_MAXSEGS;
+ i++, vd = &hq->hq_dtable[vd->vd_next]) {
+ iov[i].iov_base = paddr_guest2host(vd->vd_addr);
+ iov[i].iov_len = vd->vd_len;
+ plen += vd->vd_len;
+ tlen += vd->vd_len;
+
+ if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
+ break;
+ }
+ assert(i < VTNET_MAXSEGS);
+
+ DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
+ pci_vtnet_tap_tx(sc, iov, i + 1, plen);
+
+ /*
+ * Return this chain back to the host
+ */
+ vu = &hq->hq_used_ring[uidx % hq->hq_size];
+ vu->vu_idx = didx;
+ vu->vu_tlen = tlen;
+ hq->hq_cur_aidx = aidx + 1;
+ *hq->hq_used_idx = uidx + 1;
+
+ /*
+ * Generate an interrupt if able
+ */
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ sc->vsc_isr |= 1;
+ pci_generate_msi(sc->vsc_pi, 0);
+ }
+}
+
+static void
+pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
+{
+ struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
+ int i;
+ int ndescs;
+
+ /*
+ * Calculate number of ring entries to process
+ */
+ ndescs = hq_num_avail(hq);
+
+ if (ndescs == 0)
+ return;
+
+ /*
+ * Run through all the entries, placing them into iovecs and
+ * sending when an end-of-packet is found
+ */
+ for (i = 0; i < ndescs; i++)
+ pci_vtnet_proctx(sc, hq);
+}
+
+static void
+pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
+{
+
+ DPRINTF(("vtnet: control qnotify!\n\r"));
+}
+
+static void
+pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
+{
+ struct vring_hqueue *hq;
+ int qnum = sc->vsc_curq;
+
+ assert(qnum < VTNET_MAXQ);
+
+ sc->vsc_pfn[qnum] = pfn << VRING_PFN;
+
+ /*
+ * Set up host pointers to the various parts of the
+ * queue
+ */
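+	/*
+	 * Legacy vring layout: the descriptor table comes first, followed
+	 * by the avail ring (flags, idx, ring[]), with the used ring
+	 * starting at the next VRING_ALIGN boundary. roundup2() below
+	 * aligns the start of the avail ring rather than its end; with
+	 * the fixed 256-entry rings used here both land on the same
+	 * boundary.
+	 */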
+ hq = &sc->vsc_hq[qnum];
+ hq->hq_size = pci_vtnet_qsize(qnum);
+
+ hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
+ hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
+ hq->hq_avail_idx = hq->hq_avail_flags + 1;
+ hq->hq_avail_ring = hq->hq_avail_flags + 2;
+ hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
+ VRING_ALIGN);
+ hq->hq_used_idx = hq->hq_used_flags + 1;
+ hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
+
+ /*
+ * Initialize queue indexes
+ */
+ hq->hq_cur_aidx = 0;
+}
+
+static int
+pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ MD5_CTX mdctx;
+ unsigned char digest[16];
+ char nstr[80];
+ struct pci_vtnet_softc *sc;
+
+ /*
+ * Access to guest memory is required. Fail if
+ * memory not mapped
+ */
+ if (paddr_guest2host(0) == NULL)
+ return (1);
+
+ sc = malloc(sizeof(struct pci_vtnet_softc));
+ memset(sc, 0, sizeof(struct pci_vtnet_softc));
+
+ pi->pi_arg = sc;
+ sc->vsc_pi = pi;
+
+ pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+ /*
+ * Attempt to open the tap device
+ */
+ sc->vsc_tapfd = -1;
+ if (opts != NULL) {
+ char tbuf[80];
+
+ strcpy(tbuf, "/dev/");
+ strlcat(tbuf, opts, sizeof(tbuf));
+
+ sc->vsc_tapfd = open(tbuf, O_RDWR);
+ if (sc->vsc_tapfd == -1) {
+ WPRINTF(("open of tap device %s failed\n", tbuf));
+ } else {
+ /*
+ * Set non-blocking and register for read
+ * notifications with the event loop
+ */
+ int opt = 1;
+ if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
+				WPRINTF(("tap device FIONBIO failed\n"));
+ close(sc->vsc_tapfd);
+ sc->vsc_tapfd = -1;
+ }
+
+ sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
+ EVF_READ,
+ pci_vtnet_tap_callback,
+ sc);
+ if (sc->vsc_mevp == NULL) {
+ WPRINTF(("Could not register event\n"));
+ close(sc->vsc_tapfd);
+ sc->vsc_tapfd = -1;
+ }
+ }
+ }
+
+ /*
+ * The MAC address is the standard NetApp OUI of 00-a0-98,
+ * followed by an MD5 of the vm name. The slot/func number is
+ * prepended to this for slots other than 1:0, so that
+ * a bootloader can netboot from the equivalent of slot 1.
+ */
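+	/*
+	 * e.g. a vm named "vm1" in slot 1:0 hashes the string "vm1",
+	 * while the same vm in slot 2:0 hashes "2-0-vm1", so the two
+	 * devices end up with different MAC addresses.
+	 */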
+ if (pi->pi_slot == 1 && pi->pi_func == 0) {
+		strlcpy(nstr, vmname, sizeof(nstr));
+ } else {
+ snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
+ pi->pi_func, vmname);
+ }
+
+ MD5Init(&mdctx);
+ MD5Update(&mdctx, nstr, strlen(nstr));
+ MD5Final(digest, &mdctx);
+
+ sc->vsc_macaddr[0] = 0x00;
+ sc->vsc_macaddr[1] = 0xa0;
+ sc->vsc_macaddr[2] = 0x98;
+ sc->vsc_macaddr[3] = digest[0];
+ sc->vsc_macaddr[4] = digest[1];
+ sc->vsc_macaddr[5] = digest[2];
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+ pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+ pci_emul_add_msicap(pi, 1);
+ pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);
+
+ return (0);
+}
+
+/*
+ * Function pointer array to handle queue notifications
+ */
+static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
+ pci_vtnet_ping_rxq,
+ pci_vtnet_ping_txq,
+ pci_vtnet_ping_ctlq
+};
+
+static void
+pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+ struct pci_vtnet_softc *sc = pi->pi_arg;
+ void *ptr;
+
+ assert(baridx == 0);
+
+ if (offset + size > VTNET_REGSZ) {
+ DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
+ offset, size));
+ return;
+ }
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+
+ switch (offset) {
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ sc->vsc_features = value & VTNET_S_HOSTCAPS;
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ pci_vtnet_ring_init(sc, value);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ assert(value < VTNET_MAXQ);
+ sc->vsc_curq = value;
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ assert(value < VTNET_MAXQ);
+ (*pci_vtnet_qnotify[value])(sc);
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ pci_vtnet_update_status(sc, value);
+ break;
+ case VTNET_R_CFG0:
+ case VTNET_R_CFG1:
+ case VTNET_R_CFG2:
+ case VTNET_R_CFG3:
+ case VTNET_R_CFG4:
+ case VTNET_R_CFG5:
+ assert((size + offset) <= (VTNET_R_CFG5 + 1));
+ ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
+ /*
+ * The driver is allowed to change the MAC address
+ */
+ if (size == 1) {
+ *(uint8_t *) ptr = value;
+ } else if (size == 2) {
+ *(uint16_t *) ptr = value;
+ } else {
+ *(uint32_t *) ptr = value;
+ }
+ break;
+ case VTCFG_R_HOSTCAP:
+ case VTCFG_R_QNUM:
+ case VTCFG_R_ISR:
+ case VTNET_R_CFG6:
+ case VTNET_R_CFG7:
+ DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset));
+ break;
+ default:
+ DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset));
+ break;
+ }
+
+ pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+uint64_t
+pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ struct pci_vtnet_softc *sc = pi->pi_arg;
+ void *ptr;
+ uint64_t value;
+
+ assert(baridx == 0);
+
+ if (offset + size > VTNET_REGSZ) {
+ DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
+ offset, size));
+ return (0);
+ }
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+
+ switch (offset) {
+ case VTCFG_R_HOSTCAP:
+ assert(size == 4);
+ value = VTNET_S_HOSTCAPS;
+ break;
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ value = sc->vsc_features; /* XXX never read ? */
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
+ break;
+ case VTCFG_R_QNUM:
+ assert(size == 2);
+ value = pci_vtnet_qsize(sc->vsc_curq);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ value = sc->vsc_curq; /* XXX never read ? */
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ value = sc->vsc_curq; /* XXX never read ? */
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ value = sc->vsc_status;
+ break;
+ case VTCFG_R_ISR:
+ assert(size == 1);
+ value = sc->vsc_isr;
+ sc->vsc_isr = 0; /* a read clears this flag */
+ break;
+ case VTNET_R_CFG0:
+ case VTNET_R_CFG1:
+ case VTNET_R_CFG2:
+ case VTNET_R_CFG3:
+ case VTNET_R_CFG4:
+ case VTNET_R_CFG5:
+ assert((size + offset) <= (VTNET_R_CFG5 + 1));
+ ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
+ if (size == 1) {
+ value = *(uint8_t *) ptr;
+ } else if (size == 2) {
+ value = *(uint16_t *) ptr;
+ } else {
+ value = *(uint32_t *) ptr;
+ }
+ break;
+ case VTNET_R_CFG6:
+ assert(size != 4);
+ value = 0x01; /* XXX link always up */
+ break;
+ case VTNET_R_CFG7:
+ assert(size == 1);
+ value = 0; /* XXX link status in LSB */
+ break;
+ default:
+ DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset));
+ value = 0;
+ break;
+ }
+
+ pthread_mutex_unlock(&sc->vsc_mtx);
+
+ return (value);
+}
+
+struct pci_devemu pci_de_vnet = {
+ .pe_emu = "virtio-net",
+ .pe_init = pci_vtnet_init,
+ .pe_barwrite = pci_vtnet_write,
+ .pe_barread = pci_vtnet_read
+};
+PCI_EMUL_SET(pci_de_vnet);
diff --git a/usr.sbin/bhyve/pit_8254.c b/usr.sbin/bhyve/pit_8254.c
new file mode 100644
index 0000000..c96596a
--- /dev/null
+++ b/usr.sbin/bhyve/pit_8254.c
@@ -0,0 +1,198 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/time.h>
+
+#include <machine/clock.h>
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "bhyverun.h"
+#include "inout.h"
+#include "pit_8254.h"
+
+#define TIMER_SEL_MASK 0xc0
+#define TIMER_RW_MASK 0x30
+#define TIMER_MODE_MASK 0x0f
+#define TIMER_SEL_READBACK 0xc0
+
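+/* Divide 'freq' by 'hz', rounding to nearest: TIMER_DIV(1193182, 100) = 11932 */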
+#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz))
+
+#define PIT_8254_FREQ 1193182
+static const int nsecs_per_tick = 1000000000 / PIT_8254_FREQ;
+
+struct counter {
+ struct timeval tv; /* uptime when counter was loaded */
+ uint16_t initial; /* initial counter value */
+ uint8_t cr[2];
+ uint8_t ol[2];
+ int crbyte;
+ int olbyte;
+};
+
+static void
+timevalfix(struct timeval *t1)
+{
+
+ if (t1->tv_usec < 0) {
+ t1->tv_sec--;
+ t1->tv_usec += 1000000;
+ }
+ if (t1->tv_usec >= 1000000) {
+ t1->tv_sec++;
+ t1->tv_usec -= 1000000;
+ }
+}
+
+static void
+timevalsub(struct timeval *t1, const struct timeval *t2)
+{
+
+ t1->tv_sec -= t2->tv_sec;
+ t1->tv_usec -= t2->tv_usec;
+ timevalfix(t1);
+}
+
+static void
+latch(struct counter *c)
+{
+ struct timeval tv2;
+ uint16_t lval;
+ uint64_t delta_nsecs, delta_ticks;
+
+ /* cannot latch a new value until the old one has been consumed */
+ if (c->olbyte != 0)
+ return;
+
+ if (c->initial == 0 || c->initial == 1) {
+ /*
+ * XXX the program that runs the VM can be stopped and
+ * restarted at any time. This means that state that was
+ * created by the guest is destroyed between invocations
+ * of the program.
+ *
+ * If the counter's initial value is not programmed we
+ * assume a value that would be set to generate 'guest_hz'
+ * interrupts per second.
+ */
+ c->initial = TIMER_DIV(PIT_8254_FREQ, guest_hz);
+ gettimeofday(&c->tv, NULL);
+ }
+
+ (void)gettimeofday(&tv2, NULL);
+ timevalsub(&tv2, &c->tv);
+ delta_nsecs = tv2.tv_sec * 1000000000 + tv2.tv_usec * 1000;
+ delta_ticks = delta_nsecs / nsecs_per_tick;
+
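+	/*
+	 * The 8254 counts down from the initial value and reloads when
+	 * it reaches zero (only the rate generator and square wave modes
+	 * are accepted by the i/o handler), so the current count is the
+	 * initial value minus the elapsed ticks modulo the initial value.
+	 */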
+ lval = c->initial - delta_ticks % c->initial;
+ c->olbyte = 2;
+ c->ol[1] = lval; /* LSB */
+ c->ol[0] = lval >> 8; /* MSB */
+}
+
+static int
+pit_8254_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int sel, rw, mode;
+ uint8_t val;
+ struct counter *c;
+
+ static struct counter counter[3];
+
+ if (bytes != 1)
+ return (-1);
+
+ val = *eax;
+
+ if (port == TIMER_MODE) {
+ assert(in == 0);
+ sel = val & TIMER_SEL_MASK;
+ rw = val & TIMER_RW_MASK;
+ mode = val & TIMER_MODE_MASK;
+
+ if (sel == TIMER_SEL_READBACK)
+ return (-1);
+ if (rw != TIMER_LATCH && rw != TIMER_16BIT)
+ return (-1);
+
+ if (rw != TIMER_LATCH) {
+ /*
+ * Counter mode is not affected when issuing a
+ * latch command.
+ */
+ if (mode != TIMER_RATEGEN && mode != TIMER_SQWAVE)
+ return (-1);
+ }
+
+ c = &counter[sel >> 6];
+ if (rw == TIMER_LATCH)
+ latch(c);
+ else
+ c->olbyte = 0; /* reset latch after reprogramming */
+
+ return (0);
+ }
+
+ /* counter ports */
+ assert(port >= TIMER_CNTR0 && port <= TIMER_CNTR2);
+ c = &counter[port - TIMER_CNTR0];
+
+ if (in) {
+ /*
+ * XXX
+ * The spec says that once the output latch is completely
+ * read it should revert to "following" the counter. We don't
+ * do this because it is hard and any reasonable OS should
+ * always latch the counter before trying to read it.
+ */
+ if (c->olbyte == 0)
+ c->olbyte = 2;
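+		/* ol[1] (the LSB) is returned first, then ol[0] (the MSB) */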
+ *eax = c->ol[--c->olbyte];
+ } else {
+ c->cr[c->crbyte++] = *eax;
+ if (c->crbyte == 2) {
+ c->crbyte = 0;
+ c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8;
+ if (c->initial == 0)
+ c->initial = 0xffff;
+ gettimeofday(&c->tv, NULL);
+ }
+ }
+
+ return (0);
+}
+
+INOUT_PORT(8254, TIMER_MODE, IOPORT_F_OUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR0, IOPORT_F_INOUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR1, IOPORT_F_INOUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR2, IOPORT_F_INOUT, pit_8254_handler);
diff --git a/usr.sbin/bhyve/pit_8254.h b/usr.sbin/bhyve/pit_8254.h
new file mode 100644
index 0000000..61bd15d
--- /dev/null
+++ b/usr.sbin/bhyve/pit_8254.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PIT_8254_H_
+#define _PIT_8254_H_
+
+/*
+ * Borrowed from amd64/include/timerreg.h because in that file it is
+ * conditionally compiled for #ifdef _KERNEL only.
+ */
+
+#include <dev/ic/i8253reg.h>
+
+#define IO_TIMER1 0x40 /* 8253 Timer #1 */
+#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0)
+#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1)
+#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2)
+#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE)
+
+#endif /* _PIT_8254_H_ */
diff --git a/usr.sbin/bhyve/pmtmr.c b/usr.sbin/bhyve/pmtmr.c
new file mode 100644
index 0000000..78d14eb
--- /dev/null
+++ b/usr.sbin/bhyve/pmtmr.c
@@ -0,0 +1,108 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+#include <machine/cpufunc.h>
+
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "inout.h"
+
+/*
+ * The ACPI Power Management timer is a free-running 24- or 32-bit
+ * timer with a frequency of 3.579545 MHz.
+ *
+ * This implementation uses the full 32 bits.
+ */
+
+#define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */
+
+#define PMTMR_FREQ 3579545 /* 3.579545MHz */
+
+static pthread_mutex_t pmtmr_mtx;
+static uint64_t pmtmr_tscf;
+static uint64_t pmtmr_old;
+static uint64_t pmtmr_tsc_old;
+
+static uint32_t
+pmtmr_val(void)
+{
+ uint64_t pmtmr_tsc_new;
+ uint64_t pmtmr_new;
+ static int inited = 0;
+
+ if (!inited) {
+ size_t len;
+ uint32_t tmpf;
+
+ inited = 1;
+ pthread_mutex_init(&pmtmr_mtx, NULL);
+ len = sizeof(tmpf);
+ sysctlbyname("machdep.tsc_freq", &tmpf, &len,
+ NULL, 0);
+ pmtmr_tscf = tmpf;
+ pmtmr_tsc_old = rdtsc();
+ pmtmr_old = pmtmr_tsc_old / pmtmr_tscf * PMTMR_FREQ;
+ return (pmtmr_old);
+ }
+
+ pthread_mutex_lock(&pmtmr_mtx);
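+	/*
+	 * Scale only the TSC delta since the last read, rather than the
+	 * absolute TSC value, so the 64-bit multiply by PMTMR_FREQ stays
+	 * in range, and accumulate the result.
+	 */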
+ pmtmr_tsc_new = rdtsc();
+ pmtmr_new = (pmtmr_tsc_new - pmtmr_tsc_old) * PMTMR_FREQ / pmtmr_tscf +
+ pmtmr_old;
+ pmtmr_old = pmtmr_new;
+ pmtmr_tsc_old = pmtmr_tsc_new;
+ pthread_mutex_unlock(&pmtmr_mtx);
+
+ return (pmtmr_new);
+}
+
+static int
+pmtmr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 1);
+
+ if (bytes != 4)
+ return (-1);
+
+ *eax = pmtmr_val();
+
+ return (0);
+}
+
+INOUT_PORT(pmtmr, IO_PMTMR, IOPORT_F_IN, pmtmr_handler);
diff --git a/usr.sbin/bhyve/post.c b/usr.sbin/bhyve/post.c
new file mode 100644
index 0000000..092a551
--- /dev/null
+++ b/usr.sbin/bhyve/post.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <assert.h>
+
+#include "inout.h"
+
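+/*
+ * I/O port 0x84 is commonly used by BIOSes as a POST-code/delay port;
+ * reads simply return 0xff here.
+ */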
+static int
+post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 1);
+
+ if (bytes != 1)
+ return (-1);
+
+ *eax = 0xff; /* return some garbage */
+ return (0);
+}
+
+INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler);
diff --git a/usr.sbin/bhyve/rtc.c b/usr.sbin/bhyve/rtc.c
new file mode 100644
index 0000000..f8b894e
--- /dev/null
+++ b/usr.sbin/bhyve/rtc.c
@@ -0,0 +1,274 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
+#include "inout.h"
+
+#define IO_RTC 0x70
+
+#define RTC_SEC 0x00 /* seconds */
+#define RTC_MIN 0x02
+#define RTC_HRS 0x04
+#define RTC_WDAY 0x06
+#define RTC_DAY 0x07
+#define RTC_MONTH 0x08
+#define RTC_YEAR 0x09
+#define RTC_CENTURY 0x32 /* current century */
+
+#define RTC_STATUSA 0xA
+#define RTCSA_TUP 0x80 /* time update, don't look now */
+
+#define RTC_STATUSB 0xB
+#define RTCSB_DST 0x01
+#define RTCSB_24HR 0x02
+#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */
+#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */
+#define RTCSB_HALT 0x80 /* stop clock updates */
+
+#define RTC_INTR 0x0c /* status register C (R) interrupt source */
+
+#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */
+#define RTCSD_PWR 0x80 /* clock power OK */
+
+#define RTC_DIAG 0x0e
+
+#define RTC_RSTCODE 0x0f
+
+#define RTC_EQUIPMENT 0x14
+
+static int addr;
+
+/* XXX initialize these to default values as they would be from BIOS */
+static uint8_t status_a, status_b, rstcode;
+
+static u_char const bin2bcd_data[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99
+};
+#define bin2bcd(bin) (bin2bcd_data[bin])
+
+#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val)))
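+/* e.g. in the default BCD mode, rtcout(59) returns 0x59 */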
+
+static void
+timevalfix(struct timeval *t1)
+{
+
+ if (t1->tv_usec < 0) {
+ t1->tv_sec--;
+ t1->tv_usec += 1000000;
+ }
+ if (t1->tv_usec >= 1000000) {
+ t1->tv_sec++;
+ t1->tv_usec -= 1000000;
+ }
+}
+
+static void
+timevalsub(struct timeval *t1, const struct timeval *t2)
+{
+
+ t1->tv_sec -= t2->tv_sec;
+ t1->tv_usec -= t2->tv_usec;
+ timevalfix(t1);
+}
+
+static int
+rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 0);
+
+ if (bytes != 1)
+ return (-1);
+
+ switch (*eax) {
+ case RTC_SEC:
+ case RTC_MIN:
+ case RTC_HRS:
+ case RTC_WDAY:
+ case RTC_DAY:
+ case RTC_MONTH:
+ case RTC_YEAR:
+ case RTC_CENTURY:
+ case RTC_STATUSA:
+ case RTC_STATUSB:
+ case RTC_INTR:
+ case RTC_STATUSD:
+ case RTC_DIAG:
+ case RTC_RSTCODE:
+ case RTC_EQUIPMENT:
+ break;
+ default:
+ return (-1);
+ }
+
+ addr = *eax;
+ return (0);
+}
+
+static int
+rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int hour;
+ time_t t;
+ struct timeval cur, delta;
+
+ static struct timeval last;
+ static struct tm tm;
+
+ if (bytes != 1)
+ return (-1);
+
+ gettimeofday(&cur, NULL);
+
+ /*
+ * Increment the cached time only once per second so we can guarantee
+ * that the guest has at least one second to read the hour:min:sec
+ * separately and still get a coherent view of the time.
+ */
+ delta = cur;
+ timevalsub(&delta, &last);
+ if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) {
+ t = cur.tv_sec;
+ localtime_r(&t, &tm);
+ last = cur;
+ }
+
+ if (in) {
+ switch (addr) {
+ case RTC_SEC:
+ *eax = rtcout(tm.tm_sec);
+ return (0);
+ case RTC_MIN:
+ *eax = rtcout(tm.tm_min);
+ return (0);
+ case RTC_HRS:
+ if (status_b & RTCSB_24HR)
+ hour = tm.tm_hour;
+ else
+				hour = ((tm.tm_hour + 11) % 12) + 1; /* 0 -> 12, 13 -> 1 */
+
+ *eax = rtcout(hour);
+
+ /*
+ * If we are representing time in the 12-hour format
+ * then set the MSB to indicate PM.
+ */
+ if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12)
+ *eax |= 0x80;
+
+ return (0);
+ case RTC_WDAY:
+ *eax = rtcout(tm.tm_wday + 1);
+ return (0);
+ case RTC_DAY:
+ *eax = rtcout(tm.tm_mday);
+ return (0);
+ case RTC_MONTH:
+ *eax = rtcout(tm.tm_mon + 1);
+ return (0);
+ case RTC_YEAR:
+ *eax = rtcout(tm.tm_year % 100);
+ return (0);
+		case RTC_CENTURY:
+			*eax = rtcout((tm.tm_year + 1900) / 100);
+			return (0);
+ case RTC_STATUSA:
+ *eax = status_a;
+ return (0);
+ case RTC_INTR:
+ *eax = 0;
+ return (0);
+ case RTC_STATUSD:
+ *eax = RTCSD_PWR;
+ return (0);
+ case RTC_DIAG:
+ *eax = 0;
+ return (0);
+ case RTC_RSTCODE:
+ *eax = rstcode;
+ return (0);
+ case RTC_EQUIPMENT:
+ *eax = 0;
+ return (0);
+ default:
+ return (-1);
+ }
+ }
+
+ switch (addr) {
+ case RTC_STATUSA:
+ status_a = *eax & ~RTCSA_TUP;
+ break;
+ case RTC_STATUSB:
+ /* XXX not implemented yet XXX */
+ if (*eax & RTCSB_PINTR)
+ return (-1);
+ status_b = *eax;
+ break;
+ case RTC_RSTCODE:
+ rstcode = *eax;
+ break;
+ case RTC_SEC:
+ case RTC_MIN:
+ case RTC_HRS:
+ case RTC_WDAY:
+ case RTC_DAY:
+ case RTC_MONTH:
+ case RTC_YEAR:
+ case RTC_CENTURY:
+ /*
+ * Ignore writes to the time of day registers
+ */
+ break;
+ default:
+ return (-1);
+ }
+ return (0);
+}
+
+INOUT_PORT(rtc, IO_RTC, IOPORT_F_OUT, rtc_addr_handler);
+INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler);
diff --git a/usr.sbin/bhyve/spinup_ap.c b/usr.sbin/bhyve/spinup_ap.c
new file mode 100644
index 0000000..2632aed
--- /dev/null
+++ b/usr.sbin/bhyve/spinup_ap.c
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "bhyverun.h"
+#include "spinup_ap.h"
+
+static void
+spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip)
+{
+ int vector, error;
+ uint16_t cs;
+ uint64_t desc_base;
+ uint32_t desc_limit, desc_access;
+
+ vector = *rip >> PAGE_SHIFT;
+ *rip = 0;
+
+ /*
+ * Update the %cs and %rip of the guest so that it starts
+	 * executing real mode code at 'vector << 12'.
+ */
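+	/*
+	 * e.g. a startup IPI vector of 0x09 results in %cs = 0x0900 and
+	 * %rip = 0, i.e. the AP begins executing at physical address
+	 * 0x9000.
+	 */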
+ error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip);
+ assert(error == 0);
+
+ error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base,
+ &desc_limit, &desc_access);
+ assert(error == 0);
+
+ desc_base = vector << PAGE_SHIFT;
+ error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ assert(error == 0);
+
+ cs = (vector << PAGE_SHIFT) >> 4;
+ error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs);
+ assert(error == 0);
+}
+
+int
+spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip)
+{
+ int error;
+
+ assert(newcpu != 0);
+ assert(newcpu < guest_ncpus);
+
+ error = vcpu_reset(ctx, newcpu);
+ assert(error == 0);
+
+ /* Set up capabilities */
+ if (fbsdrun_vmexit_on_hlt()) {
+ error = vm_set_capability(ctx, newcpu, VM_CAP_HALT_EXIT, 1);
+ assert(error == 0);
+ }
+
+ if (fbsdrun_vmexit_on_pause()) {
+ error = vm_set_capability(ctx, newcpu, VM_CAP_PAUSE_EXIT, 1);
+ assert(error == 0);
+ }
+
+ if (fbsdrun_disable_x2apic())
+ error = vm_set_x2apic_state(ctx, newcpu, X2APIC_DISABLED);
+ else
+ error = vm_set_x2apic_state(ctx, newcpu, X2APIC_ENABLED);
+ assert(error == 0);
+
+ /*
+ * Enable the 'unrestricted guest' mode for 'newcpu'.
+ *
+ * Set up the processor state in power-on 16-bit mode, with the CS:IP
+ * init'd to the specified low-mem 4K page.
+ */
+ error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
+ assert(error == 0);
+
+ spinup_ap_realmode(ctx, newcpu, &rip);
+
+ fbsdrun_addcpu(ctx, newcpu, rip);
+
+ return (newcpu);
+}
diff --git a/usr.sbin/bhyve/spinup_ap.h b/usr.sbin/bhyve/spinup_ap.h
new file mode 100644
index 0000000..2749ee9
--- /dev/null
+++ b/usr.sbin/bhyve/spinup_ap.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SPINUP_AP_H_
+#define _SPINUP_AP_H_
+
+int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip);
+
+#endif
diff --git a/usr.sbin/bhyve/uart.c b/usr.sbin/bhyve/uart.c
new file mode 100644
index 0000000..640f3bf
--- /dev/null
+++ b/usr.sbin/bhyve/uart.c
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <assert.h>
+
+#include "inout.h"
+
+#define COM1 0x3F8
+#define COM2 0x2F8
+
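+/* Interrupt identification register, at offset 2 from the port base */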
+#define REG_IIR 2
+
+static int
+com_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in);
+
+ if (bytes != 1)
+ return (-1);
+
+ /*
+ * COM port is not implemented so we return 0xFF for all registers
+ */
+ *eax = 0xFF;
+
+ return (0);
+}
+
+INOUT_PORT(uart, COM1 + REG_IIR, IOPORT_F_IN, com_handler);
+INOUT_PORT(uart, COM2 + REG_IIR, IOPORT_F_IN, com_handler);
diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h
new file mode 100644
index 0000000..474e244
--- /dev/null
+++ b/usr.sbin/bhyve/virtio.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_H_
+#define _VIRTIO_H_
+
+#define VRING_ALIGN 4096
+
+#define VRING_DESC_F_NEXT (1 << 0)
+#define VRING_DESC_F_WRITE (1 << 1)
+#define VRING_DESC_F_INDIRECT (1 << 2)
+
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+struct virtio_desc {
+ uint64_t vd_addr;
+ uint32_t vd_len;
+ uint16_t vd_flags;
+ uint16_t vd_next;
+} __packed;
+
+struct virtio_used {
+ uint32_t vu_idx;
+ uint32_t vu_tlen;
+} __packed;
+
+/*
+ * PFN register shift amount
+ */
+#define VRING_PFN 12
+
+/*
+ * Virtio device types
+ */
+#define VIRTIO_TYPE_NET 1
+#define VIRTIO_TYPE_BLOCK 2
+
+/*
+ * PCI vendor/device IDs
+ */
+#define VIRTIO_VENDOR 0x1AF4
+#define VIRTIO_DEV_NET 0x1000
+#define VIRTIO_DEV_BLOCK 0x1001
+
+/*
+ * PCI config space constants
+ */
+#define VTCFG_R_HOSTCAP 0
+#define VTCFG_R_GUESTCAP 4
+#define VTCFG_R_PFN 8
+#define VTCFG_R_QNUM 12
+#define VTCFG_R_QSEL 14
+#define VTCFG_R_QNOTIFY 16
+#define VTCFG_R_STATUS 18
+#define VTCFG_R_ISR 19
+#define VTCFG_R_CFG0 20 /* No MSI-X */
+#define VTCFG_R_CFG1 24 /* With MSI-X */
+#define VTCFG_R_MSIX 20
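+
+/*
+ * These offsets follow the legacy virtio PCI configuration layout.
+ * Device-specific config starts at VTCFG_R_CFG0, or at VTCFG_R_CFG1
+ * when MSI-X is enabled, since the two 16-bit MSI-X vector registers
+ * take up 4 bytes.
+ */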
+
+#endif /* _VIRTIO_H_ */
diff --git a/usr.sbin/bhyve/xmsr.c b/usr.sbin/bhyve/xmsr.c
new file mode 100644
index 0000000..9c05f02
--- /dev/null
+++ b/usr.sbin/bhyve/xmsr.c
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "xmsr.h"
+
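+/*
+ * Catch-all for WRMSR exits that were not emulated elsewhere; any MSR
+ * write that reaches here is unexpected, so report it and give up.
+ */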
+int
+emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val)
+{
+
+ printf("Unknown WRMSR code %x, val %lx, cpu %d\n", code, val, vcpu);
+ exit(1);
+}
diff --git a/usr.sbin/bhyve/xmsr.h b/usr.sbin/bhyve/xmsr.h
new file mode 100644
index 0000000..8cebcea
--- /dev/null
+++ b/usr.sbin/bhyve/xmsr.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _XMSR_H_
+#define _XMSR_H_
+
+int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);
+
+#endif
diff --git a/usr.sbin/bhyvectl/Makefile b/usr.sbin/bhyvectl/Makefile
new file mode 100644
index 0000000..9fde12c
--- /dev/null
+++ b/usr.sbin/bhyvectl/Makefile
@@ -0,0 +1,17 @@
+#
+# $FreeBSD$
+#
+
+PROG= bhyvectl
+SRCS= bhyvectl.c
+
+NO_MAN=
+
+DPADD= ${LIBVMMAPI}
+LDADD= -lvmmapi
+
+WARNS?= 3
+
+CFLAGS+= -I${.CURDIR}/../../sys/amd64/vmm
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
new file mode 100644
index 0000000..d5e0503
--- /dev/null
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -0,0 +1,1524 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/errno.h>
+#include <sys/mman.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <libutil.h>
+#include <fcntl.h>
+#include <string.h>
+#include <getopt.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "intel/vmcs.h"
+
+#define MB (1UL << 20)
+#define GB (1UL << 30)
+
+#define REQ_ARG required_argument
+#define NO_ARG no_argument
+#define OPT_ARG optional_argument
+
+static const char *progname;
+
+static void
+usage(void)
+{
+
+ (void)fprintf(stderr,
+ "Usage: %s --vm=<name>\n"
+ " [--cpu=<vcpu_number>]\n"
+ " [--create]\n"
+ " [--destroy]\n"
+ " [--get-all]\n"
+ " [--get-stats]\n"
+ " [--set-desc-ds]\n"
+ " [--get-desc-ds]\n"
+ " [--set-desc-es]\n"
+ " [--get-desc-es]\n"
+ " [--set-desc-gs]\n"
+ " [--get-desc-gs]\n"
+ " [--set-desc-fs]\n"
+ " [--get-desc-fs]\n"
+ " [--set-desc-cs]\n"
+ " [--get-desc-cs]\n"
+ " [--set-desc-ss]\n"
+ " [--get-desc-ss]\n"
+ " [--set-desc-tr]\n"
+ " [--get-desc-tr]\n"
+ " [--set-desc-ldtr]\n"
+ " [--get-desc-ldtr]\n"
+ " [--set-desc-gdtr]\n"
+ " [--get-desc-gdtr]\n"
+ " [--set-desc-idtr]\n"
+ " [--get-desc-idtr]\n"
+ " [--run]\n"
+ " [--capname=<capname>]\n"
+ " [--getcap]\n"
+ " [--setcap=<0|1>]\n"
+ " [--desc-base=<BASE>]\n"
+ " [--desc-limit=<LIMIT>]\n"
+ " [--desc-access=<ACCESS>]\n"
+ " [--set-cr0=<CR0>]\n"
+ " [--get-cr0]\n"
+ " [--set-cr3=<CR3>]\n"
+ " [--get-cr3]\n"
+ " [--set-cr4=<CR4>]\n"
+ " [--get-cr4]\n"
+ " [--set-dr7=<DR7>]\n"
+ " [--get-dr7]\n"
+ " [--set-rsp=<RSP>]\n"
+ " [--get-rsp]\n"
+ " [--set-rip=<RIP>]\n"
+ " [--get-rip]\n"
+ " [--get-rax]\n"
+ " [--set-rax=<RAX>]\n"
+ " [--get-rbx]\n"
+ " [--get-rcx]\n"
+ " [--get-rdx]\n"
+ " [--get-rsi]\n"
+ " [--get-rdi]\n"
+ " [--get-rbp]\n"
+ " [--get-r8]\n"
+ " [--get-r9]\n"
+ " [--get-r10]\n"
+ " [--get-r11]\n"
+ " [--get-r12]\n"
+ " [--get-r13]\n"
+ " [--get-r14]\n"
+ " [--get-r15]\n"
+ " [--set-rflags=<RFLAGS>]\n"
+ " [--get-rflags]\n"
+ " [--set-cs]\n"
+ " [--get-cs]\n"
+ " [--set-ds]\n"
+ " [--get-ds]\n"
+ " [--set-es]\n"
+ " [--get-es]\n"
+ " [--set-fs]\n"
+ " [--get-fs]\n"
+ " [--set-gs]\n"
+ " [--get-gs]\n"
+ " [--set-ss]\n"
+ " [--get-ss]\n"
+ " [--get-tr]\n"
+ " [--get-ldtr]\n"
+ " [--get-vmcs-pinbased-ctls]\n"
+ " [--get-vmcs-procbased-ctls]\n"
+ " [--get-vmcs-procbased-ctls2]\n"
+ " [--get-vmcs-entry-interruption-info]\n"
+ " [--set-vmcs-entry-interruption-info=<info>]\n"
+ " [--get-vmcs-eptp]\n"
+	       "       [--get-vmcs-guest-physical-address]\n"
+	       "       [--get-vmcs-guest-linear-address]\n"
+ " [--set-vmcs-exception-bitmap]\n"
+ " [--get-vmcs-exception-bitmap]\n"
+ " [--get-vmcs-io-bitmap-address]\n"
+ " [--get-vmcs-tsc-offset]\n"
+ " [--get-vmcs-guest-pat]\n"
+ " [--get-vmcs-host-pat]\n"
+ " [--get-vmcs-host-cr0]\n"
+ " [--get-vmcs-host-cr3]\n"
+ " [--get-vmcs-host-cr4]\n"
+ " [--get-vmcs-host-rip]\n"
+ " [--get-vmcs-host-rsp]\n"
+ " [--get-vmcs-cr0-mask]\n"
+ " [--get-vmcs-cr0-shadow]\n"
+ " [--get-vmcs-cr4-mask]\n"
+ " [--get-vmcs-cr4-shadow]\n"
+ " [--get-vmcs-cr3-targets]\n"
+ " [--get-vmcs-apic-access-address]\n"
+ " [--get-vmcs-virtual-apic-address]\n"
+ " [--get-vmcs-tpr-threshold]\n"
+ " [--get-vmcs-msr-bitmap]\n"
+ " [--get-vmcs-msr-bitmap-address]\n"
+ " [--get-vmcs-vpid]\n"
+ " [--get-vmcs-ple-gap]\n"
+ " [--get-vmcs-ple-window]\n"
+ " [--get-vmcs-instruction-error]\n"
+ " [--get-vmcs-exit-ctls]\n"
+ " [--get-vmcs-entry-ctls]\n"
+ " [--get-vmcs-guest-sysenter]\n"
+ " [--get-vmcs-link]\n"
+ " [--get-vmcs-exit-reason]\n"
+ " [--get-vmcs-exit-qualification]\n"
+ " [--get-vmcs-exit-interruption-info]\n"
+ " [--get-vmcs-exit-interruption-error]\n"
+ " [--get-vmcs-interruptibility]\n"
+ " [--set-pinning=<host_cpuid>]\n"
+ " [--get-pinning]\n"
+ " [--set-x2apic-state=<state>]\n"
+ " [--get-x2apic-state]\n"
+ " [--set-lowmem=<memory below 4GB in units of MB>]\n"
+ " [--get-lowmem]\n"
+ " [--set-highmem=<memory above 4GB in units of MB>]\n"
+ " [--get-highmem]\n",
+ progname);
+ exit(1);
+}
+
+static int get_stats, getcap, setcap, capval;
+static const char *capname;
+static int create, destroy, get_lowmem, get_highmem;
+static uint64_t lowmem, highmem;
+static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4;
+static int set_efer, get_efer;
+static int set_dr7, get_dr7;
+static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags;
+static int set_rax, get_rax;
+static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp;
+static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15;
+static int set_desc_ds, get_desc_ds;
+static int set_desc_es, get_desc_es;
+static int set_desc_fs, get_desc_fs;
+static int set_desc_gs, get_desc_gs;
+static int set_desc_cs, get_desc_cs;
+static int set_desc_ss, get_desc_ss;
+static int set_desc_gdtr, get_desc_gdtr;
+static int set_desc_idtr, get_desc_idtr;
+static int set_desc_tr, get_desc_tr;
+static int set_desc_ldtr, get_desc_ldtr;
+static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr;
+static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr;
+static int set_pinning, get_pinning, pincpu;
+static int set_x2apic_state, get_x2apic_state;
+enum x2apic_state x2apic_state;
+static int run;
+
+/*
+ * VMCS-specific fields
+ */
+static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2;
+static int get_eptp, get_io_bitmap, get_tsc_offset;
+static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info;
+static int get_vmcs_interruptibility;
+uint32_t vmcs_entry_interruption_info;
+static int get_vmcs_gpa, get_vmcs_gla;
+static int get_exception_bitmap, set_exception_bitmap, exception_bitmap;
+static int get_cr0_mask, get_cr0_shadow;
+static int get_cr4_mask, get_cr4_shadow;
+static int get_cr3_targets;
+static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold;
+static int get_msr_bitmap, get_msr_bitmap_address;
+static int get_vpid, get_ple_gap, get_ple_window;
+static int get_inst_err, get_exit_ctls, get_entry_ctls;
+static int get_host_cr0, get_host_cr3, get_host_cr4;
+static int get_host_rip, get_host_rsp;
+static int get_guest_pat, get_host_pat;
+static int get_guest_sysenter, get_vmcs_link;
+static int get_vmcs_exit_reason, get_vmcs_exit_qualification;
+static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error;
+
+static uint64_t desc_base;
+static uint32_t desc_limit, desc_access;
+
+static int get_all;
+
+static void
+dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu)
+{
+ printf("vm exit[%d]\n", vcpu);
+ printf("\trip\t\t0x%016lx\n", vmexit->rip);
+ printf("\tinst_length\t%d\n", vmexit->inst_length);
+ switch (vmexit->exitcode) {
+ case VM_EXITCODE_INOUT:
+ printf("\treason\t\tINOUT\n");
+ printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT");
+ printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes);
+ printf("\tflags\t\t%s%s\n",
+ vmexit->u.inout.string ? "STRING " : "",
+ vmexit->u.inout.rep ? "REP " : "");
+ printf("\tport\t\t0x%04x\n", vmexit->u.inout.port);
+ printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax);
+ break;
+ case VM_EXITCODE_VMX:
+ printf("\treason\t\tVMX\n");
+ printf("\terror\t\t%d\n", vmexit->u.vmx.error);
+ printf("\texit_reason\t0x%08x (%u)\n",
+ vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason);
+ printf("\tqualification\t0x%016lx\n",
+ vmexit->u.vmx.exit_qualification);
+ break;
+ default:
+ printf("*** unknown vm run exitcode %d\n", vmexit->exitcode);
+ break;
+ }
+}
+
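+/*
+ * Dump the MSR bitmap pointed to by the VMCS. Per the Intel VT-x layout
+ * assumed here, the 4KB page holds read bitmaps for MSRs 0x00000000-
+ * 0x00001FFF and 0xC0000000-0xC0001FFF at offsets 0 and 1024, followed
+ * by the corresponding write bitmaps at offsets 2048 and 3072. A clear
+ * bit means the access is not intercepted, which is reported as 'R'/'W'
+ * below.
+ */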
+static int
+dump_vmcs_msr_bitmap(int vcpu, u_long addr)
+{
+ int error, fd, byte, bit, readable, writeable;
+ u_int msr;
+ const char *bitmap;
+
+ error = -1;
+ bitmap = MAP_FAILED;
+
+ fd = open("/dev/mem", O_RDONLY, 0);
+ if (fd < 0)
+ goto done;
+
+	bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, addr);
+ if (bitmap == MAP_FAILED)
+ goto done;
+
+ for (msr = 0; msr < 0x2000; msr++) {
+ byte = msr / 8;
+ bit = msr & 0x7;
+
+ /* Look at MSRs in the range 0x00000000 to 0x00001FFF */
+ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
+ writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
+ if (readable || writeable) {
+ printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu,
+ readable ? 'R' : '-',
+ writeable ? 'W' : '-');
+ }
+
+ /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */
+ byte += 1024;
+ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
+ writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
+ if (readable || writeable) {
+ printf("msr 0x%08x[%d]\t\t%c%c\n",
+ 0xc0000000 + msr, vcpu,
+ readable ? 'R' : '-',
+ writeable ? 'W' : '-');
+ }
+ }
+
+ error = 0;
+done:
+ if (bitmap != MAP_FAILED)
+ munmap((void *)bitmap, PAGE_SIZE);
+ if (fd >= 0)
+ close(fd);
+ return (error);
+}
+
+static int
+vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val)
+{
+
+ return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val));
+}
+
+static int
+vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val)
+{
+
+ return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val));
+}
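+
+/*
+ * Illustrative use of the wrappers above (a sketch, not called anywhere):
+ *
+ *	uint64_t vpid;
+ *	if (vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid) == 0)
+ *		printf("vpid 0x%04lx\n", vpid);
+ *
+ * VMCS_IDENT() tags the raw VMCS encoding so that the vmm driver can
+ * tell it apart from the VM_REG_* identifiers that vm_get_register()
+ * and vm_set_register() normally take.
+ */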
+
+enum {
+ VMNAME = 1000, /* avoid collision with return values from getopt */
+ VCPU,
+ SET_LOWMEM,
+ SET_HIGHMEM,
+ SET_EFER,
+ SET_CR0,
+ SET_CR3,
+ SET_CR4,
+ SET_DR7,
+ SET_RSP,
+ SET_RIP,
+ SET_RAX,
+ SET_RFLAGS,
+ DESC_BASE,
+ DESC_LIMIT,
+ DESC_ACCESS,
+ SET_CS,
+ SET_DS,
+ SET_ES,
+ SET_FS,
+ SET_GS,
+ SET_SS,
+ SET_TR,
+ SET_LDTR,
+ SET_PINNING,
+ SET_X2APIC_STATE,
+ SET_VMCS_EXCEPTION_BITMAP,
+ SET_VMCS_ENTRY_INTERRUPTION_INFO,
+ SET_CAP,
+ CAPNAME,
+};
+
+int
+main(int argc, char *argv[])
+{
+	char *vmname = NULL;
+ int error, ch, vcpu;
+ vm_paddr_t gpa;
+ size_t len;
+ struct vm_exit vmexit;
+ uint64_t ctl, eptp, bm, addr, u64;
+ struct vmctx *ctx;
+
+ uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat;
+ uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp;
+ uint64_t r8, r9, r10, r11, r12, r13, r14, r15;
+ uint64_t cs, ds, es, fs, gs, ss, tr, ldtr;
+
+ struct option opts[] = {
+ { "vm", REQ_ARG, 0, VMNAME },
+ { "cpu", REQ_ARG, 0, VCPU },
+ { "set-lowmem", REQ_ARG, 0, SET_LOWMEM },
+ { "set-highmem",REQ_ARG, 0, SET_HIGHMEM },
+ { "set-efer", REQ_ARG, 0, SET_EFER },
+ { "set-cr0", REQ_ARG, 0, SET_CR0 },
+ { "set-cr3", REQ_ARG, 0, SET_CR3 },
+ { "set-cr4", REQ_ARG, 0, SET_CR4 },
+ { "set-dr7", REQ_ARG, 0, SET_DR7 },
+ { "set-rsp", REQ_ARG, 0, SET_RSP },
+ { "set-rip", REQ_ARG, 0, SET_RIP },
+ { "set-rax", REQ_ARG, 0, SET_RAX },
+ { "set-rflags", REQ_ARG, 0, SET_RFLAGS },
+ { "desc-base", REQ_ARG, 0, DESC_BASE },
+ { "desc-limit", REQ_ARG, 0, DESC_LIMIT },
+ { "desc-access",REQ_ARG, 0, DESC_ACCESS },
+ { "set-cs", REQ_ARG, 0, SET_CS },
+ { "set-ds", REQ_ARG, 0, SET_DS },
+ { "set-es", REQ_ARG, 0, SET_ES },
+ { "set-fs", REQ_ARG, 0, SET_FS },
+ { "set-gs", REQ_ARG, 0, SET_GS },
+ { "set-ss", REQ_ARG, 0, SET_SS },
+ { "set-tr", REQ_ARG, 0, SET_TR },
+ { "set-ldtr", REQ_ARG, 0, SET_LDTR },
+ { "set-pinning",REQ_ARG, 0, SET_PINNING },
+ { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE },
+ { "set-vmcs-exception-bitmap",
+ REQ_ARG, 0, SET_VMCS_EXCEPTION_BITMAP },
+ { "set-vmcs-entry-interruption-info",
+ REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO },
+ { "capname", REQ_ARG, 0, CAPNAME },
+ { "setcap", REQ_ARG, 0, SET_CAP },
+ { "getcap", NO_ARG, &getcap, 1 },
+ { "get-stats", NO_ARG, &get_stats, 1 },
+ { "get-desc-ds",NO_ARG, &get_desc_ds, 1 },
+ { "set-desc-ds",NO_ARG, &set_desc_ds, 1 },
+ { "get-desc-es",NO_ARG, &get_desc_es, 1 },
+ { "set-desc-es",NO_ARG, &set_desc_es, 1 },
+ { "get-desc-ss",NO_ARG, &get_desc_ss, 1 },
+ { "set-desc-ss",NO_ARG, &set_desc_ss, 1 },
+ { "get-desc-cs",NO_ARG, &get_desc_cs, 1 },
+ { "set-desc-cs",NO_ARG, &set_desc_cs, 1 },
+ { "get-desc-fs",NO_ARG, &get_desc_fs, 1 },
+ { "set-desc-fs",NO_ARG, &set_desc_fs, 1 },
+ { "get-desc-gs",NO_ARG, &get_desc_gs, 1 },
+ { "set-desc-gs",NO_ARG, &set_desc_gs, 1 },
+ { "get-desc-tr",NO_ARG, &get_desc_tr, 1 },
+ { "set-desc-tr",NO_ARG, &set_desc_tr, 1 },
+ { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 },
+ { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 },
+ { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 },
+ { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 },
+ { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 },
+ { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 },
+ { "get-lowmem", NO_ARG, &get_lowmem, 1 },
+ { "get-highmem",NO_ARG, &get_highmem, 1 },
+ { "get-efer", NO_ARG, &get_efer, 1 },
+ { "get-cr0", NO_ARG, &get_cr0, 1 },
+ { "get-cr3", NO_ARG, &get_cr3, 1 },
+ { "get-cr4", NO_ARG, &get_cr4, 1 },
+ { "get-dr7", NO_ARG, &get_dr7, 1 },
+ { "get-rsp", NO_ARG, &get_rsp, 1 },
+ { "get-rip", NO_ARG, &get_rip, 1 },
+ { "get-rax", NO_ARG, &get_rax, 1 },
+ { "get-rbx", NO_ARG, &get_rbx, 1 },
+ { "get-rcx", NO_ARG, &get_rcx, 1 },
+ { "get-rdx", NO_ARG, &get_rdx, 1 },
+ { "get-rsi", NO_ARG, &get_rsi, 1 },
+ { "get-rdi", NO_ARG, &get_rdi, 1 },
+ { "get-rbp", NO_ARG, &get_rbp, 1 },
+ { "get-r8", NO_ARG, &get_r8, 1 },
+ { "get-r9", NO_ARG, &get_r9, 1 },
+ { "get-r10", NO_ARG, &get_r10, 1 },
+ { "get-r11", NO_ARG, &get_r11, 1 },
+ { "get-r12", NO_ARG, &get_r12, 1 },
+ { "get-r13", NO_ARG, &get_r13, 1 },
+ { "get-r14", NO_ARG, &get_r14, 1 },
+ { "get-r15", NO_ARG, &get_r15, 1 },
+ { "get-rflags", NO_ARG, &get_rflags, 1 },
+ { "get-cs", NO_ARG, &get_cs, 1 },
+ { "get-ds", NO_ARG, &get_ds, 1 },
+ { "get-es", NO_ARG, &get_es, 1 },
+ { "get-fs", NO_ARG, &get_fs, 1 },
+ { "get-gs", NO_ARG, &get_gs, 1 },
+ { "get-ss", NO_ARG, &get_ss, 1 },
+ { "get-tr", NO_ARG, &get_tr, 1 },
+ { "get-ldtr", NO_ARG, &get_ldtr, 1 },
+ { "get-vmcs-pinbased-ctls",
+ NO_ARG, &get_pinbased_ctls, 1 },
+ { "get-vmcs-procbased-ctls",
+ NO_ARG, &get_procbased_ctls, 1 },
+ { "get-vmcs-procbased-ctls2",
+ NO_ARG, &get_procbased_ctls2, 1 },
+ { "get-vmcs-guest-linear-address",
+ NO_ARG, &get_vmcs_gla, 1 },
+ { "get-vmcs-guest-physical-address",
+ NO_ARG, &get_vmcs_gpa, 1 },
+ { "get-vmcs-entry-interruption-info",
+ NO_ARG, &get_vmcs_entry_interruption_info, 1},
+ { "get-vmcs-eptp", NO_ARG, &get_eptp, 1 },
+ { "get-vmcs-exception-bitmap",
+ NO_ARG, &get_exception_bitmap, 1 },
+ { "get-vmcs-io-bitmap-address",
+ NO_ARG, &get_io_bitmap, 1 },
+ { "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 },
+ { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 },
+ { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 },
+ { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 },
+ { "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 },
+ { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1},
+ { "get-vmcs-apic-access-address",
+ NO_ARG, &get_apic_access_addr, 1},
+ { "get-vmcs-virtual-apic-address",
+ NO_ARG, &get_virtual_apic_addr, 1},
+ { "get-vmcs-tpr-threshold",
+ NO_ARG, &get_tpr_threshold, 1 },
+ { "get-vmcs-msr-bitmap",
+ NO_ARG, &get_msr_bitmap, 1 },
+ { "get-vmcs-msr-bitmap-address",
+ NO_ARG, &get_msr_bitmap_address, 1 },
+ { "get-vmcs-vpid", NO_ARG, &get_vpid, 1 },
+ { "get-vmcs-ple-gap", NO_ARG, &get_ple_gap, 1 },
+ { "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 },
+ { "get-vmcs-instruction-error",
+ NO_ARG, &get_inst_err, 1 },
+ { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 },
+ { "get-vmcs-entry-ctls",
+ NO_ARG, &get_entry_ctls, 1 },
+ { "get-vmcs-guest-pat", NO_ARG, &get_guest_pat, 1 },
+ { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 },
+ { "get-vmcs-host-cr0",
+ NO_ARG, &get_host_cr0, 1 },
+ { "get-vmcs-host-cr3",
+ NO_ARG, &get_host_cr3, 1 },
+ { "get-vmcs-host-cr4",
+ NO_ARG, &get_host_cr4, 1 },
+ { "get-vmcs-host-rip",
+ NO_ARG, &get_host_rip, 1 },
+ { "get-vmcs-host-rsp",
+ NO_ARG, &get_host_rsp, 1 },
+ { "get-vmcs-guest-sysenter",
+ NO_ARG, &get_guest_sysenter, 1 },
+ { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 },
+ { "get-vmcs-exit-reason",
+ NO_ARG, &get_vmcs_exit_reason, 1 },
+ { "get-vmcs-exit-qualification",
+ NO_ARG, &get_vmcs_exit_qualification, 1 },
+ { "get-vmcs-exit-interruption-info",
+ NO_ARG, &get_vmcs_exit_interruption_info, 1},
+ { "get-vmcs-exit-interruption-error",
+ NO_ARG, &get_vmcs_exit_interruption_error, 1},
+ { "get-vmcs-interruptibility",
+ NO_ARG, &get_vmcs_interruptibility, 1 },
+ { "get-pinning",NO_ARG, &get_pinning, 1 },
+ { "get-x2apic-state",NO_ARG, &get_x2apic_state, 1 },
+ { "get-all", NO_ARG, &get_all, 1 },
+ { "run", NO_ARG, &run, 1 },
+ { "create", NO_ARG, &create, 1 },
+ { "destroy", NO_ARG, &destroy, 1 },
+ { NULL, 0, NULL, 0 }
+ };
+
+ vcpu = 0;
+ progname = basename(argv[0]);
+
+ while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) {
+ switch (ch) {
+ case 0:
+ break;
+ case VMNAME:
+ vmname = optarg;
+ break;
+ case VCPU:
+ vcpu = atoi(optarg);
+ break;
+ case SET_LOWMEM:
+ lowmem = atoi(optarg) * MB;
+ lowmem = roundup(lowmem, 2 * MB);
+ break;
+ case SET_HIGHMEM:
+ highmem = atoi(optarg) * MB;
+ highmem = roundup(highmem, 2 * MB);
+ break;
+ case SET_EFER:
+ efer = strtoul(optarg, NULL, 0);
+ set_efer = 1;
+ break;
+ case SET_CR0:
+ cr0 = strtoul(optarg, NULL, 0);
+ set_cr0 = 1;
+ break;
+ case SET_CR3:
+ cr3 = strtoul(optarg, NULL, 0);
+ set_cr3 = 1;
+ break;
+ case SET_CR4:
+ cr4 = strtoul(optarg, NULL, 0);
+ set_cr4 = 1;
+ break;
+ case SET_DR7:
+ dr7 = strtoul(optarg, NULL, 0);
+ set_dr7 = 1;
+ break;
+ case SET_RSP:
+ rsp = strtoul(optarg, NULL, 0);
+ set_rsp = 1;
+ break;
+ case SET_RIP:
+ rip = strtoul(optarg, NULL, 0);
+ set_rip = 1;
+ break;
+ case SET_RAX:
+ rax = strtoul(optarg, NULL, 0);
+ set_rax = 1;
+ break;
+ case SET_RFLAGS:
+ rflags = strtoul(optarg, NULL, 0);
+ set_rflags = 1;
+ break;
+ case DESC_BASE:
+ desc_base = strtoul(optarg, NULL, 0);
+ break;
+ case DESC_LIMIT:
+ desc_limit = strtoul(optarg, NULL, 0);
+ break;
+ case DESC_ACCESS:
+ desc_access = strtoul(optarg, NULL, 0);
+ break;
+ case SET_CS:
+ cs = strtoul(optarg, NULL, 0);
+ set_cs = 1;
+ break;
+ case SET_DS:
+ ds = strtoul(optarg, NULL, 0);
+ set_ds = 1;
+ break;
+ case SET_ES:
+ es = strtoul(optarg, NULL, 0);
+ set_es = 1;
+ break;
+ case SET_FS:
+ fs = strtoul(optarg, NULL, 0);
+ set_fs = 1;
+ break;
+ case SET_GS:
+ gs = strtoul(optarg, NULL, 0);
+ set_gs = 1;
+ break;
+ case SET_SS:
+ ss = strtoul(optarg, NULL, 0);
+ set_ss = 1;
+ break;
+ case SET_TR:
+ tr = strtoul(optarg, NULL, 0);
+ set_tr = 1;
+ break;
+ case SET_LDTR:
+ ldtr = strtoul(optarg, NULL, 0);
+ set_ldtr = 1;
+ break;
+ case SET_PINNING:
+ pincpu = strtol(optarg, NULL, 0);
+ set_pinning = 1;
+ break;
+ case SET_X2APIC_STATE:
+ x2apic_state = strtol(optarg, NULL, 0);
+ set_x2apic_state = 1;
+ break;
+ case SET_VMCS_EXCEPTION_BITMAP:
+ exception_bitmap = strtoul(optarg, NULL, 0);
+ set_exception_bitmap = 1;
+ break;
+ case SET_VMCS_ENTRY_INTERRUPTION_INFO:
+ vmcs_entry_interruption_info = strtoul(optarg, NULL, 0);
+ set_vmcs_entry_interruption_info = 1;
+ break;
+ case SET_CAP:
+ capval = strtoul(optarg, NULL, 0);
+ setcap = 1;
+ break;
+ case CAPNAME:
+ capname = optarg;
+ break;
+ default:
+ usage();
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (vmname == NULL)
+ usage();
+
+ error = 0;
+
+ if (!error && create)
+ error = vm_create(vmname);
+
+ if (!error) {
+ ctx = vm_open(vmname);
+ if (ctx == NULL)
+ error = -1;
+ }
+
+ if (!error && lowmem)
+ error = vm_setup_memory(ctx, 0, lowmem, NULL);
+
+ if (!error && highmem)
+ error = vm_setup_memory(ctx, 4 * GB, highmem, NULL);
+
+ if (!error && set_efer)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer);
+
+ if (!error && set_cr0)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0);
+
+ if (!error && set_cr3)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3);
+
+ if (!error && set_cr4)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4);
+
+ if (!error && set_dr7)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7);
+
+ if (!error && set_rsp)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp);
+
+ if (!error && set_rip)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip);
+
+ if (!error && set_rax)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax);
+
+ if (!error && set_rflags) {
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
+ rflags);
+ }
+
+ if (!error && set_desc_ds) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_es) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_ss) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_cs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_fs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_gs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_tr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_ldtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_gdtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
+ desc_base, desc_limit, 0);
+ }
+
+ if (!error && set_desc_idtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
+ desc_base, desc_limit, 0);
+ }
+
+ if (!error && set_cs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs);
+
+ if (!error && set_ds)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds);
+
+ if (!error && set_es)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es);
+
+ if (!error && set_fs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs);
+
+ if (!error && set_gs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs);
+
+ if (!error && set_ss)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss);
+
+ if (!error && set_tr)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr);
+
+ if (!error && set_ldtr)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr);
+
+ if (!error && set_pinning)
+ error = vm_set_pinning(ctx, vcpu, pincpu);
+
+ if (!error && set_x2apic_state)
+ error = vm_set_x2apic_state(ctx, vcpu, x2apic_state);
+
+ if (!error && set_exception_bitmap) {
+ error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
+ exception_bitmap);
+ }
+
+ if (!error && set_vmcs_entry_interruption_info) {
+ error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,
+ vmcs_entry_interruption_info);
+ }
+
+ if (!error && (get_lowmem || get_all)) {
+ gpa = 0;
+ error = vm_get_memory_seg(ctx, gpa, &len);
+ if (error == 0)
+ printf("lowmem\t\t0x%016lx/%ld\n", gpa, len);
+ }
+
+ if (!error && (get_highmem || get_all)) {
+ gpa = 4 * GB;
+ error = vm_get_memory_seg(ctx, gpa, &len);
+ if (error == 0)
+ printf("highmem\t\t0x%016lx/%ld\n", gpa, len);
+ }
+
+ if (!error && (get_efer || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer);
+ if (error == 0)
+ printf("efer[%d]\t\t0x%016lx\n", vcpu, efer);
+ }
+
+ if (!error && (get_cr0 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0);
+ if (error == 0)
+ printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
+ }
+
+ if (!error && (get_cr3 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3);
+ if (error == 0)
+ printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
+ }
+
+ if (!error && (get_cr4 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4);
+ if (error == 0)
+ printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
+ }
+
+ if (!error && (get_dr7 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7);
+ if (error == 0)
+ printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7);
+ }
+
+ if (!error && (get_rsp || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp);
+ if (error == 0)
+ printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp);
+ }
+
+ if (!error && (get_rip || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
+ if (error == 0)
+ printf("rip[%d]\t\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && (get_rax || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax);
+ if (error == 0)
+ printf("rax[%d]\t\t0x%016lx\n", vcpu, rax);
+ }
+
+ if (!error && (get_rbx || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx);
+ if (error == 0)
+ printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx);
+ }
+
+ if (!error && (get_rcx || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx);
+ if (error == 0)
+ printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx);
+ }
+
+ if (!error && (get_rdx || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx);
+ if (error == 0)
+ printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx);
+ }
+
+ if (!error && (get_rsi || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi);
+ if (error == 0)
+ printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi);
+ }
+
+ if (!error && (get_rdi || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi);
+ if (error == 0)
+ printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi);
+ }
+
+ if (!error && (get_rbp || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp);
+ if (error == 0)
+ printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp);
+ }
+
+ if (!error && (get_r8 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8);
+ if (error == 0)
+ printf("r8[%d]\t\t0x%016lx\n", vcpu, r8);
+ }
+
+ if (!error && (get_r9 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9);
+ if (error == 0)
+ printf("r9[%d]\t\t0x%016lx\n", vcpu, r9);
+ }
+
+ if (!error && (get_r10 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10);
+ if (error == 0)
+ printf("r10[%d]\t\t0x%016lx\n", vcpu, r10);
+ }
+
+ if (!error && (get_r11 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11);
+ if (error == 0)
+ printf("r11[%d]\t\t0x%016lx\n", vcpu, r11);
+ }
+
+ if (!error && (get_r12 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12);
+ if (error == 0)
+ printf("r12[%d]\t\t0x%016lx\n", vcpu, r12);
+ }
+
+ if (!error && (get_r13 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13);
+ if (error == 0)
+ printf("r13[%d]\t\t0x%016lx\n", vcpu, r13);
+ }
+
+ if (!error && (get_r14 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14);
+ if (error == 0)
+ printf("r14[%d]\t\t0x%016lx\n", vcpu, r14);
+ }
+
+ if (!error && (get_r15 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15);
+ if (error == 0)
+ printf("r15[%d]\t\t0x%016lx\n", vcpu, r15);
+ }
+
+ if (!error && (get_rflags || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
+ &rflags);
+ if (error == 0)
+ printf("rflags[%d]\t0x%016lx\n", vcpu, rflags);
+ }
+
+ if (!error && (get_stats || get_all)) {
+ int i, num_stats;
+ uint64_t *stats;
+ struct timeval tv;
+ const char *desc;
+
+ stats = vm_get_stats(ctx, vcpu, &tv, &num_stats);
+ if (stats != NULL) {
+ printf("vcpu%d\n", vcpu);
+ for (i = 0; i < num_stats; i++) {
+ desc = vm_get_stat_desc(ctx, i);
+ printf("%-32s\t%ld\n", desc, stats[i]);
+ }
+ }
+ }
+
+ if (!error && (get_desc_ds || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_es || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_fs || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_gs || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_ss || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_cs || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_tr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_ldtr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_gdtr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("gdtr[%d]\t\t0x%016lx/0x%08x\n",
+ vcpu, desc_base, desc_limit);
+ }
+ }
+
+ if (!error && (get_desc_idtr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("idtr[%d]\t\t0x%016lx/0x%08x\n",
+ vcpu, desc_base, desc_limit);
+ }
+ }
+
+ if (!error && (get_cs || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs);
+ if (error == 0)
+ printf("cs[%d]\t\t0x%04lx\n", vcpu, cs);
+ }
+
+ if (!error && (get_ds || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds);
+ if (error == 0)
+ printf("ds[%d]\t\t0x%04lx\n", vcpu, ds);
+ }
+
+ if (!error && (get_es || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es);
+ if (error == 0)
+ printf("es[%d]\t\t0x%04lx\n", vcpu, es);
+ }
+
+ if (!error && (get_fs || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs);
+ if (error == 0)
+ printf("fs[%d]\t\t0x%04lx\n", vcpu, fs);
+ }
+
+ if (!error && (get_gs || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs);
+ if (error == 0)
+ printf("gs[%d]\t\t0x%04lx\n", vcpu, gs);
+ }
+
+ if (!error && (get_ss || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss);
+ if (error == 0)
+ printf("ss[%d]\t\t0x%04lx\n", vcpu, ss);
+ }
+
+ if (!error && (get_tr || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr);
+ if (error == 0)
+ printf("tr[%d]\t\t0x%04lx\n", vcpu, tr);
+ }
+
+ if (!error && (get_ldtr || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr);
+ if (error == 0)
+ printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr);
+ }
+
+ if (!error && (get_pinning || get_all)) {
+ error = vm_get_pinning(ctx, vcpu, &pincpu);
+ if (error == 0) {
+ if (pincpu < 0)
+ printf("pincpu[%d]\tunpinned\n", vcpu);
+ else
+ printf("pincpu[%d]\t%d\n", vcpu, pincpu);
+ }
+ }
+
+ if (!error && (get_x2apic_state || get_all)) {
+ error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state);
+ if (error == 0)
+ printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state);
+ }
+
+ if (!error && (get_pinbased_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("pinbased_ctls[%d]\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_procbased_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_PRI_PROC_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("procbased_ctls[%d]\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_procbased_ctls2 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_SEC_PROC_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("procbased_ctls2[%d]\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_vmcs_gla || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_LINEAR_ADDRESS, &u64);
+ if (error == 0)
+ printf("gla[%d]\t\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_gpa || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_PHYSICAL_ADDRESS, &u64);
+ if (error == 0)
+ printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_entry_interruption_info || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64);
+ if (error == 0) {
+ printf("entry_interruption_info[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && (get_eptp || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp);
+ if (error == 0)
+ printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp);
+ }
+
+ if (!error && (get_exception_bitmap || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
+ &bm);
+ if (error == 0)
+ printf("exception_bitmap[%d]\t0x%08lx\n", vcpu, bm);
+ }
+
+ if (!error && (get_io_bitmap || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm);
+ if (error == 0)
+ printf("io_bitmap_a[%d]\t0x%08lx\n", vcpu, bm);
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm);
+ if (error == 0)
+ printf("io_bitmap_b[%d]\t0x%08lx\n", vcpu, bm);
+ }
+
+ if (!error && (get_tsc_offset || get_all)) {
+ uint64_t tscoff;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff);
+ if (error == 0)
+ printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff);
+ }
+
+ if (!error && (get_cr0_mask || get_all)) {
+ uint64_t cr0mask;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask);
+ if (error == 0)
+ printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask);
+ }
+
+ if (!error && (get_cr0_shadow || get_all)) {
+ uint64_t cr0shadow;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW,
+ &cr0shadow);
+ if (error == 0)
+ printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow);
+ }
+
+ if (!error && (get_cr4_mask || get_all)) {
+ uint64_t cr4mask;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask);
+ if (error == 0)
+ printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask);
+ }
+
+ if (!error && (get_cr4_shadow || get_all)) {
+ uint64_t cr4shadow;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW,
+ &cr4shadow);
+ if (error == 0)
+ printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow);
+ }
+
+ if (!error && (get_cr3_targets || get_all)) {
+ uint64_t target_count, target_addr;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT,
+ &target_count);
+ if (error == 0) {
+ printf("cr3_target_count[%d]\t0x%08lx\n",
+ vcpu, target_count);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target0[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target1[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target2[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target3[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+ }
+
+ if (!error && (get_apic_access_addr || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr);
+ if (error == 0)
+ printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_virtual_apic_addr || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr);
+ if (error == 0)
+ printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_tpr_threshold || get_all)) {
+ uint64_t threshold;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD,
+ &threshold);
+ if (error == 0)
+ printf("tpr_threshold[%d]\t0x%08lx\n", vcpu, threshold);
+ }
+
+ if (!error && (get_msr_bitmap_address || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+ if (error == 0)
+ printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_msr_bitmap || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+ if (error == 0)
+ error = dump_vmcs_msr_bitmap(vcpu, addr);
+ }
+
+ if (!error && (get_vpid || get_all)) {
+ uint64_t vpid;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid);
+ if (error == 0)
+ printf("vpid[%d]\t\t0x%04lx\n", vcpu, vpid);
+ }
+
+ if (!error && (get_ple_window || get_all)) {
+ uint64_t window;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window);
+ if (error == 0)
+ printf("ple_window[%d]\t\t0x%08lx\n", vcpu, window);
+ }
+
+ if (!error && (get_ple_gap || get_all)) {
+ uint64_t gap;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap);
+ if (error == 0)
+ printf("ple_gap[%d]\t\t0x%08lx\n", vcpu, gap);
+ }
+
+ if (!error && (get_inst_err || get_all)) {
+ uint64_t insterr;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR,
+ &insterr);
+ if (error == 0) {
+ printf("instruction_error[%d]\t0x%08lx\n",
+ vcpu, insterr);
+ }
+ }
+
+ if (!error && (get_exit_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl);
+ if (error == 0)
+ printf("exit_ctls[%d]\t\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_entry_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl);
+ if (error == 0)
+ printf("entry_ctls[%d]\t\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_host_pat || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat);
+ if (error == 0)
+ printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+ }
+
+ if (!error && (get_guest_pat || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat);
+ if (error == 0)
+ printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+ }
+
+ if (!error && (get_host_cr0 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0);
+ if (error == 0)
+ printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
+ }
+
+ if (!error && (get_host_cr3 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3);
+ if (error == 0)
+ printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
+ }
+
+ if (!error && (get_host_cr4 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4);
+ if (error == 0)
+ printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
+ }
+
+ if (!error && (get_host_rip || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip);
+ if (error == 0)
+ printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && (get_host_rsp || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp);
+ if (error == 0)
+ printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp);
+ }
+
+ if (!error && (get_guest_sysenter || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_CS, &cs);
+ if (error == 0)
+ printf("guest_sysenter_cs[%d]\t0x%08lx\n", vcpu, cs);
+
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_ESP, &rsp);
+ if (error == 0)
+ printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp);
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_EIP, &rip);
+ if (error == 0)
+ printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && (get_vmcs_link || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr);
+ if (error == 0)
+ printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_vmcs_exit_reason || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64);
+ if (error == 0)
+ printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_exit_qualification || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION,
+ &u64);
+ if (error == 0)
+ printf("vmcs_exit_qualification[%d]\t0x%016lx\n",
+ vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_exit_interruption_info || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_EXIT_INTERRUPTION_INFO, &u64);
+ if (error == 0) {
+ printf("vmcs_exit_interruption_info[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && (get_vmcs_exit_interruption_error || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_EXIT_INTERRUPTION_ERROR, &u64);
+ if (error == 0) {
+ printf("vmcs_exit_interruption_error[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && (get_vmcs_interruptibility || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_INTERRUPTIBILITY, &u64);
+ if (error == 0) {
+ printf("vmcs_guest_interruptibility[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && setcap) {
+ int captype;
+ captype = vm_capability_name2type(capname);
+ error = vm_set_capability(ctx, vcpu, captype, capval);
+ if (error != 0 && errno == ENOENT)
+ printf("Capability \"%s\" is not available\n", capname);
+ }
+
+ if (!error && (getcap || get_all)) {
+ int captype, val, getcaptype;
+
+ if (getcap && capname)
+ getcaptype = vm_capability_name2type(capname);
+ else
+ getcaptype = -1;
+
+ for (captype = 0; captype < VM_CAP_MAX; captype++) {
+ if (getcaptype >= 0 && captype != getcaptype)
+ continue;
+ error = vm_get_capability(ctx, vcpu, captype, &val);
+ if (error == 0) {
+ printf("Capability \"%s\" is %s on vcpu %d\n",
+ vm_capability_type2name(captype),
+ val ? "set" : "not set", vcpu);
+ } else if (errno == ENOENT) {
+ printf("Capability \"%s\" is not available\n",
+ vm_capability_type2name(captype));
+ } else {
+ break;
+ }
+ }
+ }
+
+ if (!error && run) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
+ assert(error == 0);
+
+ error = vm_run(ctx, vcpu, rip, &vmexit);
+ if (error == 0)
+ dump_vm_run_exitcode(&vmexit, vcpu);
+ else
+ printf("vm_run error %d\n", error);
+ }
+
+ if (error)
+ printf("errno = %d\n", errno);
+
+ if (!error && destroy)
+ vm_destroy(ctx);
+
+ exit(error);
+}
diff --git a/usr.sbin/bhyveload/Makefile b/usr.sbin/bhyveload/Makefile
new file mode 100644
index 0000000..7b00818
--- /dev/null
+++ b/usr.sbin/bhyveload/Makefile
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+PROG= bhyveload
+SRCS= bhyveload.c
+MAN= bhyveload.8
+
+DPADD+= ${LIBVMMAPI}
+LDADD+= -lvmmapi
+
+WARNS?= 3
+
+CFLAGS+=-I${.CURDIR}/../../sys/boot/userboot
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyveload/bhyveload.8 b/usr.sbin/bhyveload/bhyveload.8
new file mode 100644
index 0000000..2918c4c
--- /dev/null
+++ b/usr.sbin/bhyveload/bhyveload.8
@@ -0,0 +1,130 @@
+.\"
+.\" Copyright (c) 2012 NetApp Inc
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd January 7, 2012
+.Dt BHYVELOAD 8
+.Os
+.Sh NAME
+.Nm bhyveload
+.Nd load a
+.Fx
+guest inside a bhyve virtual machine
+.Sh SYNOPSIS
+.Nm
+.Op Fl m Ar lowmem
+.Op Fl M Ar highmem
+.Op Fl d Ar disk-path
+.Op Fl h Ar host-path
+.Ar vmname
+.Sh DESCRIPTION
+.Nm
+is used to load a
+.Fx
+guest inside a
+.Xr bhyve 4
+virtual machine.
+.Pp
+.Nm
+is based on
+.Xr loader 8
+and will present an interface identical to that of the
+.Fx
+loader on the user's terminal.
+.Pp
+The virtual machine is identified as
+.Ar vmname
+and will be created if it does not already exist.
+.Sh OPTIONS
+The following options are available:
+.Bl -tag -width indent
+.It Fl m Ar lowmem
+.Ar lowmem
+is the amount of memory allocated below 4GB in the guest's physical address
+space.
+.Pp
+The default value of
+.Ar lowmem
+is 128MB.
+.It Fl M Ar highmem
+.Ar highmem
+is the amount of memory allocated above 4GB in the guest's physical address
+space.
+.Pp
+The default value of
+.Ar highmem
+is 0MB.
+.It Fl d Ar disk-path
+The
+.Ar disk-path
+is the pathname of the guest's boot disk image.
+.It Fl h Ar host-path
+The
+.Ar host-path
+is the directory at the top of the guest's boot filesystem.
+.El
+.Sh EXAMPLES
+To create a virtual machine named
+.Ar freebsd-vm
+that boots off the ISO image
+.Pa /freebsd/release.iso
+and has 1GB of memory allocated to it:
+.Pp
+.Dl "bhyveload -m 256 -M 768 -d /freebsd/release.iso freebsd-vm"
+.Pp
+In the example above the 1GB allocation is split into two segments:
+.Bl -dash -compact
+.It
+256MB below the 4GB boundary (0MB - 256MB)
+.It
+768MB above the 4GB boundary (4096MB - 4864MB)
+.El
+.Sh SEE ALSO
+.Xr bhyve 4 ,
+.Xr bhyve 8 ,
+.Xr loader 8 ,
+.Xr vmm 4
+.Sh HISTORY
+.Nm
+first appeared in
+.Fx 10.0 ,
+and was developed at NetApp Inc.
+.Sh AUTHORS
+.Nm
+was developed by
+.An -nosplit
+.An "Neel Natu" Aq neel@FreeBSD.org
+at NetApp Inc with a lot of help from
+.An Doug Rabson Aq dfr@FreeBSD.org
+.Sh BUGS
+.Nm
+can load only
+.Fx
+as a guest.
diff --git a/usr.sbin/bhyveload/bhyveload.c b/usr.sbin/bhyveload/bhyveload.c
new file mode 100644
index 0000000..ef12d9f
--- /dev/null
+++ b/usr.sbin/bhyveload/bhyveload.c
@@ -0,0 +1,652 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*-
+ * Copyright (c) 2011 Google, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/disk.h>
+
+#include <machine/specialreg.h>
+#include <machine/vmm.h>
+
+#include <dirent.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <termios.h>
+#include <unistd.h>
+
+#include <vmmapi.h>
+
+#include "userboot.h"
+
+#define MB (1024 * 1024UL)
+#define GB (1024 * 1024 * 1024UL)
+#define BSP 0
+
+static char *host_base = "/";
+static struct termios term, oldterm;
+static int disk_fd = -1;
+
+static char *vmname, *progname, *membase;
+static uint64_t lowmem, highmem;
+static struct vmctx *ctx;
+
+static uint64_t gdtbase, cr3, rsp;
+
+static void cb_exit(void *arg, int v);
+
+/*
+ * Console i/o callbacks
+ */
+
+static void
+cb_putc(void *arg, int ch)
+{
+ char c = ch;
+
+ write(1, &c, 1);
+}
+
+static int
+cb_getc(void *arg)
+{
+ char c;
+
+ if (read(0, &c, 1) == 1)
+ return (c);
+ return (-1);
+}
+
+static int
+cb_poll(void *arg)
+{
+ int n;
+
+ if (ioctl(0, FIONREAD, &n) >= 0)
+ return (n > 0);
+ return (0);
+}
+
+/*
+ * Host filesystem i/o callbacks
+ */
+
+struct cb_file {
+ int cf_isdir;
+ size_t cf_size;
+ struct stat cf_stat;
+ union {
+ int fd;
+ DIR *dir;
+ } cf_u;
+};
+
+static int
+cb_open(void *arg, const char *filename, void **hp)
+{
+ struct cb_file *cf;
+ char path[PATH_MAX];
+
+ if (!host_base)
+ return (ENOENT);
+
+ strlcpy(path, host_base, PATH_MAX);
+ if (path[strlen(path) - 1] == '/')
+ path[strlen(path) - 1] = 0;
+ strlcat(path, filename, PATH_MAX);
+ cf = malloc(sizeof(struct cb_file));
+ if (stat(path, &cf->cf_stat) < 0) {
+ free(cf);
+ return (errno);
+ }
+
+	cf->cf_size = cf->cf_stat.st_size;
+ if (S_ISDIR(cf->cf_stat.st_mode)) {
+ cf->cf_isdir = 1;
+ cf->cf_u.dir = opendir(path);
+ if (!cf->cf_u.dir)
+ goto out;
+ *hp = cf;
+ return (0);
+ }
+ if (S_ISREG(cf->cf_stat.st_mode)) {
+ cf->cf_isdir = 0;
+ cf->cf_u.fd = open(path, O_RDONLY);
+ if (cf->cf_u.fd < 0)
+ goto out;
+ *hp = cf;
+ return (0);
+ }
+
+out:
+ free(cf);
+ return (EINVAL);
+}
+
+static int
+cb_close(void *arg, void *h)
+{
+ struct cb_file *cf = h;
+
+ if (cf->cf_isdir)
+ closedir(cf->cf_u.dir);
+ else
+ close(cf->cf_u.fd);
+ free(cf);
+
+ return (0);
+}
+
+static int
+cb_isdir(void *arg, void *h)
+{
+ struct cb_file *cf = h;
+
+ return (cf->cf_isdir);
+}
+
+static int
+cb_read(void *arg, void *h, void *buf, size_t size, size_t *resid)
+{
+ struct cb_file *cf = h;
+ ssize_t sz;
+
+ if (cf->cf_isdir)
+ return (EINVAL);
+ sz = read(cf->cf_u.fd, buf, size);
+ if (sz < 0)
+ return (EINVAL);
+ *resid = size - sz;
+ return (0);
+}
+
+static int
+cb_readdir(void *arg, void *h, uint32_t *fileno_return, uint8_t *type_return,
+ size_t *namelen_return, char *name)
+{
+ struct cb_file *cf = h;
+ struct dirent *dp;
+
+ if (!cf->cf_isdir)
+ return (EINVAL);
+
+ dp = readdir(cf->cf_u.dir);
+ if (!dp)
+ return (ENOENT);
+
+ /*
+ * Note: d_namlen is in the range 0..255 and therefore less
+ * than PATH_MAX so we don't need to test before copying.
+ */
+ *fileno_return = dp->d_fileno;
+ *type_return = dp->d_type;
+ *namelen_return = dp->d_namlen;
+ memcpy(name, dp->d_name, dp->d_namlen);
+ name[dp->d_namlen] = 0;
+
+ return (0);
+}
+
+static int
+cb_seek(void *arg, void *h, uint64_t offset, int whence)
+{
+ struct cb_file *cf = h;
+
+ if (cf->cf_isdir)
+ return (EINVAL);
+ if (lseek(cf->cf_u.fd, offset, whence) < 0)
+ return (errno);
+ return (0);
+}
+
+static int
+cb_stat(void *arg, void *h, int *mode, int *uid, int *gid, uint64_t *size)
+{
+ struct cb_file *cf = h;
+
+ *mode = cf->cf_stat.st_mode;
+ *uid = cf->cf_stat.st_uid;
+ *gid = cf->cf_stat.st_gid;
+ *size = cf->cf_stat.st_size;
+ return (0);
+}
+
+/*
+ * Disk image i/o callbacks
+ */
+
+static int
+cb_diskread(void *arg, int unit, uint64_t from, void *to, size_t size,
+ size_t *resid)
+{
+ ssize_t n;
+
+ if (unit != 0 || disk_fd == -1)
+ return (EIO);
+ n = pread(disk_fd, to, size, from);
+ if (n < 0)
+ return (errno);
+ *resid = size - n;
+ return (0);
+}
+
+static int
+cb_diskioctl(void *arg, int unit, u_long cmd, void *data)
+{
+ struct stat sb;
+
+ if (unit != 0 || disk_fd == -1)
+ return (EBADF);
+
+ switch (cmd) {
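+	/* The backing image is exposed as a 512-byte-sector disk. */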
+ case DIOCGSECTORSIZE:
+ *(u_int *)data = 512;
+ break;
+ case DIOCGMEDIASIZE:
+ if (fstat(disk_fd, &sb) == 0)
+ *(off_t *)data = sb.st_size;
+ else
+ return (ENOTTY);
+ break;
+ default:
+ return (ENOTTY);
+ }
+
+ return (0);
+}
+
+/*
+ * Guest virtual machine i/o callbacks
+ */
+static int
+cb_copyin(void *arg, const void *from, uint64_t to, size_t size)
+{
+
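+	/*
+	 * The loader hands us kernel virtual addresses; masking off the
+	 * upper bits turns them into offsets into guest physical memory
+	 * (this assumes the standard amd64 kernel virtual address map).
+	 */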
+ to &= 0x7fffffff;
+ if (to > lowmem)
+ return (EFAULT);
+ if (to + size > lowmem)
+ size = lowmem - to;
+
+ memcpy(&membase[to], from, size);
+
+ return (0);
+}
+
+static int
+cb_copyout(void *arg, uint64_t from, void *to, size_t size)
+{
+
+ from &= 0x7fffffff;
+ if (from > lowmem)
+ return (EFAULT);
+ if (from + size > lowmem)
+ size = lowmem - from;
+
+ memcpy(to, &membase[from], size);
+
+ return (0);
+}
+
+static void
+cb_setreg(void *arg, int r, uint64_t v)
+{
+ int error;
+ enum vm_reg_name vmreg;
+
+ vmreg = VM_REG_LAST;
+
+ switch (r) {
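+	/* 4 is the amd64 register-file encoding of %rsp. */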
+ case 4:
+ vmreg = VM_REG_GUEST_RSP;
+ rsp = v;
+ break;
+ default:
+ break;
+ }
+
+ if (vmreg == VM_REG_LAST) {
+		printf("cb_setreg(%d): not implemented\n", r);
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ error = vm_set_register(ctx, BSP, vmreg, v);
+ if (error) {
+ perror("vm_set_register");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+}
+
+static void
+cb_setmsr(void *arg, int r, uint64_t v)
+{
+ int error;
+ enum vm_reg_name vmreg;
+
+ vmreg = VM_REG_LAST;
+
+ switch (r) {
+ case MSR_EFER:
+ vmreg = VM_REG_GUEST_EFER;
+ break;
+ default:
+ break;
+ }
+
+ if (vmreg == VM_REG_LAST) {
+		printf("cb_setmsr(%d): not implemented\n", r);
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ error = vm_set_register(ctx, BSP, vmreg, v);
+ if (error) {
+ perror("vm_set_msr");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+}
+
+static void
+cb_setcr(void *arg, int r, uint64_t v)
+{
+ int error;
+ enum vm_reg_name vmreg;
+
+ vmreg = VM_REG_LAST;
+
+ switch (r) {
+ case 0:
+ vmreg = VM_REG_GUEST_CR0;
+ break;
+ case 3:
+ vmreg = VM_REG_GUEST_CR3;
+ cr3 = v;
+ break;
+ case 4:
+ vmreg = VM_REG_GUEST_CR4;
+ break;
+ default:
+ break;
+ }
+
+ if (vmreg == VM_REG_LAST) {
+		printf("cb_setcr(%d): not implemented\n", r);
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ error = vm_set_register(ctx, BSP, vmreg, v);
+ if (error) {
+ perror("vm_set_cr");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+}
+
+static void
+cb_setgdt(void *arg, uint64_t base, size_t size)
+{
+ int error;
+
+ error = vm_set_desc(ctx, BSP, VM_REG_GUEST_GDTR, base, size - 1, 0);
+ if (error != 0) {
+ perror("vm_set_desc(gdt)");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ gdtbase = base;
+}
+
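+/*
+ * The final callback: userboot passes the kernel entry point in 'rip'.
+ * The cr3, gdtbase and rsp values captured by the callbacks above are
+ * combined into the register state a 64-bit FreeBSD kernel expects,
+ * after which the loader exits and the virtual machine can be run.
+ */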
+static void
+cb_exec(void *arg, uint64_t rip)
+{
+ int error;
+
+ error = vm_setup_freebsd_registers(ctx, BSP, rip, cr3, gdtbase, rsp);
+ if (error) {
+ perror("vm_setup_freebsd_registers");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ cb_exit(NULL, 0);
+}
+
+/*
+ * Misc
+ */
+
+static void
+cb_delay(void *arg, int usec)
+{
+
+ usleep(usec);
+}
+
+static void
+cb_exit(void *arg, int v)
+{
+
+ tcsetattr(0, TCSAFLUSH, &oldterm);
+ exit(v);
+}
+
+static void
+cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem)
+{
+
+ *ret_lowmem = lowmem;
+ *ret_highmem = highmem;
+}
+
+static const char *
+cb_getenv(void *arg, int num)
+{
+ int max;
+
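+	/*
+	 * Canned loader environment: identify the BIOS vendor as BHYVE
+	 * and request a serial console (boot_serial=1).
+	 */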
+ static const char * var[] = {
+ "smbios.bios.vendor=BHYVE",
+ "boot_serial=1",
+ NULL
+ };
+
+ max = sizeof(var) / sizeof(var[0]);
+
+ if (num < max)
+ return (var[num]);
+ else
+ return (NULL);
+}
+
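+/*
+ * Callback table handed to the userboot loader; loader_main() (resolved
+ * via dlsym() in main() below) drives the boot through these entry
+ * points.
+ */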
+static struct loader_callbacks cb = {
+ .getc = cb_getc,
+ .putc = cb_putc,
+ .poll = cb_poll,
+
+ .open = cb_open,
+ .close = cb_close,
+ .isdir = cb_isdir,
+ .read = cb_read,
+ .readdir = cb_readdir,
+ .seek = cb_seek,
+ .stat = cb_stat,
+
+ .diskread = cb_diskread,
+ .diskioctl = cb_diskioctl,
+
+ .copyin = cb_copyin,
+ .copyout = cb_copyout,
+ .setreg = cb_setreg,
+ .setmsr = cb_setmsr,
+ .setcr = cb_setcr,
+ .setgdt = cb_setgdt,
+ .exec = cb_exec,
+
+ .delay = cb_delay,
+ .exit = cb_exit,
+ .getmem = cb_getmem,
+
+ .getenv = cb_getenv,
+};
+
+static void
+usage(void)
+{
+
+ printf("usage: %s [-d <disk image path>] [-h <host filesystem path>] "
+	    "[-m <lowmem>] [-M <highmem>] "
+ "<vmname>\n", progname);
+ exit(1);
+}
+
+int
+main(int argc, char** argv)
+{
+ void *h;
+ void (*func)(struct loader_callbacks *, void *, int, int);
+ int opt, error;
+ char *disk_image;
+
+ progname = argv[0];
+
+ lowmem = 128 * MB;
+ highmem = 0;
+ disk_image = NULL;
+
+ while ((opt = getopt(argc, argv, "d:h:m:M:")) != -1) {
+ switch (opt) {
+ case 'd':
+ disk_image = optarg;
+ break;
+
+ case 'h':
+ host_base = optarg;
+ break;
+
+ case 'm':
+ lowmem = strtoul(optarg, NULL, 0) * MB;
+ break;
+
+ case 'M':
+ highmem = strtoul(optarg, NULL, 0) * MB;
+ break;
+
+ case '?':
+ usage();
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1)
+ usage();
+
+ vmname = argv[0];
+
+ error = vm_create(vmname);
+ if (error != 0 && errno != EEXIST) {
+ perror("vm_create");
+ exit(1);
+ }
+
+ ctx = vm_open(vmname);
+ if (ctx == NULL) {
+ perror("vm_open");
+ exit(1);
+ }
+
+ error = vm_setup_memory(ctx, 0, lowmem, &membase);
+ if (error) {
+ perror("vm_setup_memory(lowmem)");
+ exit(1);
+ }
+
+ if (highmem != 0) {
+ error = vm_setup_memory(ctx, 4 * GB, highmem, NULL);
+ if (error) {
+ perror("vm_setup_memory(highmem)");
+ exit(1);
+ }
+ }
+
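+	/*
+	 * Put the terminal into raw-ish mode for the loader's interactive
+	 * console; cb_exit() restores the saved settings on the way out.
+	 */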
+ tcgetattr(0, &term);
+ oldterm = term;
+ term.c_lflag &= ~(ICANON|ECHO);
+ term.c_iflag &= ~ICRNL;
+ tcsetattr(0, TCSAFLUSH, &term);
+ h = dlopen("/boot/userboot.so", RTLD_LOCAL);
+ if (!h) {
+ printf("%s\n", dlerror());
+ return (1);
+ }
+ func = dlsym(h, "loader_main");
+ if (!func) {
+ printf("%s\n", dlerror());
+ return (1);
+ }
+
+ if (disk_image) {
+ disk_fd = open(disk_image, O_RDONLY);
+ }
+ func(&cb, NULL, USERBOOT_VERSION_3, disk_fd >= 0);
+}